In [None]:
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
import openai
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech
from sklearn.feature_extraction.text import CountVectorizer
import torch
import numpy as np # Make sure numpy is imported for potential use
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
import nltk
import json
import os
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
# Fetch NLTK's 'punkt_tab' resource when this cell runs.
# NOTE(review): presumably this is the tokenizer data sent_tokenize relies on —
# confirm against the installed NLTK version.
nltk.download('punkt_tab')

In [None]:

# Load the cleaned House speech CSVs for a range of congresses and stack them
# into one DataFrame with the columns speech, party, speech_id, congress_num.

# Directory that holds the per-congress files (house_cleaned_NNN.csv)
data_path = "../data/processed/house_db/"

# Congress range to load (76 to 111 inclusive)
congress_start = 76
congress_end = 111

print(f"Loading congress files from {congress_start} to {congress_end}")

# One DataFrame per successfully loaded congress
all_datasets = []

for congress_num in range(congress_start, congress_end + 1):
    # File names use a zero-padded, three-digit congress number (076, 077, ...)
    congress_str = f"{congress_num:03d}"
    file_path = f"{data_path}house_cleaned_{congress_str}.csv"

    try:
        dataset = pd.read_csv(file_path)

        # Keep the (zero-padded, string) congress number alongside each speech
        dataset['congress_num'] = congress_str

        # A missing column raises KeyError here and is reported below
        dataset_subset = dataset[["speech", "party", "speech_id", "congress_num"]].copy()
    except FileNotFoundError:
        # Gaps in the file series are expected; skip them
        print(f"File not found: house_cleaned_{congress_str}.csv - skipping")
        continue
    except Exception as e:
        # Best effort: report the broken file and carry on with the others
        print(f"Error loading Congress {congress_str}: {str(e)}")
        continue

    all_datasets.append(dataset_subset)
    print(f"Loaded Congress {congress_str}: {len(dataset_subset)} speeches")

if not all_datasets:
    # Everything below needs `speeches`, `parties`, ... so stop here with a
    # clear message instead of failing later with a NameError.
    combined_dataset = None
    raise RuntimeError(
        "No datasets were successfully loaded! "
        f"Expected files named house_cleaned_NNN.csv under {data_path!r}."
    )

# Combine all datasets into one
combined_dataset = pd.concat(all_datasets, ignore_index=True)

print(f"\nCombined dataset created!")
print(f"Total speeches: {len(combined_dataset)}")
print(f"Columns: {list(combined_dataset.columns)}")
print(f"Congress numbers included: {sorted(combined_dataset['congress_num'].unique())}")

# Column views used by the rest of the analysis
speeches = combined_dataset["speech"]
parties = combined_dataset["party"]
speech_id = combined_dataset["speech_id"]
congress_nums = combined_dataset["congress_num"]

print(f"\nData ready for analysis:")
print(f"  - speeches: {len(speeches)} entries")
print(f"  - parties: {len(parties)} entries")
print(f"  - speech_id: {len(speech_id)} entries")
print(f"  - congress_nums: {len(congress_nums)} entries")

# Step 1: keep only speeches that are non-empty strings, carrying the matching
# party and speech_id along so the three filtered lists stay aligned.
valid_indices = []
filtered_speeches = []
filtered_parties = []
filtered_speech_ids = []

for idx, speech in enumerate(speeches):
    if isinstance(speech, str) and speech.strip():
        valid_indices.append(idx)
        filtered_speeches.append(speech)
        filtered_parties.append(parties[idx])
        filtered_speech_ids.append(speech_id[idx])
    else:
        print(f"Skipping non-string or empty entry at index {idx}")

# Step 2: split every speech into sentences exactly once, building the flat
# sentence list and the (sentence index, original speech_id) mapping together
# so the two can never drift apart.
sentences = []
sentence_mapping = []

for original_speech_id, speech in zip(filtered_speech_ids, filtered_speeches):
    for sent in sent_tokenize(speech):
        # The index of a sentence is its position in `sentences`
        sentence_mapping.append((len(sentences), original_speech_id))
        sentences.append(sent)

# Running total of sentences seen so far
sentence_counter = len(sentences)

print(f"Total sentences created: {len(sentences)}")
print(f"Sentence mapping length: {len(sentence_mapping)}")

# `sentences` is a plain list of strings at this point
print(f"Type of 'sentences': {type(sentences)}")
# Expected output: Type of 'sentences': <class 'list'>

# Embed every sentence with a SentenceTransformer, on GPU when one is present.

# 1. Choose the device: CUDA if torch reports it as available, CPU otherwise
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

# 2. Load the embedding model directly onto that device
embedding_model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

# 3. On GPU, switch the model parameters to float16
if device == 'cuda':
    embedding_model.half()

# 4. Encode all sentences in batches.
# batch_size is a tuning knob (e.g. 128, 256, 512, 1024): lower it on memory errors.
print("Calculating embeddings...")
embeddings = embedding_model.encode(
    sentences,
    batch_size=512,
    show_progress_bar=True,
    convert_to_tensor=True,  # result stays a torch tensor
)

print(f"Embeddings shape: {embeddings.shape}")
print(f"Embeddings type: {type(embeddings)}") # Should be torch.Tensor if convert_to_tensor=True


# CPU NumPy copy for the steps below that operate on arrays
embeddings_numpy = embeddings.cpu().numpy()
print(f"Embeddings NumPy shape: {embeddings_numpy.shape}")
print(f"Embeddings NumPy type: {type(embeddings_numpy)}")

# Grid-search the number of PCA components and the number of clusters on a
# random sample of the embeddings, scoring each pair by silhouette score, then
# build the PCA model that is handed to BERTopic from the winning setting.

# Candidate values to try
test_components = [20, 30, 40, 50]
test_ks = [20, 25, 30, 35]

# Tune on a sample; never ask for more rows than exist (choice(replace=False)
# raises in that case) and seed the draw so reruns give the same sample.
sample_size = min(200000, len(embeddings_numpy))
sample_rng = np.random.default_rng(42)
sample_idx = sample_rng.choice(len(embeddings_numpy), sample_size, replace=False)
sample_embeddings = embeddings_numpy[sample_idx]

# Start from the first grid entries so the fallback is always a tested setting
best_overall_score = -1
best_components = test_components[0]
best_k = test_ks[0]

print("Testing different PCA components and k values...")
results = []

for n_comp in test_components:
    print(f"\nTesting n_components = {n_comp}")

    # Reduce the sample once per component count; reused for every k below
    pca_temp = PCA(n_components=n_comp, random_state=42)
    pca_sample = pca_temp.fit_transform(sample_embeddings)

    for k in test_ks:
        kmeans_temp = MiniBatchKMeans(n_clusters=k, random_state=42, batch_size=1000)
        labels = kmeans_temp.fit_predict(pca_sample)
        # NOTE(review): silhouette_score is likely the slow step on a sample
        # this large; its sample_size argument could cap the cost — confirm the
        # resulting change in scores is acceptable before using it.
        score = silhouette_score(pca_sample, labels)

        results.append({
            'n_components': n_comp,
            'k': k,
            'score': score
        })

        print(f"  k={k}: score = {score:.4f}")

        if score > best_overall_score:
            best_overall_score = score
            best_components = n_comp
            best_k = k

print(f"\n=== BEST COMBINATION ===")
print(f"Best n_components: {best_components}")
print(f"Best k: {best_k}")
print(f"Best silhouette score: {best_overall_score:.4f}")

# Set the optimal values
optimal_components = best_components
optimal_k = best_k

# PCA model passed to BERTopic as its dimensionality-reduction step. Built
# after the search so the tuned component count is actually applied.
pca_model = PCA(n_components=optimal_components, random_state=42)

# Build the clustering, vectorizer and representation models for BERTopic.

# K-means clustering with the tuned number of clusters
kmeans_model = KMeans(
    n_clusters=optimal_k,  # chosen by the grid search
    random_state=42,       # For reproducibility
    n_init=10,             # Number of random initializations
    max_iter=300,          # Maximum iterations
    algorithm='lloyd'      # Standard K-means algorithm
)

# Unigrams and bigrams, English stop words removed, terms must occur in >= 2 documents
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))

# KeyBERT
keybert_model = KeyBERTInspired()

# Part-of-Speech
pos_model = PartOfSpeech("en_core_web_sm")

# MMR
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# GPT-3.5
prompt = """
I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
topic: <topic label>
"""
# Take the key from the environment instead of embedding it in the notebook.
# An unset or empty variable is passed as None.
# NOTE(review): the client is expected to reject a missing key at construction
# time rather than at the end of the fit — confirm with the installed openai version.
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY") or None)
openai_model = OpenAI(client, model="gpt-3.5-turbo", exponential_backoff=True, chat=True, prompt=prompt)

# All representation models; the "OpenAI" aspect is read later to label topics
representation_model = {
    "KeyBERT": keybert_model,
    "OpenAI": openai_model,
    "MMR": mmr_model,
    "POS": pos_model
}

from bertopic import BERTopic

# Assemble BERTopic from the pre-built components and fit it on the sentences,
# reusing the embeddings computed earlier.

# PCA takes the dimensionality-reduction slot and K-means the clustering slot
pipeline_models = {
    "embedding_model": embedding_model,
    "umap_model": pca_model,
    "hdbscan_model": kmeans_model,
    "vectorizer_model": vectorizer_model,
    "representation_model": representation_model,
}

hyperparameters = {
    "top_n_words": 20,
    "calculate_probabilities": True,
    "verbose": True,
}

topic_model = BERTopic(**pipeline_models, **hyperparameters)

# One topic per sentence; `probs` is whatever fit_transform returns as its second value
topics, probs = topic_model.fit_transform(sentences, embeddings_numpy)
topic_model.get_topic_info()
from collections import Counter

# Roll the per-sentence topics up to one dominant topic per speech and index
# the speeches by topic and by (topic, party).

# Step 2: collect, per original speech_id, the topics of all its sentences.
# sentence_mapping[i] is (sentence index, speech_id) and lines up with topics[i].
speech_topics_by_id = {}
for (_, original_speech_id), topic in zip(sentence_mapping, topics):
    speech_topics_by_id.setdefault(original_speech_id, []).append(topic)

# Step 3: the dominant topic of a speech is its most frequent sentence topic.
# Ties go to the topic that appears first in the speech, so the result is
# deterministic. Every list has at least one entry, so no fallback is needed.
most_frequent_topics_by_id = {
    speech_id_val: Counter(topic_list).most_common(1)[0][0]
    for speech_id_val, topic_list in speech_topics_by_id.items()
}

# Step 4: lookup tables built from the dominant topics

# speech_id -> party (for a repeated speech_id the last entry wins)
speech_id_to_party = dict(zip(filtered_speech_ids, filtered_parties))

# topic -> [speech_id, ...]
topic_to_speech_id = {}
for speech_id_val, topic in most_frequent_topics_by_id.items():
    topic_to_speech_id.setdefault(topic, []).append(speech_id_val)

# (topic, party) -> [speech_id, ...]; speeches with no known party are left out
topic_party_to_speech_id = {}
for speech_id_val, topic in most_frequent_topics_by_id.items():
    if speech_id_val in speech_id_to_party:
        key = (topic, speech_id_to_party[speech_id_val])
        topic_party_to_speech_id.setdefault(key, []).append(speech_id_val)
# Helper functions to easily query the data
def get_speech_ids_by_topic(topic_num):
    """Return the speech_ids recorded for ``topic_num`` in ``topic_to_speech_id``.

    An empty list is returned when the topic has no entry.
    """
    if topic_num in topic_to_speech_id:
        return topic_to_speech_id[topic_num]
    return []

def get_speech_ids_by_topic_and_party(topic_num, party):
    """Return the speech_ids stored under the ``(topic_num, party)`` pair.

    An empty list is returned when ``topic_party_to_speech_id`` has no such pair.
    """
    lookup_key = (topic_num, party)
    if lookup_key in topic_party_to_speech_id:
        return topic_party_to_speech_id[lookup_key]
    return []

def get_speech_ids_by_party_in_topic(topic_num):
    """Group the speech_ids of ``topic_num`` by party.

    Returns a dict mapping party -> list of speech_ids, in the order the ids
    appear in ``topic_to_speech_id``. Ids missing from ``speech_id_to_party``
    are skipped; an unknown topic yields an empty dict.
    """
    grouped = {}
    for sid in topic_to_speech_id.get(topic_num, []):
        if sid not in speech_id_to_party:
            continue
        grouped.setdefault(speech_id_to_party[sid], []).append(sid)
    return grouped

# Print a short preview of the mappings built above

# First five topics with their speech counts and a few ids
print("Sample topic to speech_id mapping:")
for topic_num, ids_for_topic in list(topic_to_speech_id.items())[:5]:
    print(f"Topic {topic_num}: {len(ids_for_topic)} speeches")
    print(f"  First 5 speech_ids: {ids_for_topic[:5]}")

# First ten (topic, party) pairs with their speech counts
print("\nSample topic + party combinations:")
for (topic_num, party), ids_for_pair in list(topic_party_to_speech_id.items())[:10]:
    print(f"Topic {topic_num}, Party {party}: {len(ids_for_pair)} speeches")

print("\nExample usage:")
known_topics = list(topic_to_speech_id)
if known_topics:
    # Use the second topic when there is one, otherwise the only topic
    example_topic = known_topics[1] if len(known_topics) > 1 else known_topics[0]

    print(f"\nTopic {example_topic}:")
    print(f"  All speech_ids: {len(get_speech_ids_by_topic(example_topic))} total")

    party_breakdown = get_speech_ids_by_party_in_topic(example_topic)
    for party, speech_ids in party_breakdown.items():
        print(f"  {party}: {len(speech_ids)} speeches")
        print(f"    Sample speech_ids: {speech_ids[:3]}")

    # Show the combined topic + party lookup for the 'R' party, if present
    if 'R' in party_breakdown:
        republican_speeches = get_speech_ids_by_topic_and_party(example_topic, 'R')
        print(f"  Republican speeches in topic {example_topic}: {len(republican_speeches)}")

# Speech-level arrays aligned position-by-position with filtered_speeches:
# the dominant topic of each speech (-1 when none was recorded) and its id.
most_frequent_speech_topics_aligned = [
    most_frequent_topics_by_id.get(sid, -1) for sid in filtered_speech_ids
]
filtered_speech_ids_aligned = list(filtered_speech_ids)

print(f"\nAligned arrays length: {len(most_frequent_speech_topics_aligned)}")
print(f"Filtered speeches length: {len(filtered_speeches)}")

# Print a cheat sheet showing how to query the mappings
banner = "=" * 50
usage_lines = [
    "\n" + banner,
    "USAGE EXAMPLES:",
    banner,
    "# Get all speech_ids for topic 1:",
    "speech_ids = get_speech_ids_by_topic(1)",
    "\n# Get Republican speech_ids for topic 1:",
    "republican_speech_ids = get_speech_ids_by_topic_and_party(1, 'R')",
    "\n# Get party breakdown for topic 1:",
    "party_breakdown = get_speech_ids_by_party_in_topic(1)",
    "\n# Access the mappings directly:",
    "topic_to_speech_id[1]  # All speech_ids for topic 1",
    "topic_party_to_speech_id[(1, 'R')]  # Republican speech_ids for topic 1",
]
for usage_line in usage_lines:
    print(usage_line)

# Per-party topic representations, followed by ChatGPT labels and the topic map.

# The model was fitted on sentences, so topics_per_class needs those same
# documents with one class label per sentence; passing whole speeches gives
# lists of a different length than the fitted topics. Each sentence inherits
# the party of the speech it came from, so the figures are sentence-level.
sentence_parties = [speech_id_to_party[sid] for _, sid in sentence_mapping]
topics_per_party_df = topic_model.topics_per_class(sentences, classes=sentence_parties)
print("Success! topics_per_class completed without error.")
print(topics_per_party_df)


# Use ChatGPT's labels as the custom topic labels: join the words of each
# topic's "OpenAI" representation with " | "
chatgpt_topic_labels = {topic: " | ".join(list(zip(*values))[0]) for topic, values in topic_model.topic_aspects_["OpenAI"].items()}
chatgpt_topic_labels[-1] = "Outlier Topic"
topic_model.set_topic_labels(chatgpt_topic_labels)

topic_model.visualize_topics(custom_labels=True)

In [None]:
import json
import pandas as pd
import os

# ============================================================================
# EXPORT ANALYSIS DATA FOR SUBSEQUENT ANALYSIS
# ============================================================================

# Define output directory
output_dir = "/content/drive/MyDrive/congress-polarization-thesis/outputs/"
os.makedirs(output_dir, exist_ok=True)

# ============================================================================
# 1. EXPORT TOPICS PER PARTY DATAFRAME WITH CUSTOM NAMES
# ============================================================================

# The model was fitted on sentences, so topics_per_class needs those same
# documents with one class label per sentence (the party of the parent speech).
sentence_parties = [speech_id_to_party[sid] for _, sid in sentence_mapping]
topics_per_party_df = topic_model.topics_per_class(sentences, classes=sentence_parties)

# Labels are looked up by the topic id of each row. The DataFrame index is
# only a row counter (one row per topic/class pair), not a topic id.
row_topic_ids = topics_per_party_df['Topic'].tolist()

stored_labels = getattr(topic_model, 'custom_labels_', None)

if stored_labels:
    # Labels previously registered through set_topic_labels
    print("Found ChatGPT custom labels from set_topic_labels")
    if isinstance(stored_labels, dict):
        label_by_topic = dict(stored_labels)
    else:
        # NOTE(review): assumes a list of labels follows ascending topic id —
        # confirm against the installed BERTopic version.
        label_by_topic = dict(zip(sorted(set(topic_model.topics_)), stored_labels))

    # Fall back to "Topic X" for any topic without a stored label
    custom_names = [label_by_topic.get(topic_id, f"Topic {topic_id}") for topic_id in row_topic_ids]

    topics_per_party_df['ChatGPT_Label'] = custom_names
    print(f"✓ Added ChatGPT labels to dataframe")
    print(f"  Sample labels: {custom_names[:3]}")

elif hasattr(topic_model, 'topic_aspects_') and 'OpenAI' in topic_model.topic_aspects_:
    # Fallback: rebuild the labels from the "OpenAI" topic aspect
    print("Recreating ChatGPT labels from topic_aspects_")
    chatgpt_topic_labels = {
        topic: " | ".join(list(zip(*values))[0])
        for topic, values in topic_model.topic_aspects_["OpenAI"].items()
    }
    chatgpt_topic_labels[-1] = "Outlier Topic"

    custom_names = [chatgpt_topic_labels.get(topic_id, f"Topic {topic_id}") for topic_id in row_topic_ids]

    topics_per_party_df['ChatGPT_Label'] = custom_names
    print(f"✓ Recreated ChatGPT labels from topic_aspects_")
    print(f"  Sample labels: {custom_names[:3]}")

else:
    # Final fallback: use default topic representations
    print("No ChatGPT labels found - using default topic representations")
    topic_info = topic_model.get_topic_info()
    representation_by_topic = dict(zip(topic_info['Topic'], topic_info['Representation']))

    custom_names = []
    for topic_id in row_topic_ids:
        if topic_id not in representation_by_topic:
            custom_names.append(f"Topic {topic_id}")
            continue
        representation = representation_by_topic[topic_id]
        if isinstance(representation, list):
            label = " | ".join(representation[:3])  # same " | " format as the ChatGPT labels
        else:
            label = str(representation)[:100]  # truncate long representations
        custom_names.append(label)

    topics_per_party_df['Default_Label'] = custom_names
    print(f"✓ Added default topic representations")

print(f"Available columns in topics_per_party_df: {list(topics_per_party_df.columns)}")

# Export topics per party dataframe
topics_csv_path = os.path.join(output_dir, "topics_per_party_analysis.csv")
topics_per_party_df.to_csv(topics_csv_path, index=True)
print(f"✓ Exported topics per party dataframe to: {topics_csv_path}")
print(f"  Shape: {topics_per_party_df.shape}")
print(f"  Columns: {list(topics_per_party_df.columns)}")

# ============================================================================
# 2. EXPORT TOPIC-PARTY TO SPEECH_ID MAPPING
# ============================================================================

# JSON keys must be strings, so each (topic, party) tuple is flattened into
# "topic_<topic>_party_<party>"; ids are cast to built-in ints for json.
topic_party_mapping_json = {
    f"topic_{topic_id}_party_{party_name}": [int(sid) for sid in ids_for_pair]
    for (topic_id, party_name), ids_for_pair in topic_party_to_speech_id.items()
}

# Write the mapping next to the other outputs
mapping_json_path = os.path.join(output_dir, "topic_party_to_speech_id_mapping.json")
with open(mapping_json_path, 'w') as out_file:
    json.dump(topic_party_mapping_json, out_file, indent=2)
print(f"✓ Exported topic-party mapping to: {mapping_json_path}")
print(f"  Total combinations: {len(topic_party_mapping_json)}")

# ============================================================================
# 3. EXPORT ADDITIONAL USEFUL MAPPINGS
# ============================================================================

# topic -> speech_ids (no party split); keys become strings, ids built-in ints
topic_mapping_json = {}
for topic_id, ids_for_topic in topic_to_speech_id.items():
    topic_mapping_json[str(topic_id)] = [int(sid) for sid in ids_for_topic]

topic_json_path = os.path.join(output_dir, "topic_to_speech_id_mapping.json")
with open(topic_json_path, 'w') as out_file:
    json.dump(topic_mapping_json, out_file, indent=2)
print(f"✓ Exported basic topic mapping to: {topic_json_path}")

# speech_id -> party, with ids cast to built-in ints
speech_party_json = {int(sid): party_name for sid, party_name in speech_id_to_party.items()}

speech_party_json_path = os.path.join(output_dir, "speech_id_to_party_mapping.json")
with open(speech_party_json_path, 'w') as out_file:
    json.dump(speech_party_json, out_file, indent=2)
print(f"✓ Exported speech_id to party mapping to: {speech_party_json_path}")

# ============================================================================
# 4. EXPORT COMPREHENSIVE METADATA
# ============================================================================

# Names of the representation models attached to the topic model, if any
if hasattr(topic_model, 'representation_model'):
    representation_names = list(topic_model.representation_model.keys())
else:
    representation_names = []

# Run-level facts about the analysis
analysis_info = {
    "congress_range": f"{congress_start}-{congress_end}",
    "total_speeches_original": len(speeches),
    "total_speeches_filtered": len(filtered_speeches),
    "total_sentences": len(sentences),
    "total_topics": len(topic_to_speech_id),
    "parties": list(set(filtered_parties)),
    "embedding_model": "all-MiniLM-L6-v2",
    "clustering_model": "KMeans",
    "n_clusters": optimal_k,
    "representation_models": representation_names,
}

# Per-topic summary; topic ids become strings so they are valid JSON keys
topic_summary = {}
for topic_id, ids_for_topic in topic_to_speech_id.items():
    ids_by_party = get_speech_ids_by_party_in_topic(topic_id)
    topic_summary[str(topic_id)] = {
        "total_speeches": len(ids_for_topic),
        "parties": list(ids_by_party.keys()),
        "party_counts": {party_name: int(len(party_ids)) for party_name, party_ids in ids_by_party.items()},
    }

metadata = {
    "analysis_info": analysis_info,
    "topic_summary": topic_summary,
}

metadata_json_path = os.path.join(output_dir, "analysis_metadata.json")
with open(metadata_json_path, 'w') as out_file:
    json.dump(metadata, out_file, indent=2)
print(f"✓ Exported analysis metadata to: {metadata_json_path}")

# Closing summary: where the files went and which ones were written
separator = "=" * 50
print("\n" + separator)
print("EXPORT COMPLETE!")
print(separator)
print(f"All files saved to: {output_dir}")
print("Files created:")
exported_paths = [
    topics_csv_path,
    mapping_json_path,
    topic_json_path,
    speech_party_json_path,
    metadata_json_path,
]
for position, exported_path in enumerate(exported_paths, start=1):
    print(f"  {position}. {exported_path}")