In [None]:
import pandas as pd
import cudf
from cuml.feature_extraction.text import TfidfVectorizer as CumlTfidfVectorizer
import joblib
from pathlib import Path
import cupy

In [None]:

# --- 1. Configuration and speech loading (CPU) ---
# NOTE(review): the output directory is relative to the current working
# directory ("data/...") while the input directory is relative to its parent
# ("../data/..."); confirm both resolve where intended.
base_output_dir = Path("data/vocabulary_dumps")  # Base directory for vocabulary files
base_dir = Path("../data/merged")  # Root of the merged per-congress CSV files

# Ensure the output directory exists
base_output_dir.mkdir(parents=True, exist_ok=True)

cuml_vocab_parquet_path = base_output_dir / "global_vocabulary_cuml.parquet"
sklearn_vocab_dict_path = base_output_dir / "global_vocabulary_sklearn.joblib"

# Every non-null speech from every congress, accumulated in host memory.
all_speeches_list_cpu = []

print("Loading and aggregating speeches from all years (CPU)...")

for i in range(75, 112):
    # ------ Loading files -------
    print(f'── Processing Congress {i} ──')
    year_str = f"{i:03}"
    house_file = base_dir / f"house_db/house_merged_{year_str}.csv"
    # Test the per-congress file itself: checking only base_dir would send a
    # missing file into the generic "Error processing file" handler below
    # instead of the dedicated "File not found" warning.
    if house_file.exists():
        try:
            df_vocab = pd.read_csv(house_file, usecols=['speech']).dropna(subset=['speech'])
            # Ensure every speech is a str before it reaches the vectorizer
            all_speeches_list_cpu.extend(df_vocab['speech'].astype(str).tolist())
        except Exception as e:
            # Best effort: report the broken file and continue with the others.
            print(f"Error processing file {house_file}: {e}")
    else:
        print(f"Warning: File not found for vocab generation: {house_file}")

if not all_speeches_list_cpu:
    print("Error: No speeches collected for vocabulary generation. Exiting.")
    # Raise instead of calling exit(): inside a Jupyter kernel exit() requests a
    # kernel shutdown rather than stopping at the call site, so the statements
    # below could still run. SystemExit halts the cell (and a plain script) here.
    raise SystemExit(1)

print(f"Collected {len(all_speeches_list_cpu)} speeches.")

# --- 2. Transfer to GPU (CPU -> GPU) ---
print("Transferring speeches to GPU...")
try:
    all_speeches_cudf_series = cudf.Series(all_speeches_list_cpu)
    del all_speeches_list_cpu # Drop the host-side list once the GPU copy exists
except Exception as e:
    print(f"Error converting list to cudf.Series: {e}")
    # Raise instead of calling exit(): in a Jupyter kernel exit() does not stop
    # the cell at this point, and the next step would then fail on the
    # undefined all_speeches_cudf_series.
    raise SystemExit(1) from e

# --- 3. Vocabulary Computation (GPU) using cuML ---
print("Building global vocabulary with cuML TfidfVectorizer (GPU)...")
try:
    vocab_builder_vectorizer_cuml = CumlTfidfVectorizer(
        min_df=200,        # Minimum document frequency for a term to be kept
        ngram_range=(1, 1) # Unigrams
        # No stop_words argument is passed, so the vectorizer's default
        # stop-word handling applies.
    )
    vocab_builder_vectorizer_cuml.fit(all_speeches_cudf_series)
    del all_speeches_cudf_series # Drop the GPU-side corpus once fitting is done
except Exception as e:
    print(f"Error during cuML TfidfVectorizer fitting: {e}")
    # Same reasoning as above: halt here so the code below never touches a
    # vectorizer that was not fitted.
    raise SystemExit(1) from e

# vocabulary_ on the fitted cuML vectorizer is expected to be a cudf.Series of
# terms whose position is the feature index.
# NOTE(review): the original comment states the terms are sorted
# alphabetically — confirm against the installed cuML version.
gpu_terms_cudf_series = vocab_builder_vectorizer_cuml.vocabulary_
print(f"Generated global vocabulary with {len(gpu_terms_cudf_series)} features on GPU.")

# --- 4. Create and Save Dumps ---

# 4a. cuML dump: the term Series is written to Parquet as a one-column frame
#     whose column is named 'term'.
print(f"Saving cuML vocabulary (cudf.Series of terms) to {cuml_vocab_parquet_path}...")
try:
    gpu_terms_cudf_series.to_frame(name='term').to_parquet(cuml_vocab_parquet_path)
except Exception as e:
    print(f"Error saving cuML vocabulary to Parquet: {e}")
else:
    print(f"Successfully saved cuML vocabulary to {cuml_vocab_parquet_path}")

# 4b. scikit-learn dump: a {'term': index} dictionary serialized with joblib.
print(f"Creating and saving scikit-learn vocabulary (dictionary) to {sklearn_vocab_dict_path}...")
try:
    # Bring the terms back to host memory; a term's position in the Series
    # becomes its feature index.
    cpu_terms_list = gpu_terms_cudf_series.to_pandas().tolist()
    sklearn_vocab_dict = dict(zip(cpu_terms_list, range(len(cpu_terms_list))))
    joblib.dump(sklearn_vocab_dict, sklearn_vocab_dict_path)
except Exception as e:
    print(f"Error creating/saving scikit-learn vocabulary dictionary: {e}")
else:
    print(f"Successfully saved scikit-learn vocabulary dictionary to {sklearn_vocab_dict_path}")

print("\n--- Vocabulary Generation Finished ---")
print(f"cuML vocabulary (Parquet): {cuml_vocab_parquet_path}")
print(f"Scikit-learn vocabulary (joblib dict): {sklearn_vocab_dict_path}")

# --- Cleanup ---
# The vectorizer always exists at this point; the other names only exist if the
# corresponding step above got far enough to bind them.
del vocab_builder_vectorizer_cuml
for stale_name in ('gpu_terms_cudf_series', 'sklearn_vocab_dict', 'cpu_terms_list'):
    globals().pop(stale_name, None)

# Hand the now-unused blocks of CuPy's default memory pool back to the device.
cupy.get_default_memory_pool().free_all_blocks()

# CPU version (the one actually used)

In [None]:
import pandas as pd
from pathlib import Path
import re # Using regex for more flexible searching
import joblib # For saving the scikit-learn dictionary

# Import scikit-learn's TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer as SklearnTfidfVectorizer

# cuDF is optional in this CPU cell: it is only needed to write the
# cuML-compatible term list as Parquet. CUDF_AVAILABLE records whether the
# import worked so the save step can choose between Parquet and a plain
# text file.
try:
    import cudf
except ImportError:
    CUDF_AVAILABLE = False
    print("Warning: cudf not found. Will not be able to save cuML vocabulary in Parquet format directly from this script.")
    print("Consider saving as a text file or CSV instead for the cuML vocabulary list.")
else:
    CUDF_AVAILABLE = True


# --- Configuration ---
# Single project root so the input and output locations cannot drift apart.
# NOTE(review): this is an absolute Google Drive mount path; adjust it when
# running outside that environment.
PROJECT_ROOT = Path("/content/drive/MyDrive/congress-polarization-thesis")
BASE_OUTPUT_DIR = PROJECT_ROOT / "data" / "vocabulary_dumps" # Base directory for vocabulary files
BASE_DIR = PROJECT_ROOT / "data" / "processed"               # Root of the cleaned per-congress CSV files

# Ensure the output directory exists
BASE_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Define output paths
CUML_VOCAB_OUTPUT_PATH = BASE_OUTPUT_DIR / "global_vocabulary_processed_v2_200_min_df_cuml_from_sklearn.parquet"
SKLEARN_VOCAB_DICT_PATH = BASE_OUTPUT_DIR / "global_vocabulary_processed_v2_200_min_df_sklearn_from_sklearn.joblib"

# Every non-null speech from every congress, accumulated in host memory.
all_speeches_list_cpu = []
GLOBAL_MIN_DF = 200 # Minimum document frequency for a term to enter the vocabulary
TOKEN_PATTERN_WORDS_ONLY = r"(?u)\b[a-zA-Z]{2,}\b" # Stricter token pattern: alphabetic tokens of 2+ characters

print("Loading and aggregating speeches from all years (CPU)...")

# NOTE(review): the GPU cell earlier in this notebook starts at Congress 75
# while this loop starts at 76 — confirm which range is intended.
for i in range(76, 112):
    print(f'── Processing Congress {i} ──')
    year_str = f"{i:03}"
    house_file = BASE_DIR / f"house_db/house_cleaned_{year_str}.csv" # Reading CLEANED files

    # Check the per-congress file itself (not just its directory) so a missing
    # file gets the dedicated warning rather than a read error.
    if house_file.exists():
        try:
            df_vocab = pd.read_csv(house_file, usecols=['speech']).dropna(subset=['speech'])
            # Ensure every speech is a str before it reaches the vectorizer
            all_speeches_list_cpu.extend(df_vocab['speech'].astype(str).tolist())
        except Exception as e:
            # Best effort: report the broken file and continue with the others.
            print(f"Error processing file {house_file}: {e}")
    else:
        print(f"Warning: File not found for vocab generation: {house_file}")

if not all_speeches_list_cpu:
    print("Error: No speeches collected for vocabulary generation. Exiting.")
    # Raise instead of calling exit(): inside a Jupyter kernel exit() requests a
    # kernel shutdown rather than stopping at the call site, so the statements
    # below could still run. SystemExit halts the cell (and a plain script) here.
    raise SystemExit(1)

print(f"Collected {len(all_speeches_list_cpu)} speeches.")

#------ Debugging: Inspect Speeches ------
# Preview the first five speeches: a 200-character excerpt, the Python type and
# the full character count of each.
print("\n--- Inspecting First 5 Speeches ---")
for i_debug, speech_debug in enumerate(all_speeches_list_cpu[:5]):
    print(f"Speech {i_debug}: {speech_debug[:200]}...")
    print(f"  Type: {type(speech_debug)}")
    print(f"  Length (chars): {len(speech_debug)}")
print("--------------------------------\n")

# Scan the first 10,000 speeches for whitespace-delimited tokens longer than
# 50 characters.
print("\n--- Checking for long 'words' ---")
long_words = [
    token
    for sampled_speech in all_speeches_list_cpu[:10000]
    if isinstance(sampled_speech, str)
    for token in sampled_speech.split()  # Simple split for this check
    if len(token) > 50
]
if not long_words:
    print("No unusually long 'words' found in sample (based on simple split).")
else:
    print(f"Found {len(long_words)} 'words' longer than 50 chars. Examples: {long_words[:10]}")
print("-----------------------------\n")
#-------- End Debugging Speeches --------

# --- Vocabulary Computation (CPU) using Scikit-learn ---
print("Building global vocabulary with Scikit-learn TfidfVectorizer (CPU)...")

# Start from the empty fallbacks. They are only replaced after fitting AND term
# extraction have both succeeded, so a failure anywhere leaves both empty and
# the save steps further down skip writing.
sklearn_vocab_dict, cpu_terms_list = {}, []
try:
    # Vectorizer settings:
    #  - lowercase=False / stop_words=None: the input is assumed to be already
    #    lowercased and stop-word filtered by the upstream SpaCy preprocessing.
    #  - token_pattern: the stricter words-only pattern is currently DISABLED
    #    (commented out below), so the vectorizer's default tokenization
    #    applies. Re-enable that line to restrict the vocabulary to tokens
    #    matching TOKEN_PATTERN_WORDS_ONLY.
    vocab_builder_vectorizer_sklearn = SklearnTfidfVectorizer(
        min_df=GLOBAL_MIN_DF,
        ngram_range=(1, 1),        # Unigrams
        #token_pattern=TOKEN_PATTERN_WORDS_ONLY, # Use the stricter pattern
        lowercase=False,           # False, Assuming already lowercased by SpaCy
        stop_words=None            # Assuming stop words removed by SpaCy
    )

    # Fit on the host-side list of speeches
    print(f"Fitting SklearnTfidfVectorizer on {len(all_speeches_list_cpu)} speeches...")
    vocab_builder_vectorizer_sklearn.fit(all_speeches_list_cpu)

    # {'term': index} mapping, in the form scikit-learn consumes
    fitted_vocabulary = vocab_builder_vectorizer_sklearn.vocabulary_

    # Terms ordered by feature index; this list feeds the cuML-compatible dump
    ordered_terms = vocab_builder_vectorizer_sklearn.get_feature_names_out().tolist()

    print(f"Generated global vocabulary with {len(ordered_terms)} features using scikit-learn.")

    sklearn_vocab_dict, cpu_terms_list = fitted_vocabulary, ordered_terms

except Exception as e:
    # Deliberately non-fatal: report the problem and keep the empty fallbacks.
    print(f"Error during Scikit-learn TfidfVectorizer fitting: {e}")

#--- Debugging: Inspect Vocabulary ---
if cpu_terms_list:
    print("\n--- Inspecting First 100 Vocabulary Terms (from scikit-learn) ---")
    print(cpu_terms_list[:100])
    print("-----------------------------------------------------------------\n")
# --- End Debugging Vocabulary ---


# --- Create and Save Dumps ---

# 1. scikit-learn dump: the {'term': index} dictionary, serialized with joblib.
print(f"Creating and saving scikit-learn vocabulary (dictionary) to {SKLEARN_VOCAB_DICT_PATH}...")
if not sklearn_vocab_dict:
    print("Scikit-learn vocabulary dictionary is empty, not saving.")
else:
    try:
        joblib.dump(sklearn_vocab_dict, SKLEARN_VOCAB_DICT_PATH)
        print(f"Successfully saved scikit-learn vocabulary dictionary to {SKLEARN_VOCAB_DICT_PATH}")
    except Exception as e:
        print(f"Error creating/saving scikit-learn vocabulary dictionary: {e}")


# 2. cuML dump: the ordered term list (cpu_terms_list).
#    Preferred: Parquet written through cuDF, when cuDF is importable.
#    Fallback:  plain text file, one term per line, when cuDF is missing or
#               the Parquet write fails.
print(f"Saving cuML-compatible vocabulary (list of terms)...")
if not cpu_terms_list:
    print("CPU terms list is empty, not saving cuML-compatible vocabulary.")
else:
    if CUDF_AVAILABLE:
        print(f"Attempting to save as Parquet to {CUML_VOCAB_OUTPUT_PATH} using cuDF...")
        try:
            terms_cudf_series = cudf.Series(cpu_terms_list)
            terms_cudf_series.to_frame(name='term').to_parquet(CUML_VOCAB_OUTPUT_PATH)
            print(f"Successfully saved cuML-compatible vocabulary (Parquet) to {CUML_VOCAB_OUTPUT_PATH}")
        except Exception as e:
            print(f"Error saving cuML-compatible vocabulary to Parquet: {e}")
            print("Consider saving as a text file as a fallback if Parquet fails.")
            # Clearing the flag routes execution into the text fallback just
            # below and makes the final summary report the text file.
            CUDF_AVAILABLE = False

    if not CUDF_AVAILABLE: # cuDF missing, or the Parquet write above failed
        cuml_vocab_text_path = BASE_OUTPUT_DIR / "global_vocabulary_cuml_from_sklearn.txt"
        print(f"Saving cuML-compatible vocabulary as text file to {cuml_vocab_text_path}...")
        try:
            with open(cuml_vocab_text_path, 'w', encoding='utf-8') as f:
                f.writelines(term + "\n" for term in cpu_terms_list)
            print(f"Successfully saved cuML-compatible vocabulary (text file) to {cuml_vocab_text_path}")
        except Exception as e:
            print(f"Error saving cuML-compatible vocabulary to text file: {e}")


# Summary: only mention an artifact when its content was non-empty and, for the
# cuML dump, the file is actually present on disk.
print("\n--- Vocabulary Generation Finished ---")
if sklearn_vocab_dict:
    print(f"Scikit-learn vocabulary (joblib dict): {SKLEARN_VOCAB_DICT_PATH}")
if cpu_terms_list:
    if CUDF_AVAILABLE and CUML_VOCAB_OUTPUT_PATH.exists():
         print(f"cuML vocabulary (Parquet from scikit-learn terms): {CUML_VOCAB_OUTPUT_PATH}")
    elif not CUDF_AVAILABLE and (BASE_OUTPUT_DIR / "global_vocabulary_cuml_from_sklearn.txt").exists():
        print(f"cuML vocabulary (Text file from scikit-learn terms): {BASE_OUTPUT_DIR / 'global_vocabulary_cuml_from_sklearn.txt'}")


# --- Cleanup (CPU based, so no cupy cleanup needed) ---
# Drop the large objects if they were bound; names that never got defined are
# skipped silently.
for stale_name in (
    'vocab_builder_vectorizer_sklearn',
    'sklearn_vocab_dict',
    'cpu_terms_list',
    'all_speeches_list_cpu',
):
    globals().pop(stale_name, None)

Vocabulary dimension: number of terms in the saved vocabulary files

In [None]:
import joblib
import pandas as pd

VOCABULARY_PATH = "data/vocabulary_dumps"

# --- Load Vocabulary from Joblib File ---
# NOTE: joblib.load unpickles the file, so only point this at files you trust.
joblib_file_path = '../data/vocabulary_dumps/global_vocabulary_processed_v2_200_min_df_sklearn_from_sklearn.joblib' # Replace with your .joblib file path

try:
    tfidf_vectorizer_joblib = joblib.load(joblib_file_path)
    # The loaded object may be a vectorizer (exposing vocabulary_ or
    # get_feature_names_out) or already a plain vocabulary list/dict.
    has_vocabulary_attr = hasattr(tfidf_vectorizer_joblib, 'vocabulary_')
    if has_vocabulary_attr or hasattr(tfidf_vectorizer_joblib, 'get_feature_names_out'):
        # Vectorizer-like object: prefer the term -> index mapping, otherwise
        # fall back to the feature-name array.
        if has_vocabulary_attr:
            vocabulary_source = tfidf_vectorizer_joblib.vocabulary_
        else:
            vocabulary_source = tfidf_vectorizer_joblib.get_feature_names_out()
        joblib_vocab_len = len(vocabulary_source)
        print(f"Length of vocabulary from Joblib file ('{joblib_file_path}'): {joblib_vocab_len} features")
    else:
        # Not vectorizer-like: treat the object itself as the vocabulary container.
        try:
            joblib_vocab_len = len(tfidf_vectorizer_joblib)
            print(f"Length of data from Joblib file ('{joblib_file_path}'): {joblib_vocab_len} features (assuming it's a direct vocabulary list/dict)")
        except TypeError:
            print(f"Error: Could not determine feature count from the object loaded from '{joblib_file_path}'. "
                  "Please ensure it's a TF-IDF vectorizer or a vocabulary list/dictionary.")

except FileNotFoundError:
    print(f"Error: Joblib file not found at '{joblib_file_path}'")
except Exception as e:
    print(f"An error occurred while loading the Joblib file: {e}")

print("-" * 30) # Separator

# --- Load Vocabulary from Parquet File ---
parquet_file_path = '../data/vocabulary_dumps/global_vocabulary_processed_v2_200_min_df_cuml_from_sklearn.parquet' # Replace with your .parquet file path

# Column names a single-column vocabulary file may use. 'term' is the name the
# vocabulary-building cells of this notebook write (to_frame(name='term'));
# 'vocabulary' and 'feature' are kept for files produced elsewhere.
KNOWN_VOCAB_COLUMNS = ('term', 'vocabulary', 'feature')


def count_parquet_vocab_terms(df):
    """Count the vocabulary entries held in a DataFrame read from Parquet.

    Args:
        df: pandas DataFrame loaded from the vocabulary Parquet file.

    Returns:
        A ``(n_terms, column)`` tuple. When ``df`` has exactly one column and
        its name is one of ``KNOWN_VOCAB_COLUMNS``, ``column`` is that name and
        ``n_terms`` is the length of that column. Otherwise ``column`` is
        ``None`` and ``n_terms`` is the number of rows, on the assumption that
        each row is one feature.
    """
    if len(df.columns) == 1:
        for column in KNOWN_VOCAB_COLUMNS:
            if column in df.columns:
                return len(df[column]), column
    return len(df), None


try:
    df_parquet = pd.read_parquet(parquet_file_path)
    parquet_vocab_len, vocab_column = count_parquet_vocab_terms(df_parquet)

    if vocab_column is not None:
        # Recognized single-column vocabulary layout.
        print(f"Length of vocabulary from Parquet file ('{parquet_file_path}', column '{vocab_column}'): {parquet_vocab_len} features")
    else:
        # Unknown layout: report the row count and say that it is an assumption.
        print(f"Number of rows (assumed features) from Parquet file ('{parquet_file_path}'): {parquet_vocab_len}")

except FileNotFoundError:
    print(f"Error: Parquet file not found at '{parquet_file_path}'")
except Exception as e:
    print(f"An error occurred while loading the Parquet file: {e}")

Length of data from Joblib file ('../data/vocabulary_dumps/global_vocabulary_processed_v2_200_min_df_sklearn_from_sklearn.joblib'): 15690 features (assuming it's a direct vocabulary list/dict)
------------------------------
Number of rows (assumed features) from Parquet file ('../data/vocabulary_dumps/global_vocabulary_processed_v2_200_min_df_cuml_from_sklearn.parquet'): 15690
