In [None]:
import pandas as pd
import cudf
from cuml.feature_extraction.text import TfidfVectorizer as CumlTfidfVectorizer
import joblib
from pathlib import Path
import cupy

In [None]:

# Tunables and the in-memory accumulator for the vocabulary build.
global_min_df = 200  # handed to the vectorizer as min_df further down in this cell
all_speeches_list_cpu = list()  # one entry per speech, filled by the loading loop

# Input side: directory holding the merged per-Congress CSV files.
base_dir = Path("../data/merged")

# Output side: directory receiving the vocabulary dumps (created on demand).
base_output_dir = Path("data/vocabulary_dumps")
base_output_dir.mkdir(parents=True, exist_ok=True)

# One dump per consumer: a Parquet file of terms and a joblib-pickled dict.
cuml_vocab_parquet_path = base_output_dir.joinpath("global_vocabulary_cuml.parquet")
sklearn_vocab_dict_path = base_output_dir.joinpath("global_vocabulary_sklearn.joblib")

# --- 1. Data Loading and Initial Aggregation (CPU) ---
# Walk Congresses 75..111, read the 'speech' column of each merged House CSV and
# append every non-null speech (coerced to str) to all_speeches_list_cpu.
print("Loading and aggregating speeches from all years (CPU)...")

for i in range(75, 112):
    print(f'── Processing Congress {i} ──')
    year_str = f"{i:03}"
    house_file = base_dir / f"house_db/house_merged_{year_str}.csv"

    # Test the per-Congress file itself, not just the parent directory: otherwise a
    # missing CSV is reported as a read error and this warning can never fire for it.
    if not house_file.exists():
        print(f"Warning: File not found for vocab generation: {house_file}")
        continue

    try:
        df_vocab = pd.read_csv(house_file, usecols=['speech'])
        df_vocab = df_vocab.dropna(subset=['speech'])
        all_speeches_list_cpu.extend(df_vocab['speech'].astype(str).tolist()) # Ensure speech is string
    except Exception as e:
        # Best effort: one unreadable file is reported and skipped so the remaining
        # Congresses still contribute to the vocabulary.
        print(f"Error processing file {house_file}: {e}")

# Abort the cell with a real exception. exit() in a notebook kernel asks the kernel to
# shut down instead of cleanly stopping execution at this point.
if not all_speeches_list_cpu:
    raise RuntimeError("No speeches collected for vocabulary generation.")

print(f"Collected {len(all_speeches_list_cpu)} speeches.")

# --- 2. Transfer to GPU (CPU -> GPU) ---
# Build a cudf.Series from the collected Python list, then drop the list so only one
# copy of the corpus is kept alive.
print("Transferring speeches to GPU...")
try:
    all_speeches_cudf_series = cudf.Series(all_speeches_list_cpu)
except Exception as e:
    print(f"Error converting list to cudf.Series: {e}")
    # Re-raise instead of exit(): the cell stops here with the full traceback and the
    # notebook kernel is left running.
    raise
else:
    del all_speeches_list_cpu # Free CPU memory

# --- 3. Vocabulary Computation (GPU) using cuML ---
# Fit a cuML TfidfVectorizer on the whole corpus purely to obtain its vocabulary:
# unigrams that pass the min_df=global_min_df document-frequency threshold.
print("Building global vocabulary with cuML TfidfVectorizer (GPU)...")
try:
    vocab_builder_vectorizer_cuml = CumlTfidfVectorizer(
        min_df=global_min_df,
        ngram_range=(1, 1) # Unigrams
        # No stop_words argument is passed, so the vectorizer's default applies.
        # NOTE(review): confirm in the cuML docs that the default removes no stop words.
    )
    vocab_builder_vectorizer_cuml.fit(all_speeches_cudf_series)
except Exception as e:
    print(f"Error during cuML TfidfVectorizer fitting: {e}")
    # Re-raise instead of exit(): the cell stops here with the full traceback and the
    # notebook kernel is left running.
    raise
else:
    del all_speeches_cudf_series # Free GPU memory

# The fitted vocabulary is treated below as a cudf.Series of terms whose position
# becomes the feature index.
# NOTE(review): the original comment states the terms are sorted alphabetically; that
# ordering is not established by anything in this notebook - confirm before relying on it.
gpu_terms_cudf_series = vocab_builder_vectorizer_cuml.vocabulary_
print(f"Generated global vocabulary with {len(gpu_terms_cudf_series)} features on GPU.")

# --- 4. Create and Save Dumps ---
# Both dumps are best effort: a failure is printed and the cell carries on.

# 4a. cuML flavour: the terms go to disk as a one-column ('term') Parquet table.
print(f"Saving cuML vocabulary (cudf.Series of terms) to {cuml_vocab_parquet_path}...")
try:
    # Parquet holds tables, hence the Series is wrapped into a single-column frame.
    gpu_terms_cudf_series.to_frame(name='term').to_parquet(cuml_vocab_parquet_path)
except Exception as e:
    print(f"Error saving cuML vocabulary to Parquet: {e}")
else:
    print(f"Successfully saved cuML vocabulary to {cuml_vocab_parquet_path}")

# 4b. scikit-learn flavour: a {term: index} mapping pickled with joblib.
print(f"Creating and saving scikit-learn vocabulary (dictionary) to {sklearn_vocab_dict_path}...")
try:
    # Pull the terms off the GPU; their position in this list is their feature index.
    cpu_terms_list = gpu_terms_cudf_series.to_pandas().tolist()

    # Pair each term with its position to obtain the mapping.
    sklearn_vocab_dict = dict(zip(cpu_terms_list, range(len(cpu_terms_list))))

    joblib.dump(sklearn_vocab_dict, sklearn_vocab_dict_path)
except Exception as e:
    print(f"Error creating/saving scikit-learn vocabulary dictionary: {e}")
else:
    print(f"Successfully saved scikit-learn vocabulary dictionary to {sklearn_vocab_dict_path}")

# Closing summary listing the two target paths.
print("\n--- Vocabulary Generation Finished ---")
print(f"cuML vocabulary (Parquet): {cuml_vocab_parquet_path}")
print(f"Scikit-learn vocabulary (joblib dict): {sklearn_vocab_dict_path}")

# --- Cleanup ---
# Drop the fitted vectorizer unconditionally.
del vocab_builder_vectorizer_cuml

# The remaining names are removed only if they are bound; the two CPU-side objects
# are missing when the scikit-learn dump step failed before creating them.
if 'gpu_terms_cudf_series' in locals():
    del gpu_terms_cudf_series
if 'sklearn_vocab_dict' in locals():
    del sklearn_vocab_dict
if 'cpu_terms_list' in locals():
    del cpu_terms_list

# Ask CuPy's default memory pool to release the blocks it is holding but no longer uses.
cupy.get_default_memory_pool().free_all_blocks()

In [None]:


# --- 1. Data Loading and Initial Aggregation (CPU) ---
# Collect every non-null speech from the cleaned per-Congress House CSVs into one list.

# Inclusive Congress range to scan. Nothing visible in this notebook defines these two
# names, so a fresh "Run All" would stop with a NameError. Fall back to the range the
# earlier vocabulary cell iterates over (75..111) while still honouring values that
# were set before this cell runs.
CONGRESS_YEAR_START = globals().get("CONGRESS_YEAR_START", 75)
CONGRESS_YEAR_END = globals().get("CONGRESS_YEAR_END", 111)

all_speeches_list_cpu = []
congress_years_to_process = [f"{i:03}" for i in range(CONGRESS_YEAR_START, CONGRESS_YEAR_END + 1)]
global_min_df = 200 # As per paper example

print("Loading and aggregating speeches from all years (CPU)...")
for year_str_vocab in congress_years_to_process:
    input_csv_path_vocab = Path(f"data/processed/house_db/house_cleaned_{year_str_vocab}.csv") # Adjust path
    if not input_csv_path_vocab.exists():
        print(f"Warning: File not found for vocab generation: {input_csv_path_vocab}")
        continue

    df_vocab = pd.read_csv(input_csv_path_vocab, usecols=['speech'])
    df_vocab = df_vocab.dropna(subset=['speech'])
    # Coerce to str, as the other loader in this notebook does, so a column parsed as a
    # non-string dtype cannot put mixed types into the list handed to cudf.Series.
    all_speeches_list_cpu.extend(df_vocab['speech'].astype(str).tolist())

# Abort the cell with a real exception. exit() in a notebook kernel asks the kernel to
# shut down instead of cleanly stopping execution at this point.
if not all_speeches_list_cpu:
    raise RuntimeError("No speeches collected for vocabulary generation.")

print(f"Collected {len(all_speeches_list_cpu)} speeches.")

# --- 2. Transfer to GPU (CPU -> GPU) ---
# Wrap the collected speeches in a cudf.Series and release the Python list.
print("Transferring speeches to GPU...")
all_speeches_cudf_series = cudf.Series(all_speeches_list_cpu)
del all_speeches_list_cpu

# --- 3. Vocabulary Computation (GPU) ---
# Only min_df and the unigram range are set; every other vectorizer option keeps its
# cuML default. NOTE(review): check the cuML docs if parity with sklearn's stop-word or
# tokenisation defaults matters.
print("Building global vocabulary with cuML TfidfVectorizer (GPU)...")
vocab_builder_vectorizer_cuml = CumlTfidfVectorizer(min_df=global_min_df, ngram_range=(1, 1))
vocab_builder_vectorizer_cuml.fit(all_speeches_cudf_series)
del all_speeches_cudf_series

# --- 4. Extracting and Saving Vocabulary (GPU -> CPU) ---
# vocabulary_ is used here as a cudf.Series of terms.
# NOTE(review): the original comment says cuML keeps it sorted - confirm before relying on order.
gpu_vocabulary_terms = vocab_builder_vectorizer_cuml.vocabulary_

print(f"Generated global vocabulary with {len(gpu_vocabulary_terms)} features on GPU.")
print("Transferring vocabulary to CPU and saving...")

# GPU Series -> pandas -> plain Python list, after which the GPU handle is dropped.
fixed_vocabulary_list_cpu = gpu_vocabulary_terms.to_pandas().tolist()
del gpu_vocabulary_terms

# Persist the list with joblib, creating the parent folder if it is missing.
vocabulary_save_path = Path("models/global_fixed_vocabulary_cuml.joblib")
vocabulary_save_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(fixed_vocabulary_list_cpu, vocabulary_save_path)
print(f"Saved global vocabulary list (from cuML) to {vocabulary_save_path}")