In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm # Import tqdm
from text_cleaning import clean_texts_for_tfidf_batch

# Text cleaner

In [None]:
# Text cleaner
# For each Congress 76-111: read the merged House CSV, clean its "speech" column
# with clean_texts_for_tfidf_batch, and save the id / speaker / party columns plus
# the cleaned text (renamed back to "speech") as
# ../data/processed/house_cleaned_<NNN>.csv.

# Loop-invariant locations: resolve them and create the output directory once.
base_dir = Path("../data/merged")
cleaned_dir = Path("../data/processed")
cleaned_dir.mkdir(parents=True, exist_ok=True)

for i in tqdm(range(76, 112), desc="Processing Congresses"):
    print(f"\n--Processing Congress {i} --")  # leading newline for readability next to the tqdm bar
    year_str = f"{i:03}"
    house_file = base_dir / f"house_db/house_merged_{year_str}.csv"

    if not house_file.exists():
        print(f"File not found, skipping: {house_file}")
        continue

    try:
        df = pd.read_csv(house_file)
    except Exception as e:
        # Best-effort batch run: report the unreadable file and move on to the next Congress.
        print(f"Error reading {house_file}: {e}. Skipping.")
        continue

    if "speech" not in df.columns:
        print(f"'speech' column not found in {house_file}. Skipping cleaning.")
        continue

    print(f"Cleaning speeches for Congress {year_str}...")
    # Missing speeches become empty strings so the cleaner only ever receives str values.
    speeches_to_clean = df["speech"].fillna("").astype(str).tolist()

    if not speeches_to_clean:
        print(f"No speeches to clean in Congress {year_str} after filtering NaNs. Skipping cleaning.")
        continue

    # Use the batch cleaning function.
    # If clean_texts_for_tfidf_batch has its own tqdm, it will be a nested bar.
    cleaned_speeches_list = clean_texts_for_tfidf_batch(speeches_to_clean)
    df["cleaned_speech"] = cleaned_speeches_list

    output_cleaned = cleaned_dir / f"house_cleaned_{year_str}.csv"

    # Keep only the wanted columns that exist in this file. 'cleaned_speech' was
    # assigned just above, so it is always present and no "missing column" branch
    # is needed here.
    columns_to_keep = [
        col
        for col in ["speech_id", 'speakerid', 'party', 'cleaned_speech']
        if col in df.columns
    ]
    # Rename for consistency: the saved file exposes the cleaned text as "speech".
    df_to_save = df[columns_to_keep].rename(columns={'cleaned_speech': 'speech'})

    try:
        df_to_save.to_csv(output_cleaned, index=False)
    except Exception as e:
        # A failed write for one Congress should not abort the remaining ones.
        print(f"Error saving cleaned data for Congress {year_str} to {output_cleaned}: {e}")
    else:
        print(f"Saved cleaned data for Congress {year_str} to {output_cleaned}")

print("\nAll Congresses processed.")

## Test

In [None]:
# Test: check that each cleaned file contains the same set of speech_id values
# as the merged file it was produced from.
base_dir = Path("../data/merged")
# The cleaning cell writes house_cleaned_<NNN>.csv directly under ../data/processed
# (no "house_db" subfolder), so the cleaned files are read from that same location.
base_dir_processed = Path("../data/processed")

for i in range(76, 112):
    print(f'── Processing Congress {i} ──')
    year_str = f"{i:03}"
    house_file = base_dir / f"house_db/house_merged_{year_str}.csv"
    house_file_processed = base_dir_processed / f"house_cleaned_{year_str}.csv"

    # The cleaning cell skips Congresses whose merged file is missing, so there is
    # nothing to compare for them; skip here as well instead of crashing.
    if not house_file.exists():
        print(f"File not found, skipping: {house_file}")
        continue
    # A merged file without a cleaned counterpart is reported rather than raised,
    # so the remaining Congresses are still checked.
    if not house_file_processed.exists():
        print(f"Cleaned file not found, skipping: {house_file_processed}")
        continue

    df = pd.read_csv(house_file)
    df_processed = pd.read_csv(house_file_processed)

    # Set comparison: ignores row order and duplicate ids.
    matches = set(df["speech_id"]) == set(df_processed["speech_id"])

    print("The db are equal", matches)

In [1]:
import re
import string
from typing import List, Set

# NLTK imports
import nltk
from nltk.tokenize import word_tokenize
# Make sure the NLTK data used by this notebook is installed locally.
# A missing resource surfaces as LookupError; in that case it is downloaded
# and, for the stopword list, the lookup is repeated.
def _english_stopwords():
    """Return NLTK's English stopword list (raises LookupError if the corpus is absent)."""
    return nltk.corpus.stopwords.words('english')


try:
    STOPWORDS_CORPUS = _english_stopwords()
except LookupError:
    nltk.download('stopwords')
    STOPWORDS_CORPUS = _english_stopwords()

# NOTE(review): newer NLTK releases may look up 'tokenizers/punkt_tab' for
# word_tokenize instead of 'tokenizers/punkt' — confirm against the installed version.
_PUNKT_RESOURCE = 'tokenizers/punkt'
try:
    nltk.data.find(_PUNKT_RESOURCE)
except LookupError:
    nltk.download('punkt')

# SpaCy import
import spacy

In [2]:
# Load the small English spaCy pipeline with the parser and NER components disabled,
# then print the lemma of the first token for a few single-word inputs.
NLP = spacy.load("en_core_web_sm", disable=["parser", "ner"])
for sample_word in ("representatives", "acting", "trod"):
    first_token = NLP(sample_word)[0]
    print(first_token.lemma_)

representative
act
trod
