# Filtering the Congressional Record and Hansard


## Setup


In [1]:
import os
import shutil
import ssl
import re
import nltk
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import random
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_score, recall_score

# Input locations. Each value ends with '/' because file names are appended
# to it by plain string concatenation in the cells below.
CONGRESSIONAL_RECORD_PATH = '../../congressional-record/dist/'
HANSARD_PATH = '../../hansard-in-full/'
CLIMATE_DICTIONARY_PATH = '../dictionaries/dist/'
CONGRESSIONAL_RECORD_PROCEDURAL_STEMS_PATH = '../dictionaries/dist/'
HANSARD_PROCEDURAL_STEMS_PATH = '../dictionaries/dist/'

# Output locations. Intermediate and filtered corpora are written under
# DATA_PATH; DIST_PATH is not referenced by the cells shown in this notebook.
DATA_PATH = 'data/'
DIST_PATH = 'dist/'

# Inclusive (first_year, last_year) window: the preprocessing helpers iterate
# over range(YEAR_RANGE[0], YEAR_RANGE[1] + 1).
YEAR_RANGE = (1997, 2015)

plt.style.use('ggplot')

# Ignore SSL certificate errors
# NOTE(review): this swaps the default HTTPS context factory for the whole
# process, not only for the nltk downloads below - presumably a workaround for
# local certificate problems; confirm it is still needed before reusing this
# notebook, since it disables certificate verification.
ssl._create_default_https_context = ssl._create_unverified_context

# Fetch the NLTK resources used by the preprocessing functions
# (word_tokenize and stopwords.words('english')).
# NOTE(review): newer NLTK releases may additionally require 'punkt_tab' for
# word_tokenize - confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/felixwallis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/felixwallis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Preprocessing


### Preprocessing functions for the Congressional Record and Hansard


In [2]:
# Module-level helpers shared by tokenize_and_stem: the English stop-word list
# held as a set (membership is tested once per token) and a single Porter
# stemmer instance reused for every token.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()


def tokenize_and_stem(text):
    """Return the (token, stem) pairs of a single document.

    Non-string input is converted with ``str()``. The text is lower-cased and
    every character other than ``a-z`` and whitespace is removed before it is
    tokenised with ``word_tokenize``. Tokens found in the module-level
    ``stop_words`` set are dropped; each remaining token is paired with its
    Porter stem.
    """
    raw = text if isinstance(text, str) else str(text)
    letters_only = re.sub(r'[^a-z\s]', '', raw.lower())

    pairs = []
    for token in word_tokenize(letters_only):
        if token in stop_words:
            continue
        # Keep the surface form next to the stem so both can be exported later
        pairs.append((token, stemmer.stem(token)))
    return pairs


def corpus_cleaning_tokenizing_stemming(corpus: pd.DataFrame, text_column_name: str, year: int):
    """Tokenise and stem every document of one year of a corpus.

    Args:
        corpus: DataFrame with a ``year`` column and a text column.
        text_column_name: Name of the column holding the raw text.
        year: Only rows whose ``year`` equals this value are processed.

    Returns:
        A new DataFrame (the input is not modified) containing the rows of
        ``year`` that have at least 30 stems, with two added columns:
        ``cleaned_stems_with_original`` (list of ``(token, stem)`` pairs) and
        ``stem_count``.
    """
    # Select the year first and copy only that slice. Copying the whole corpus
    # before filtering duplicates every year's text on each call, although
    # only one year is ever used.
    yearly = corpus[corpus['year'] == year].copy()

    tqdm.pandas(desc=f"Processing Text for Year {year}")
    yearly['cleaned_stems_with_original'] = yearly[text_column_name].progress_apply(
        tokenize_and_stem)

    # Remove any documents with fewer than 30 stems
    yearly['stem_count'] = yearly['cleaned_stems_with_original'].apply(len)
    yearly = yearly[yearly['stem_count'] >= 30]

    return yearly


def process_dataframe(df, text_column_name, year_range, data_path,
                      file_prefix='congressional_record'):
    """Preprocess a corpus one year at a time and pickle each year to disk.

    Args:
        df: Corpus DataFrame with a ``year`` column.
        text_column_name: Name of the column holding the raw text.
        year_range: ``(first_year, last_year)``; both ends are processed.
        data_path: Directory under which ``temp_yearly_dataframes`` is created.
        file_prefix: Prefix of the per-year pickle names
            (``{file_prefix}_{year}.pkl``). The default keeps the historical
            name, which was used for every corpus; whatever reads the files
            back must use the same prefix.

    Returns:
        Path of the temporary directory containing the yearly pickles. The
        caller is responsible for removing it.
    """
    temp_dir = os.path.join(data_path, "temp_yearly_dataframes")
    os.makedirs(temp_dir, exist_ok=True)

    for year in range(year_range[0], year_range[1] + 1):
        yearly_df = corpus_cleaning_tokenizing_stemming(
            df, text_column_name, year)
        yearly_df.to_pickle(os.path.join(
            temp_dir, f"{file_prefix}_{year}.pkl"))

    return temp_dir


def concatenate_dataframes(temp_dir, year_range, file_prefix='congressional_record'):
    """Load the per-year pickles from ``temp_dir`` and stack them.

    Args:
        temp_dir: Directory containing one ``{file_prefix}_{year}.pkl`` file
            for every year in ``year_range``.
        year_range: ``(first_year, last_year)``; both ends are loaded.
        file_prefix: Prefix of the pickle file names. The default matches the
            historical hard-coded name.

    Returns:
        A single DataFrame with the yearly frames concatenated in year order
        and a fresh 0..n-1 index.

    Note:
        ``pd.read_pickle`` must only be pointed at files this notebook wrote
        itself; unpickling untrusted files is unsafe.
    """
    yearly_dataframes = []

    for year in range(year_range[0], year_range[1] + 1):
        print(f"Loading Year {year}")
        # Append directly: every frame stays referenced by the list until the
        # concat below, so deleting a loop variable would not release memory.
        yearly_dataframes.append(pd.read_pickle(os.path.join(
            temp_dir, f"{file_prefix}_{year}.pkl")))

    return pd.concat(yearly_dataframes, ignore_index=True)


def corpus_filtering(corpus: pd.DataFrame, min_df: int = 10):
    """Drop rare stems from every document of a preprocessed corpus.

    A ``CountVectorizer(min_df=min_df)`` is fitted on the space-joined stems
    of each document; only stems that end up in its vocabulary are kept.

    The ``cleaned_stems_with_original`` column of ``corpus`` is overwritten in
    place and the same DataFrame object is returned.

    NOTE(review): the vectorizer is built with default settings apart from
    ``min_df``; its default tokenisation presumably ignores one-character
    tokens, which would remove such stems regardless of frequency - confirm.
    """
    stem_documents = corpus['cleaned_stems_with_original'].apply(
        lambda pairs: ' '.join([stem for _, stem in pairs]))

    # The fitted vocabulary is the set of stems that pass the min_df threshold
    vectorizer = CountVectorizer(min_df=min_df)
    vectorizer.fit_transform(stem_documents)
    allowed_stems = set(vectorizer.get_feature_names_out())

    def keep_allowed(pairs):
        return [(token, stem) for token, stem in pairs if stem in allowed_stems]

    tqdm.pandas(desc="Filtering Stems")
    corpus['cleaned_stems_with_original'] = (
        corpus['cleaned_stems_with_original'].progress_apply(keep_allowed))

    return corpus

### Congressional Record preprocessing


In [3]:
# Load the raw Congressional Record speeches
congressional_record = pd.read_csv(
    f"{CONGRESSIONAL_RECORD_PATH}congressional_record.csv")

**Preprocessing the Congressional Record**


In [4]:
# Parse the YYYYMMDD dates and derive the year used for per-year processing
congressional_record['date'] = pd.to_datetime(
    congressional_record['date'], format='%Y%m%d')
congressional_record['year'] = congressional_record['date'].dt.year

# Remove any speeches with missing speaker IDs
congressional_record = congressional_record.dropna(subset=['speaker_id'])

# Remove any speeches that are not from Democrats or Republicans
congressional_record = congressional_record[
    congressional_record['party'].isin(['D', 'R'])]

# Tokenise and stem year by year; each year is pickled into temp_dir
temp_dir = process_dataframe(
    congressional_record, 'speech', YEAR_RANGE, DATA_PATH)

# Read the yearly pickles back from the directory process_dataframe actually
# wrote to, rather than a hard-coded path that only matches the default
# DATA_PATH
processed_congressional_record = concatenate_dataframes(
    temp_dir, YEAR_RANGE)

# Drop stems that are too rare across the whole corpus
processed_congressional_record = corpus_filtering(
    processed_congressional_record)

processed_congressional_record.to_parquet(
    DATA_PATH + 'congressional_record.parquet')

# Free memory by deleting the processed dataframe
del processed_congressional_record

# The yearly pickles are no longer needed once the parquet file is written
shutil.rmtree(temp_dir)

Processing Text for Year 1997: 100%|██████████| 69432/69432 [01:29<00:00, 775.46it/s] 
Processing Text for Year 1998: 100%|██████████| 71324/71324 [01:29<00:00, 796.31it/s] 
Processing Text for Year 1999: 100%|██████████| 75469/75469 [01:38<00:00, 768.71it/s] 
Processing Text for Year 2000: 100%|██████████| 65399/65399 [01:27<00:00, 748.94it/s] 
Processing Text for Year 2001: 100%|██████████| 63817/63817 [01:27<00:00, 732.94it/s] 
Processing Text for Year 2002: 100%|██████████| 52146/52146 [01:12<00:00, 715.58it/s] 
Processing Text for Year 2003: 100%|██████████| 73362/73362 [01:39<00:00, 733.75it/s] 
Processing Text for Year 2004: 100%|██████████| 53941/53941 [01:16<00:00, 705.87it/s] 
Processing Text for Year 2005: 100%|██████████| 64182/64182 [01:31<00:00, 702.87it/s] 
Processing Text for Year 2006: 100%|██████████| 52510/52510 [01:12<00:00, 722.29it/s] 
Processing Text for Year 2007: 100%|██████████| 80177/80177 [01:48<00:00, 737.61it/s] 
Processing Text for Year 2008: 100%|███████

Loading Year 1997
Loading Year 1998
Loading Year 1999
Loading Year 2000
Loading Year 2001
Loading Year 2002
Loading Year 2003
Loading Year 2004
Loading Year 2005
Loading Year 2006
Loading Year 2007
Loading Year 2008
Loading Year 2009
Loading Year 2010
Loading Year 2011
Loading Year 2012
Loading Year 2013
Loading Year 2014
Loading Year 2015


Filtering Stems: 100%|██████████| 526039/526039 [04:51<00:00, 1805.84it/s] 


### Hansard preprocessing


In [5]:
# Load the raw Hansard speeches (with MP details attached)
hansard = pd.read_csv(f"{HANSARD_PATH}hansard_with_mp_details.csv")

**Preprocessing Hansard**


In [6]:
# Parse the speech dates once and derive the year used for per-year processing
speech_dates = pd.to_datetime(hansard['speech_date'])
hansard['speech_date'] = speech_dates
hansard['year'] = speech_dates.dt.year

# Remove any speeches with missing memberships
hansard = hansard.dropna(subset=['memberships'])

# Clean party names: fold party variants into the two main parties
party_aliases = {
    'Labour/Co-operative': 'Labour',
    'Independent Labour': 'Labour',
    'Independent Conservative': 'Conservative',
}
hansard['speech_party'] = hansard['speech_party'].replace(party_aliases)

# Remove any speeches that are not from Labour or Conservative MPs
is_main_party = hansard['speech_party'].isin(['Labour', 'Conservative'])
hansard = hansard[is_main_party]

# Tokenise and stem year by year, then stitch the yearly pickles back together
temp_dir = process_dataframe(hansard, 'text', YEAR_RANGE, DATA_PATH)
processed_hansard = concatenate_dataframes(temp_dir, YEAR_RANGE)

# Drop stems that are too rare across the whole corpus and persist the result
processed_hansard = corpus_filtering(processed_hansard)
processed_hansard.to_parquet(f"{DATA_PATH}hansard.parquet")

# Free memory by deleting the processed dataframe
del processed_hansard

# The yearly pickles are no longer needed once the parquet file is written
shutil.rmtree(temp_dir)

Processing Text for Year 1997: 100%|██████████| 24339/24339 [00:26<00:00, 922.33it/s] 
Processing Text for Year 1998: 100%|██████████| 51348/51348 [00:51<00:00, 995.76it/s] 
Processing Text for Year 1999: 100%|██████████| 47274/47274 [00:48<00:00, 968.70it/s] 
Processing Text for Year 2000: 100%|██████████| 48736/48736 [00:46<00:00, 1042.47it/s]
Processing Text for Year 2001: 100%|██████████| 39627/39627 [00:26<00:00, 1493.44it/s]
Processing Text for Year 2002: 100%|██████████| 40057/40057 [00:14<00:00, 2857.01it/s]
Processing Text for Year 2003: 100%|██████████| 44968/44968 [00:16<00:00, 2777.20it/s]
Processing Text for Year 2004: 100%|██████████| 45883/45883 [00:16<00:00, 2751.11it/s]
Processing Text for Year 2005: 100%|██████████| 43149/43149 [00:15<00:00, 2715.22it/s]
Processing Text for Year 2006: 100%|██████████| 49239/49239 [00:18<00:00, 2667.33it/s]
Processing Text for Year 2007: 100%|██████████| 51223/51223 [00:19<00:00, 2578.38it/s]
Processing Text for Year 2008: 100%|███████

Loading Year 1997
Loading Year 1998
Loading Year 1999
Loading Year 2000
Loading Year 2001
Loading Year 2002
Loading Year 2003
Loading Year 2004
Loading Year 2005
Loading Year 2006
Loading Year 2007
Loading Year 2008
Loading Year 2009
Loading Year 2010
Loading Year 2011
Loading Year 2012
Loading Year 2013
Loading Year 2014
Loading Year 2015


Filtering Stems: 100%|██████████| 473093/473093 [00:29<00:00, 16056.36it/s] 


## Filtering


### Filtering functions for the Congressional Record and Hansard


In [7]:
def term_proportion(doc_stems, terms):
    """Return the share of a document's distinct stems that are in ``terms``.

    The document is reduced to its set of distinct stems. The numerator counts
    how many entries of ``terms`` occur in that set; the denominator is the
    number of distinct stems. A document without stems yields ``0``.
    """
    distinct_stems = set(doc_stems)

    matches = 0
    for term in terms:
        if term in distinct_stems:
            matches += 1

    # Guard against division by zero for empty documents
    if not distinct_stems:
        return 0
    return matches / len(distinct_stems)


def procedural_stems_filter(corpus_df, procedural_stems, threshold: float = 0.5):
    """Remove procedural documents, then strip procedural stems from the rest.

    Works on a copy of ``corpus_df``. Each document's
    ``procedural_proportion`` is computed with ``term_proportion`` over its
    stems; only rows whose proportion is strictly below ``threshold`` are
    kept. Procedural stems are then removed from the surviving documents and
    two space-joined text columns are derived: ``cleaned_stems`` and
    ``cleaned_tokens``. The ``cleaned_stems_with_original`` column is dropped
    from the returned DataFrame.
    """
    pairs_col = 'cleaned_stems_with_original'

    def procedural_share(doc):
        return term_proportion([stem for _, stem in doc], procedural_stems)

    def without_procedural(doc):
        return [(token, stem) for token, stem in doc
                if stem not in procedural_stems]

    def joined_stems(doc):
        return ' '.join([stem for _, stem in doc])

    def joined_tokens(doc):
        return ' '.join([token for token, _ in doc])

    result = corpus_df.copy()

    # Keep only documents whose procedural share is below the threshold
    tqdm.pandas(desc="Calculating procedural proportion")
    result['procedural_proportion'] = result[pairs_col].progress_apply(
        procedural_share)
    result = result[result['procedural_proportion'] < threshold]

    # Strip the procedural stems from the documents that remain
    tqdm.pandas(desc="Removing procedural stems")
    result[pairs_col] = result[pairs_col].progress_apply(without_procedural)

    tqdm.pandas(desc="Joining cleaned stems")
    result['cleaned_stems'] = result[pairs_col].progress_apply(joined_stems)

    tqdm.pandas(desc="Joining cleaned tokens")
    result['cleaned_tokens'] = result[pairs_col].progress_apply(joined_tokens)

    return result.drop(columns=[pairs_col])


def topic_stems_filter(corpus_df, topic_stems, threshold: float = 0.2):
    """Keep the documents whose share of topic stems exceeds ``threshold``.

    Args:
        corpus_df: DataFrame with a ``cleaned_stems`` column of space-joined
            stems. Cells that are not strings (for example the NaN that an
            empty stem string turns into after a CSV round trip) are treated
            as documents without stems instead of raising on ``.split()``.
        topic_stems: Collection of stems that define the topic.
        threshold: Rows are kept only when ``topic_proportion`` is strictly
            greater than this value.

    Returns:
        A filtered copy of ``corpus_df`` with an added ``topic_proportion``
        column; the input DataFrame is not modified.
    """
    corpus_df = corpus_df.copy()

    def topic_share(doc):
        # Non-string cells carry no stems, so their proportion is 0
        stems = doc.split() if isinstance(doc, str) else []
        return term_proportion(stems, topic_stems)

    corpus_df['topic_proportion'] = corpus_df['cleaned_stems'].apply(
        topic_share)
    corpus_df = corpus_df[corpus_df['topic_proportion'] > threshold]

    return corpus_df

### Removing procedural documents and stems from the Congressional Record and Hansard


In [8]:
# Reload the preprocessed corpora written by the preprocessing cells
congressional_record = pd.read_parquet(
    f"{DATA_PATH}congressional_record.parquet")
hansard = pd.read_parquet(f"{DATA_PATH}hansard.parquet")

# Each dictionary file has a 'stem' column; hold the stems as sets so the
# filters can test membership directly
congressional_record_procedural_stems = set(pd.read_csv(
    f"{CONGRESSIONAL_RECORD_PROCEDURAL_STEMS_PATH}shortened_congressional_record_procedural_stems.csv"
)['stem'].tolist())

hansard_procedural_stems = set(pd.read_csv(
    f"{HANSARD_PROCEDURAL_STEMS_PATH}expanded_hansard_procedural_stems.csv"
)['stem'].tolist())

climate_stems = set(pd.read_csv(
    f"{CLIMATE_DICTIONARY_PATH}shortened_climate_stems.csv"
)['stem'].tolist())

**Removing procedural documents and stems from the Congressional Record**


In [9]:
# Drop procedural speeches and strip procedural stems, then persist as CSV
non_procedural_congressional_record = procedural_stems_filter(
    congressional_record,
    congressional_record_procedural_stems,
    threshold=0.5,
)

non_procedural_congressional_record.to_csv(
    f"{DATA_PATH}non_procedural_congressional_record.csv", index=False)

Calculating procedural proportion: 100%|██████████| 526039/526039 [01:51<00:00, 4721.04it/s]
Removing procedural stems: 100%|██████████| 478172/478172 [01:35<00:00, 5000.94it/s]
Joining cleaned stems: 100%|██████████| 478172/478172 [00:07<00:00, 63469.07it/s]
Joining cleaned tokens: 100%|██████████| 478172/478172 [00:07<00:00, 67476.77it/s]


**Removing procedural documents and stems from Hansard**


In [10]:
# Drop procedural speeches and strip procedural stems, then persist as CSV
non_procedural_hansard = procedural_stems_filter(
    hansard,
    hansard_procedural_stems,
    threshold=0.5,
)

non_procedural_hansard.to_csv(
    f"{DATA_PATH}non_procedural_hansard.csv", index=False)

Calculating procedural proportion: 100%|██████████| 473093/473093 [00:26<00:00, 17985.83it/s]
Removing procedural stems: 100%|██████████| 448828/448828 [00:14<00:00, 31575.21it/s]
Joining cleaned stems: 100%|██████████| 448828/448828 [00:01<00:00, 439997.74it/s]
Joining cleaned tokens: 100%|██████████| 448828/448828 [00:01<00:00, 432115.09it/s]


### Separating climate change documents from the Congressional Record and Hansard


**Separating climate change documents from the Congressional Record**


In [11]:
# Reload the non-procedural corpus from disk
non_procedural_congressional_record = pd.read_csv(
    f"{DATA_PATH}non_procedural_congressional_record.csv")

# Keep documents above the 0.02 climate-stem share, most climate-heavy first
climate_congressional_record = topic_stems_filter(
    non_procedural_congressional_record, climate_stems, threshold=0.02
).sort_values('topic_proportion', ascending=False)

climate_congressional_record.to_csv(
    f"{DATA_PATH}climate_congressional_record.csv", index=False)

**Separating climate change documents from Hansard**


In [12]:
# Reload the non-procedural corpus from disk
non_procedural_hansard = pd.read_csv(f"{DATA_PATH}non_procedural_hansard.csv")

# Keep documents above the 0.025 climate-stem share, most climate-heavy first
climate_hansard = topic_stems_filter(
    non_procedural_hansard, climate_stems, threshold=0.025
).sort_values('topic_proportion', ascending=False)

climate_hansard.to_csv(f"{DATA_PATH}climate_hansard.csv", index=False)