# Filtering the Congressional Record and Hansard


This notebook contains the code needed to filter the Congressional Record and Hansard datasets. The code creates three pairs of corpora with cleaned and stemmed text. The first pair contains only speeches from the Congressional Record and Hansard that discuss issues relating to climate change. The second pair contains all the other speeches, i.e. those that are not related to climate change. The third pair contains only speeches from the Congressional Record and Hansard that discuss counter-terrorism.


## Setup


In [1]:
import ssl
import re
import nltk
import pandas as pd
import numpy as np
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Directories holding the two source datasets (read by the loading cells)
CONGRESSIONAL_RECORD_PATH = '../congressional-record/dist/'
HANSARD_PATH = '../hansard-in-full/'
# Directories holding the stem dictionaries; all three currently point at the
# same folder
CLIMATE_DICTIONARY_PATH = 'dictionaries/dist/'
CONGRESSIONAL_RECORD_PROCEDURAL_STEMS_PATH = 'dictionaries/dist/'
HANSARD_PROCEDURAL_STEMS_PATH = 'dictionaries/dist/'

# DATA_PATH receives the intermediate CSVs this notebook writes and re-reads.
# NOTE(review): DIST_PATH is not used by the preprocessing/filtering cells;
# presumably it is meant for final outputs - confirm.
DATA_PATH = 'data/'
DIST_PATH = 'dist/'

# (first_year, last_year) kept by corpus_preprocessing; it is passed to
# Series.between, which includes both endpoints by default
YEAR_RANGE = (1997, 2015)

# Ignore SSL certificate errors
# NOTE(review): this swaps the default HTTPS context factory for an
# unverified one for the whole kernel session, so certificate checks are
# disabled for every later HTTPS request that relies on the default context,
# not only for the NLTK downloads below. Presumably a workaround for local
# certificate problems - confirm it is still needed.
ssl._create_default_https_context = ssl._create_unverified_context

# Fetch the NLTK resources used later: presumably 'punkt' backs
# word_tokenize and 'stopwords' backs stopwords.words('english')
nltk.download('punkt')
nltk.download('stopwords')

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/felixwallis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/felixwallis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Preprocessing


### Preprocessing functions for the Congressional Record and Hansard


In [2]:
# NLTK's English stop-word list, held as a set for fast membership tests
stop_words = set(stopwords.words('english'))
# Single Porter stemmer instance shared by every call to tokenize_and_stem
stemmer = PorterStemmer()


def tokenize_and_stem(text):
    """Return `text` as a single space-separated string of cleaned stems.

    Steps, in order: coerce non-strings with `str()`, lowercase, delete every
    character that is not `a-z` or whitespace, tokenize with `word_tokenize`,
    drop tokens found in `stop_words`, and stem what is left with `stemmer`.

    Because unwanted characters are deleted rather than replaced by a space,
    words joined by punctuation are fused (e.g. "climate-change" becomes the
    single token "climatechange" before stemming).
    """
    # Text should almost always be a string; anything else is coerced
    lowered = (text if isinstance(text, str) else str(text)).lower()
    # Strip punctuation, digits, and symbols
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    # Tokenize, skip stop words, and stem the survivors in one pass
    stems = [
        stemmer.stem(token)
        for token in word_tokenize(letters_only)
        if token not in stop_words
    ]
    return ' '.join(stems)


def corpus_preprocessing(corpus: pd.DataFrame,
                         text_column_name: str,
                         year_range: tuple,
                         year_column_name: str = 'year',
                         min_stem_count: int = 10) -> pd.DataFrame:
    """Restrict a corpus to a year range and add cleaned, stemmed text.

    Args:
        corpus: One row per document. The caller's frame is not modified.
        text_column_name: Column holding the raw document text.
        year_range: `(first_year, last_year)`; both endpoints are kept.
        year_column_name: Column holding each document's year.
        min_stem_count: Documents with fewer stems than this are dropped.
            Defaults to 10.

    Returns:
        A new DataFrame containing the rows inside `year_range` that have at
        least `min_stem_count` stems, with two added columns:
        `cleaned_stems` (the output of `tokenize_and_stem`) and `stem_count`
        (the number of whitespace-separated stems).
    """
    # Filter first, then copy: only the surviving rows are duplicated, and
    # the explicit copy keeps the column assignments below off the caller's
    # frame
    in_year_range = corpus[year_column_name].between(
        year_range[0], year_range[1])
    corpus = corpus[in_year_range].copy()
    # Clean, tokenize, and stem the corpus (tqdm supplies progress_apply)
    tqdm.pandas(desc="Processing Text")
    corpus['cleaned_stems'] = corpus[text_column_name].progress_apply(
        tokenize_and_stem)
    # Remove any documents with fewer than min_stem_count stems
    corpus['stem_count'] = corpus['cleaned_stems'].apply(
        lambda stems: len(stems.split()))
    corpus = corpus[corpus['stem_count'] >= min_stem_count]
    return corpus

### Congressional Record preprocessing


**Loading the Congressional Record**


In [3]:
# Read the source Congressional Record CSV from CONGRESSIONAL_RECORD_PATH
congressional_record = pd.read_csv(
    CONGRESSIONAL_RECORD_PATH + 'congressional_record.csv')

**Preprocessing the Congressional Record**


In [4]:
# Parse the 'date' column using the compact YYYYMMDD format
congressional_record['date'] = pd.to_datetime(
    congressional_record['date'], format='%Y%m%d')
# corpus_preprocessing filters on a 'year' column by default, so derive it
congressional_record['year'] = congressional_record['date'].dt.year

# Restrict to YEAR_RANGE and add the stemmed-text columns, reading raw text
# from the 'speech' column. This rebinds congressional_record to the
# preprocessed frame (the recorded run took roughly 30 minutes).
congressional_record = corpus_preprocessing(
    congressional_record, 'speech', YEAR_RANGE)

# Persist the result; the filtering section reloads it from DATA_PATH
congressional_record.to_csv(
    DATA_PATH + 'congressional_record.csv', index=False)

Processing Text: 100%|██████████| 1696694/1696694 [30:35<00:00, 924.62it/s] 


### Hansard preprocessing


**Loading Hansard**


In [5]:
# Read the source Hansard CSV from HANSARD_PATH
hansard = pd.read_csv(HANSARD_PATH + 'hansard_with_mp_details.csv')

**Preprocessing Hansard**


In [6]:
# Parse 'speech_date'; no explicit format is given, so pandas infers it
hansard['speech_date'] = pd.to_datetime(hansard['speech_date'])
# corpus_preprocessing filters on a 'year' column by default, so derive it
hansard['year'] = hansard['speech_date'].dt.year

# Restrict to YEAR_RANGE and add the stemmed-text columns, reading raw text
# from the 'text' column. This rebinds hansard to the preprocessed frame.
hansard = corpus_preprocessing(hansard, 'text', YEAR_RANGE)

# Persist the result; the filtering section reloads it from DATA_PATH
hansard.to_csv(DATA_PATH + 'hansard.csv', index=False)

Processing Text: 100%|██████████| 1125378/1125378 [08:51<00:00, 2115.96it/s]


## Filtering


### Filtering functions for the Congressional Record and Hansard


In [7]:
# Reload the preprocessed corpora written to DATA_PATH by the preprocessing
# cells, so filtering can start without re-running the stemming step
hansard = pd.read_csv(DATA_PATH + 'hansard.csv')
congressional_record = pd.read_csv(DATA_PATH + 'congressional_record.csv')

# Each dictionary CSV is read for its 'stem' column, which is held as a set
# for fast membership tests
congressional_record_procedural_stems = set(
    pd.read_csv(
        CONGRESSIONAL_RECORD_PROCEDURAL_STEMS_PATH
        + 'shortened_congressional_record_procedural_stems.csv'
    )['stem'].tolist()
)
hansard_procedural_stems = set(
    pd.read_csv(
        HANSARD_PROCEDURAL_STEMS_PATH
        + 'shortened_hansard_procedural_stems.csv'
    )['stem'].tolist()
)

climate_stems = set(
    pd.read_csv(CLIMATE_DICTIONARY_PATH + 'climate_stems.csv')['stem'].tolist()
)


def term_proportion(doc, terms):
    """Return the share of a document's distinct tokens that appear in `terms`.

    `doc` is split on whitespace and de-duplicated, so a repeated word counts
    once. The numerator counts every element of `terms` that is present among
    those distinct tokens; the denominator is the number of distinct tokens.
    Returns 0 when the document has no tokens.
    """
    unique_tokens = set(doc.split())
    matched = sum(1 for term in terms if term in unique_tokens)

    # Guard against dividing by zero on empty / whitespace-only documents
    if not unique_tokens:
        return 0
    return matched / len(unique_tokens)


def procedural_stems_filter(corpus_df, procedural_stems, threshold: float = 0.5):
    """Drop mostly-procedural documents and strip procedural stems from the rest.

    Works on a copy of `corpus_df`. A `procedural_proportion` column is added
    by applying `term_proportion` to each document's `cleaned_stems`; only
    rows whose proportion is strictly below `threshold` are kept. In the kept
    rows, every stem found in `procedural_stems` is removed from
    `cleaned_stems`.
    """
    def procedural_share(doc):
        return term_proportion(doc, procedural_stems)

    def strip_procedural(doc):
        return ' '.join(
            stem for stem in doc.split() if stem not in procedural_stems)

    frame = corpus_df.copy()
    # Score each document, then discard those at or above the threshold
    frame['procedural_proportion'] = frame['cleaned_stems'].apply(
        procedural_share)
    frame = frame[frame['procedural_proportion'] < threshold]
    # Strip the procedural stems out of the documents that remain
    frame['cleaned_stems'] = frame['cleaned_stems'].apply(strip_procedural)
    return frame


def topic_stems_filter(corpus_df, topic_stems, threshold: float = 0.2):
    """Keep only documents whose share of topic stems exceeds `threshold`.

    Works on a copy of `corpus_df`. A `topic_proportion` column is added by
    applying `term_proportion` to each document's `cleaned_stems`; rows are
    kept only when that proportion is strictly greater than `threshold`.
    The original index labels are preserved on the returned rows.
    """
    def topic_share(doc):
        return term_proportion(doc, topic_stems)

    scored = corpus_df.copy()
    scored['topic_proportion'] = scored['cleaned_stems'].apply(topic_share)
    # Documents at or below the threshold are treated as off-topic
    on_topic = scored['topic_proportion'] > threshold
    return scored[on_topic]

### Removing procedural documents and stems from the Congressional Record and Hansard


**Removing procedural documents and stems from the Congressional Record**


In [8]:
# Keep speeches whose procedural-stem proportion is below 0.5, with the
# procedural stems stripped from their cleaned_stems
non_procedural_congressional_record = procedural_stems_filter(
    congressional_record, congressional_record_procedural_stems, 0.5)

# Persist the filtered corpus to DATA_PATH
non_procedural_congressional_record.to_csv(
    DATA_PATH + 'non_procedural_congressional_record.csv', index=False)

**Removing procedural documents and stems from Hansard**


In [9]:
# Keep speeches whose procedural-stem proportion is below 0.5, with the
# procedural stems stripped from their cleaned_stems
non_procedural_hansard = procedural_stems_filter(
    hansard, hansard_procedural_stems, 0.5)

# Persist the filtered corpus to DATA_PATH
non_procedural_hansard.to_csv(
    DATA_PATH + 'non_procedural_hansard.csv', index=False)

### Separating climate change documents from the Congressional Record and Hansard


**Separating climate change documents from the Congressional Record**


In [10]:
# Select speeches whose climate-stem proportion exceeds 0.2 and order them
# from most to least climate-heavy
climate_congressional_record = topic_stems_filter(
    non_procedural_congressional_record, climate_stems, 0.2
).sort_values('topic_proportion', ascending=False)

# The complement is every non-procedural speech not selected above; this
# works because topic_stems_filter preserves the original index labels
climate_indices = climate_congressional_record.index
non_climate_congressional_record = non_procedural_congressional_record.drop(
    index=climate_indices)

# Persist both halves of the split to DATA_PATH
climate_congressional_record.to_csv(
    DATA_PATH + 'climate_congressional_record.csv',
    index=False,
)
non_climate_congressional_record.to_csv(
    DATA_PATH + 'non_climate_congressional_record.csv',
    index=False,
)

**Separating climate change documents from Hansard**


In [11]:
# Select speeches whose climate-stem proportion exceeds 0.2 and order them
# from most to least climate-heavy
climate_hansard = topic_stems_filter(
    non_procedural_hansard, climate_stems, 0.2
).sort_values('topic_proportion', ascending=False)

# The complement is every non-procedural speech not selected above; this
# works because topic_stems_filter preserves the original index labels
climate_indices = climate_hansard.index
non_climate_hansard = non_procedural_hansard.drop(index=climate_indices)

# Persist both halves of the split to DATA_PATH
climate_hansard.to_csv(
    DATA_PATH + 'climate_hansard.csv',
    index=False,
)
non_climate_hansard.to_csv(
    DATA_PATH + 'non_climate_hansard.csv',
    index=False,
)