# Congressional Record and Hansard Analysis


This notebook contains the analysis of the Congressional Record and Hansard datasets.


## Setup


In [1]:
import ssl

import nltk
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

# Input locations, relative to this notebook. The trailing slashes matter:
# later cells build file paths by plain string concatenation.
climate_dictionary_path = 'dictionaries/'
hansard_path = '../hansard-in-full/'
congressional_record_path = '../congressional-record/dist/'

# NOTE(review): this swaps the process-wide default HTTPS context factory for
# the unverified one, so certificate checks are disabled for more than just
# the NLTK downloads below. Confirm this is acceptable outside local use.
ssl._create_default_https_context = ssl._create_unverified_context

# Fetch the NLTK resources needed by word_tokenize and stopwords.words.
for _nltk_package in ('punkt', 'stopwords'):
    nltk.download(_nltk_package)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/felixwallis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/felixwallis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Preprocessing


**Preprocessing functions for the Congressional Record and Hansard**


In [20]:
def clean_tokenize(text, stopword_set=None):
    """Lower-case, strip, tokenize and stop-word-filter a single document.

    Parameters
    ----------
    text : str
        Raw document text. Anything that is not a string (for example the
        float NaN that pandas uses for an empty CSV field) is treated as an
        empty document instead of raising ``AttributeError``.
    stopword_set : collection of str, optional
        Tokens to discard. When omitted, the notebook-level ``stop_words``
        global is used, so that name must be defined before the first call.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; ``''`` when nothing
        is left or when ``text`` is not a string.
    """
    # Non-string cells have no .lower(); treat them as empty documents.
    if not isinstance(text, str):
        return ''
    if stopword_set is None:
        # Resolved at call time, so the global may be defined in a later cell
        # than this function as long as it exists before the first call.
        stopword_set = stop_words
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation, numbers, and symbols. Characters are deleted rather
    # than replaced by a space, so "don't" becomes "dont".
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stop words and return a single space-joined string
    return ' '.join(word for word in tokens if word not in stopword_set)

### Congressional Record preprocessing


**Cleaning and tokenizing the Congressional Record**


In [2]:
# clean_tokenize looks this set up in the notebook namespace on every call,
# so it has to exist before the .apply() below runs.
stop_words = set(stopwords.words('english'))

# Load the congress_111.csv extract of the Congressional Record.
congressional_record = pd.read_csv(
    f'{congressional_record_path}individual_congresses/congress_111.csv')

# Store each speech as one space-joined string of cleaned tokens.
congressional_record['cleaned_tokens'] = (
    congressional_record['speech'].apply(clean_tokenize))

**Removing any documents with 10 or fewer tokens after cleaning**


In [3]:
# Number of whitespace-separated tokens left in each cleaned speech.
congressional_record['token_count'] = congressional_record['cleaned_tokens'].map(
    lambda cleaned: len(cleaned.split()))

# Keep only speeches with more than 10 tokens.
congressional_record = congressional_record.loc[
    lambda frame: frame['token_count'] > 10]

### Hansard preprocessing


**Loading Hansard**


In [27]:
# Read the hansard_with_mp_details.csv file into a DataFrame.
hansard = pd.read_csv(f'{hansard_path}hansard_with_mp_details.csv')

**Removing Hansard content from before 2009 and after 2011**


In [28]:
# Parse the date strings, then derive the calendar year of each speech.
hansard['speech_date'] = pd.to_datetime(hansard['speech_date'])
hansard['year'] = hansard['speech_date'].dt.year

# Keep speeches dated 2009 through 2011; both end years are included.
hansard = hansard.loc[hansard['year'].between(2009, 2011)]

**Cleaning and tokenizing Hansard**


In [29]:
# (Re)build the stop-word set that clean_tokenize reads from the notebook
# namespace, so this cell does not depend on the Congressional Record cells.
stop_words = set(stopwords.words('english'))

# One space-joined string of cleaned tokens per Hansard speech.
hansard['cleaned_tokens'] = hansard['text'].map(clean_tokenize)

**Removing any documents with 10 or fewer tokens after cleaning**


In [30]:
# Number of whitespace-separated tokens left in each cleaned speech.
hansard['token_count'] = hansard['cleaned_tokens'].map(
    lambda cleaned: len(cleaned.split()))

# Keep only speeches with more than 10 tokens.
hansard = hansard.loc[lambda frame: frame['token_count'] > 10]

## Filtering


**Filtering functions for the Congressional Record and Hansard**


In [31]:
def get_ngrams(corpus, ngram_range):
    """Count n-grams across a corpus and rank them by total frequency.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count over; passed directly to
        ``CountVectorizer.fit_transform``.
    ngram_range : tuple of (int, int)
        Minimum and maximum n-gram length, forwarded to ``CountVectorizer``.

    Returns
    -------
    list of (str, int)
        ``(ngram, total_count)`` pairs sorted by count, largest first.
    """
    # NOTE(review): no token_pattern is passed, so CountVectorizer uses its
    # default tokenisation, which may differ from a plain str.split() of the
    # same document (e.g. very short tokens) - confirm this is intended.
    ngram_counter = CountVectorizer(
        ngram_range=ngram_range, preprocessor=None, tokenizer=None)
    doc_term_counts = ngram_counter.fit_transform(corpus)
    # Collapse the document axis to get one total per n-gram, as a flat array.
    totals = np.asarray(doc_term_counts.sum(axis=0)).ravel()
    vocabulary = ngram_counter.get_feature_names_out()
    return sorted(
        zip(vocabulary, totals), key=lambda pair: pair[1], reverse=True)


def procedural_proportion(doc, top_trigrams):
    """Return the share of a document's trigrams found in ``top_trigrams``.

    Parameters
    ----------
    doc : str
        Document as whitespace-separated tokens.
    top_trigrams : iterable of str
        Reference trigrams, each written as three space-joined tokens.
        Converted to a set once per call so every lookup is O(1); iterating
        the argument means a pandas Series contributes its values.

    Returns
    -------
    float or int
        Matching trigrams divided by the number of trigrams in ``doc``
        (repeats are counted each time); ``0`` when ``doc`` has fewer than
        three tokens.
    """
    tokens = doc.split()
    # Consecutive token triples with no padding; empty for < 3 tokens.
    doc_trigrams = [
        ' '.join(triple)
        for triple in zip(tokens, tokens[1:], tokens[2:])
    ]
    if not doc_trigrams:
        return 0
    if not isinstance(top_trigrams, (set, frozenset)):
        top_trigrams = set(top_trigrams)
    procedural_count = sum(trigram in top_trigrams for trigram in doc_trigrams)
    return procedural_count / len(doc_trigrams)


def contains_climate_term(doc, terms=None):
    """Return the number of matched climate terms per unique token in ``doc``.

    Despite the name, the result is a ratio rather than a boolean: the count
    of distinct dictionary terms present in the document divided by the
    number of distinct tokens in the document.

    Parameters
    ----------
    doc : str
        Document as whitespace-separated tokens.
    terms : iterable of str, optional
        Dictionary terms to look for. When omitted, the notebook-level
        ``climate_terms`` global is used, so it must be defined before the
        first call. Non-string entries (e.g. NaN) are ignored.

    Returns
    -------
    float or int
        Matched terms / unique tokens; ``0`` for a document with no tokens.
    """
    if terms is None:
        terms = climate_terms
    tokens = doc.split()
    unique_tokens = set(tokens)
    if not unique_tokens:
        return 0

    # Space-padded copy of the document, so a multi-word term only matches
    # on whole-token boundaries ("climate change" never matches
    # "microclimate changes").
    padded_doc = ' ' + ' '.join(tokens) + ' '

    matched = 0
    for term in terms:
        if not isinstance(term, str):
            continue
        term_tokens = term.split()
        if len(term_tokens) == 1:
            matched += term_tokens[0] in unique_tokens
        elif term_tokens:
            # A set of single tokens can never contain a phrase, so
            # multi-word terms are matched as a contiguous token sequence.
            matched += (' ' + ' '.join(term_tokens) + ' ') in padded_doc
    return matched / len(unique_tokens)

### Congressional Record filtering


**Finding the 100 most frequently used trigrams in the Congressional Record**


In [5]:
# (trigram, corpus-wide count) pairs, most frequent first.
trigrams = get_ngrams(
    congressional_record['cleaned_tokens'], ngram_range=(3, 3))
trigrams_df = pd.DataFrame(trigrams, columns=['trigram', 'frequency'])

# The frame is already sorted by frequency, so the first 100 rows are the top 100.
top_100_trigrams = trigrams_df['trigram'].head(100)

**Removing documents from the Congressional Record in which 20% or more of the trigrams match the 100 most frequently used trigrams**


In [6]:
# Plain Python list of the top trigram strings.
top_100_trigrams_list = list(top_100_trigrams)

# Share of each speech's trigrams that are among the top 100.
congressional_record['procedural_proportion'] = congressional_record['cleaned_tokens'].map(
    lambda speech: procedural_proportion(speech, top_100_trigrams_list))

# Keep speeches where that share is below 20%.
congressional_record = congressional_record.loc[
    lambda frame: frame['procedural_proportion'] < 0.2]

**Extracting documents that discuss climate change**


In [14]:
# Load the climate dictionary and lower-case its 'term' column into a set;
# contains_climate_term reads climate_terms from the notebook namespace.
climate_dictionary = pd.read_csv(
    f'{climate_dictionary_path}cleaned_climate_dictionary.csv')
climate_terms = set(climate_dictionary['term'].str.lower())

# Score every speech, then order the frame from highest to lowest score.
congressional_record['climate_term_proportion'] = (
    congressional_record['cleaned_tokens'].map(contains_climate_term))
congressional_record = congressional_record.sort_values(
    by='climate_term_proportion', ascending=False)

# Speeches whose score exceeds 0.01 form the climate subset.
climate_congressional_record = congressional_record.loc[
    lambda frame: frame['climate_term_proportion'] > 0.01]

### Hansard filtering


**Finding the 100 most frequently used trigrams in Hansard**


In [32]:
# (trigram, corpus-wide count) pairs for Hansard, most frequent first.
trigrams = get_ngrams(hansard['cleaned_tokens'], ngram_range=(3, 3))
trigrams_df = pd.DataFrame(trigrams, columns=['trigram', 'frequency'])

# The frame is already sorted by frequency, so the first 100 rows are the top 100.
top_100_trigrams = trigrams_df['trigram'].head(100)

**Removing documents from Hansard in which 20% or more of the trigrams match the 100 most frequently used trigrams**


In [33]:
# Plain Python list of the top trigram strings.
top_100_trigrams_list = list(top_100_trigrams)

# Share of each speech's trigrams that are among the top 100.
hansard['procedural_proportion'] = hansard['cleaned_tokens'].map(
    lambda speech: procedural_proportion(speech, top_100_trigrams_list))

# Keep speeches where that share is below 20%.
hansard = hansard.loc[lambda frame: frame['procedural_proportion'] < 0.2]

**Extracting documents that discuss climate change**


In [34]:
# Load the climate dictionary and lower-case its 'term' column into a set;
# contains_climate_term reads climate_terms from the notebook namespace.
climate_dictionary = pd.read_csv(
    f'{climate_dictionary_path}cleaned_climate_dictionary.csv')
climate_terms = set(climate_dictionary['term'].str.lower())

# Score every speech, then order the frame from highest to lowest score.
hansard['climate_term_proportion'] = hansard['cleaned_tokens'].map(
    contains_climate_term)
hansard = hansard.sort_values(by='climate_term_proportion', ascending=False)

# Speeches whose score exceeds 0.01 form the climate subset.
climate_hansard = hansard.loc[
    lambda frame: frame['climate_term_proportion'] > 0.01]