# Congressional Record and Hansard Analysis


This notebook contains the analysis of the Congressional Record and Hansard datasets.


## Setup


In [1]:
import ssl

import nltk
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

# Swap in an unverified HTTPS context so certificate errors are ignored.
# NOTE(review): this replaces the default context for the whole process, not
# only for the NLTK downloads below - confirm that is acceptable.
ssl._create_default_https_context = ssl._create_unverified_context

# NLTK resources requested by this notebook
nltk.download('punkt')
nltk.download('stopwords')

# Relative locations of the input data
climate_dictionary_path = 'dist/'
congressional_record_path = '../congressional-record/dist/'

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/felixwallis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/felixwallis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Preprocessing


### Congressional Record preprocessing


In [2]:
# English stop words held in a set, which clean_tokenize tests tokens against
stop_words = set(stopwords.words('english'))

# Read the congress_111.csv file from the Congressional Record directory
congressional_record = pd.read_csv(
    congressional_record_path + 'individual_congresses/congress_111.csv'
)


# Clean and tokenize text
def clean_tokenize(text):
    """Normalise a speech into a space-joined string of non-stop-word tokens.

    The text is lower-cased, every character other than ``a``-``z`` and
    whitespace is deleted (deleted, not replaced by a space, so
    "well-known" becomes "wellknown"), the remainder is tokenized with
    ``word_tokenize`` and tokens present in the module-level ``stop_words``
    set are dropped.

    Parameters
    ----------
    text : str or missing value
        Raw speech text. Non-string values (``None``, or the float ``NaN``
        pandas uses for empty cells) are treated as an empty document
        instead of raising ``AttributeError``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces, or ``''`` when nothing
        survives.
    """
    # Missing cells arrive as None / float NaN, which have no .lower().
    if not isinstance(text, str):
        return ''
    # Lower-case first so the a-z character class also keeps capital letters.
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    # Nothing but whitespace left: the result is empty, skip the tokenizer.
    if not letters_only.strip():
        return ''
    tokens = word_tokenize(letters_only)
    return ' '.join(word for word in tokens if word not in stop_words)


# Clean every speech; the result is one space-joined token string per row
congressional_record['cleaned_tokens'] = (
    congressional_record['speech'].map(clean_tokenize)
)

**Remove any documents that contain 10 or fewer tokens after cleaning**


In [3]:
# Number of whitespace-separated tokens left in each cleaned speech
congressional_record['token_count'] = congressional_record['cleaned_tokens'].apply(
    lambda x: len(x.split()))

# Keep only speeches with more than 10 tokens. The explicit .copy() makes the
# result an independent frame, so adding columns to it in later cells does not
# raise SettingWithCopyWarning.
congressional_record = congressional_record[
    congressional_record['token_count'] > 10].copy()

## Filtering


### Congressional Record filtering


**Returning n-gram frequencies from the Congressional Record**


In [4]:
def get_ngrams(corpus, ngram_range):
    """Count n-grams over a corpus and rank them by total frequency.

    Parameters
    ----------
    corpus : iterable of str
        Documents passed to ``CountVectorizer.fit_transform``.
    ngram_range : tuple of (int, int)
        Forwarded unchanged to ``CountVectorizer``.

    Returns
    -------
    list of (n-gram, count) tuples
        Sorted by count, highest first.

    NOTE(review): no ``token_pattern`` is supplied, so the vectorizer's own
    default tokenization applies - confirm it yields the same tokens as the
    whitespace split used elsewhere in this notebook.
    """
    counter = CountVectorizer(
        tokenizer=None,
        preprocessor=None,
        ngram_range=ngram_range,
    )
    doc_term_counts = counter.fit_transform(corpus)
    # Collapse the document axis: one total per vocabulary entry
    totals = np.asarray(doc_term_counts.sum(axis=0)).ravel()
    vocabulary = counter.get_feature_names_out()
    return sorted(
        zip(vocabulary, totals),
        key=lambda pair: pair[1],
        reverse=True,
    )

**Finding the 100 most frequently used trigrams in the Congressional Record**


In [5]:
# Rank every trigram found in the cleaned speeches
trigrams = get_ngrams(
    congressional_record['cleaned_tokens'], ngram_range=(3, 3))
trigrams_df = pd.DataFrame(trigrams, columns=['trigram', 'frequency'])
# get_ngrams returns the most frequent first, so the leading rows are the top 100
top_100_trigrams = trigrams_df['trigram'].head(100)

**Removing documents from the Congressional Record in which 20% or more of the trigrams match the 100 most frequently used trigrams**


In [6]:
def procedural_proportion(doc, top_trigrams):
    """Return the share of a document's trigrams that appear in ``top_trigrams``.

    Parameters
    ----------
    doc : str
        Document as whitespace-separated tokens.
    top_trigrams : iterable of str
        Reference trigrams, each written as three tokens joined by single
        spaces. Sets and frozensets are used as given; any other iterable
        (list, tuple, pandas Series, ...) is converted to a set of its
        values so each membership test is a hash lookup.

    Returns
    -------
    int or float
        ``0`` when the document has fewer than three tokens, otherwise the
        number of matching trigrams divided by the number of trigrams.
    """
    tokens = doc.split()
    # Every run of three consecutive tokens, re-joined with single spaces
    doc_trigrams = [
        ' '.join(triple) for triple in zip(tokens, tokens[1:], tokens[2:])
    ]
    if not doc_trigrams:
        return 0
    if not isinstance(top_trigrams, (set, frozenset)):
        top_trigrams = set(top_trigrams)
    procedural_count = sum(trigram in top_trigrams for trigram in doc_trigrams)
    return procedural_count / len(doc_trigrams)


# Reference trigrams as a plain Python list
top_100_trigrams_list = top_100_trigrams.tolist()

# Score each speech by the share of its trigrams found among the reference trigrams
congressional_record['procedural_proportion'] = (
    congressional_record['cleaned_tokens']
    .apply(procedural_proportion, args=(top_100_trigrams_list,))
)

# Keep only the speeches whose share is strictly below 0.2
congressional_record = congressional_record.loc[
    congressional_record['procedural_proportion'].lt(0.2)
]