# Hansard Procedural Stems Creation


This notebook extracts terms from locally saved pages of the [online index of Erskine May](https://erskinemay.parliament.uk/browse/indexterms?page=1) to create a list of procedural stems used in the UK Parliament.


## Setup


In [1]:
import os
import ssl
import re
import nltk
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

# Directory holding the saved Erskine May index pages; the raw stem list
# is also written here.
DATA_PATH = 'data/'
# Directory holding the manually shortened stem list that is read back in,
# and the expanded stem list written at the end of the notebook.
DIST_PATH = 'dist/'
# Directory containing the Hansard CSV export (relative path).
HANSARD_PATH = '../../hansard-in-full/'

# (first_year, last_year) bounds used to filter the Hansard corpus by year.
YEAR_RANGE = (1997, 2015)

# Ignore SSL certificate errors.
# NOTE(review): this swaps the default HTTPS context factory for the
# unverified one, so certificate checks are skipped for HTTPS requests that
# use the default context for the rest of the session. Presumably only
# needed so the nltk downloads below succeed on machines without root
# certificates -- confirm, and prefer fixing the certificate store instead.
ssl._create_default_https_context = ssl._create_unverified_context

# Fetch the NLTK data used later: the English stop-word list and the
# 'punkt' tokenizer data used by word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/felixwallis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/felixwallis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Extracting parliamentary procedural terms from the online index of Erskine May


In [2]:
def extract_terms(html_file_path, filename):
    """Collect the index terms found in one saved index page.

    Parameters
    ----------
    html_file_path : str
        Path of the HTML file to parse; it is read as UTF-8 text.
    filename : str
        Label stored alongside every extracted term.

    Returns
    -------
    list of tuple
        One ``(term_text, filename)`` pair per ``<span class="text">``
        element, with surrounding whitespace stripped from the text.
    """
    with open(html_file_path, 'r', encoding='utf-8') as page:
        markup = page.read()
    parsed_page = BeautifulSoup(markup, 'html.parser')

    labelled_terms = []
    for span in parsed_page.find_all('span', class_='text'):
        labelled_terms.append((span.get_text(strip=True), filename))
    return labelled_terms


def extract_terms_from_files(directory):
    """Run ``extract_terms`` over every ``.html`` file in ``directory``.

    Entries are visited in the order ``os.listdir`` returns them and a
    progress line is printed before each file is parsed. Entries whose name
    does not end in ``.html`` are ignored.

    Returns
    -------
    list of tuple
        The concatenated ``(term, filename)`` pairs from all parsed files.
    """
    collected = []
    for entry in os.listdir(directory):
        # Skip anything that is not a saved HTML page.
        if not entry.endswith(".html"):
            continue
        print(f'Extracting terms from {entry}...')
        collected += extract_terms(os.path.join(directory, entry), entry)
    return collected


# Folder that holds the saved index pages.
directory = os.path.join(DATA_PATH, 'erskine-may-index/')
# Flat list of (term, source file) pairs gathered from every page.
index_terms = extract_terms_from_files(directory)
# One row per extracted term, tagged with the file it came from.
index_terms_df = pd.DataFrame(
    data=index_terms,
    columns=['term', 'source_file'],
)

Extracting terms from 23.html...
Extracting terms from 35.html...
Extracting terms from 9.html...
Extracting terms from 19.html...
Extracting terms from 39.html...
Extracting terms from 5.html...
Extracting terms from 15.html...
Extracting terms from 42.html...
Extracting terms from 54.html...
Extracting terms from 43.html...
Extracting terms from 14.html...
Extracting terms from 4.html...
Extracting terms from 38.html...
Extracting terms from 18.html...
Extracting terms from 8.html...
Extracting terms from 34.html...
Extracting terms from 22.html...
Extracting terms from 29.html...
Extracting terms from 3.html...
Extracting terms from 13.html...
Extracting terms from 44.html...
Extracting terms from 52.html...
Extracting terms from 25.html...
Extracting terms from 33.html...
Extracting terms from 48.html...
Extracting terms from 49.html...
Extracting terms from 32.html...
Extracting terms from 24.html...
Extracting terms from 53.html...
Extracting terms from 45.html...
Extracting term

## Turning the procedural terms into a dictionary of stemmed unique unigrams


### Preprocessing function for the procedural terms


In [3]:
# Keep the stop-word set under its own name. Rebinding `stopwords` here
# would replace the imported nltk corpus reader with a plain set, and any
# later `stopwords.words(...)` call would then fail on a fresh
# Restart & Run All.
stop_words = set(stopwords.words('english'))


def clean_tokenize(text):
    """Turn one index term into a list of stemmed, stop-word-free tokens.

    Parameters
    ----------
    text : str or any
        The term to process. Non-string values are converted with ``str``.

    Returns
    -------
    list of str
        Porter stems of the remaining tokens, in their original order. The
        list is empty when nothing survives cleaning.
    """
    # Text should almost always be a string, but we check
    # just in case
    if not isinstance(text, str):
        text = str(text)
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation, numbers, and symbols (characters are deleted, not
    # replaced by spaces, so "a-b" becomes "ab")
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # Stem the tokens
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    return stemmed_tokens

### Tokenizing and stemming the procedural terms


In [4]:
# Stem every index term; each cell of 'cleaned_term' holds a list of stems.
index_terms_df['cleaned_term'] = index_terms_df['term'].apply(clean_tokenize)
# One row per stem. A term that produced no stems (an empty list) explodes
# to a single NaN row.
exploded_index_terms_df = index_terms_df.explode('cleaned_term')
# Drop those NaN rows before de-duplicating, otherwise NaN ends up in the
# dictionary and is saved as a blank "stem".
unique_terms = exploded_index_terms_df['cleaned_term'].dropna().unique()
unique_terms_df = pd.DataFrame(
    unique_terms, columns=['stem']).sort_values(by='stem').reset_index(drop=True)

# Save the raw (not yet manually cleaned) stem list.
unique_terms_df.to_csv(
    DATA_PATH + 'hansard_procedural_stems.csv', index=False)

The dictionary is manually cleaned at this point to create the final [`shortened_hansard_procedural_stems.csv` file](https://docs.google.com/spreadsheets/d/1twVZ_ypcBOLroMDxgbC0veFKvHq7BbT9HbW99zUnNU8/edit?usp=sharing), which is read back in from `dist/` later in this notebook.


In [5]:
# Size of the raw stem table as (rows, columns); it has the single 'stem' column.
unique_terms_df.shape

(751, 1)

## Adding stems that are frequently used by the Speaker of the House of Commons to the dictionary


### Preprocessing functions for Procedural Hansard


In [20]:
# English stop words and one shared stemmer, built once for the whole corpus.
# `stopwords` must still refer to the nltk corpus reader when this runs.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()


def tokenize_and_stem(text):
    """Clean one document and return its stems as a space-separated string.

    Non-string input is converted with ``str``. The text is lower-cased,
    every character other than ``a-z`` and whitespace is deleted, the result
    is tokenised, stop words are removed and the remaining tokens are
    Porter-stemmed.

    Returns
    -------
    str
        The stems joined by single spaces (``''`` when none remain).
    """
    raw = text if isinstance(text, str) else str(text)
    letters_only = re.sub(r'[^a-z\s]', '', raw.lower())
    stems = (
        stemmer.stem(token)
        for token in word_tokenize(letters_only)
        if token not in stop_words
    )
    return ' '.join(stems)


def corpus_preprocessing(corpus: pd.DataFrame,
                         text_column_name: str,
                         year_range: tuple,
                         year_column_name: str = 'year',
                         min_stems: int = 10):
    """Filter a corpus by year, stem its text and drop very short documents.

    Parameters
    ----------
    corpus : pd.DataFrame
        Input documents. The frame passed in is not modified.
    text_column_name : str
        Column holding the raw text of each document.
    year_range : tuple
        ``(first_year, last_year)``; rows whose year lies between the two
        bounds (inclusive) are kept.
    year_column_name : str, default 'year'
        Column holding the year of each document.
    min_stems : int, default 10
        Minimum number of stems a document must contain to be kept.

    Returns
    -------
    pd.DataFrame
        The retained rows with two extra columns: ``cleaned_stems`` (the
        space-separated stems) and ``stem_count`` (how many there are).
    """
    # Remove corpus content from outside the year range
    in_range = corpus[year_column_name].between(
        year_range[0], year_range[1])
    # Copy AFTER filtering: the column assignments below then write to an
    # independent frame instead of a filtered slice (which raises
    # SettingWithCopyWarning), and only the retained rows are duplicated.
    corpus = corpus.loc[in_range].copy()
    # Clean, tokenize, and stem the corpus
    tqdm.pandas(desc="Processing Text")
    corpus['cleaned_stems'] = corpus[text_column_name].progress_apply(
        tokenize_and_stem)
    # Remove any documents with fewer than `min_stems` stems
    corpus['stem_count'] = corpus['cleaned_stems'].apply(
        lambda x: len(x.split()))
    corpus = corpus[corpus['stem_count'] >= min_stems]
    return corpus

### Procedural Hansard preprocessing


**Loading Hansard**


In [21]:
# Read the Hansard export (speeches with MP details) into memory.
hansard_csv_path = os.path.join(HANSARD_PATH, 'hansard_with_mp_details.csv')
hansard = pd.read_csv(hansard_csv_path)

**Creating Procedural Hansard**


In [22]:
# Parse the speech dates once and derive the calendar year from them.
speech_dates = pd.to_datetime(hansard['speech_date'])
hansard['speech_date'] = speech_dates
hansard['year'] = speech_dates.dt.year

# Rows with no 'memberships' value make up the Procedural Hansard; they are
# then filtered to YEAR_RANGE, stemmed and pruned by corpus_preprocessing.
has_no_membership = hansard['memberships'].isna()
procedural_hansard = corpus_preprocessing(
    hansard.loc[has_no_membership], 'text', YEAR_RANGE)

Processing Text: 100%|██████████| 15242/15242 [00:02<00:00, 5694.51it/s] 


### Finding the most frequently used stems in the Procedural Hansard


**Turning the procedural Hansard into a document-feature matrix**


In [23]:
vectorizer = CountVectorizer()

# Count matrix: one row per document, one column per stem.
procedural_hansard_dfm = vectorizer.fit_transform(
    procedural_hansard['cleaned_stems'])

# Column labels for the matrix, in the vectorizer's feature order.
stem_vocabulary = vectorizer.get_feature_names_out()

# NOTE: toarray() materialises the full dense matrix in memory.
dense_counts = procedural_hansard_dfm.toarray()
procedural_hansard_dfm_df = pd.DataFrame(dense_counts, columns=stem_vocabulary)

**Finding the most frequently used stems**


In [24]:
# Total occurrences of each stem across all documents.
stem_totals = procedural_hansard_dfm_df.sum()
# Most frequent stems first.
stem_counts = stem_totals.sort_values(ascending=False)
# Keep the 100 most frequent stems and display them.
top_stems = stem_counts.iloc[:100]
top_stems

hon       3288
member    2214
amend     1933
would     1730
line      1580
          ... 
believ     330
polic      322
place      321
deal       320
like       318
Length: 100, dtype: int64

### Adding the most frequently used stems to the procedural stems dictionary if they have not already been added


In [25]:
# Load the manually shortened dictionary. The DataFrame gets its own name so
# that `hansard_procedural_stems` only ever refers to the set of stems.
shortened_stems_df = pd.read_csv(
    DIST_PATH + 'shortened_hansard_procedural_stems.csv')
# Blank rows are read as NaN; drop them so NaN is never treated as a stem.
hansard_procedural_stems = set(shortened_stems_df['stem'].dropna().tolist())

# Frequent Procedural Hansard stems that the dictionary does not contain yet.
top_stems_set = set(top_stems.index)
missing_stems = top_stems_set.difference(hansard_procedural_stems)

hansard_procedural_stems = hansard_procedural_stems.union(missing_stems)
# Sets have no stable iteration order, so sort before saving to keep the
# output file identical from run to run.
hansard_procedural_stems_df = pd.DataFrame(
    sorted(hansard_procedural_stems, key=str), columns=['stem'])
hansard_procedural_stems_df.to_csv(
    DIST_PATH + 'expanded_hansard_procedural_stems.csv', index=False)