# Dictionary Expansion


This notebook uses GloVe embeddings created by [Rodriguez and Spirling (2022)](https://github.com/prodriguezsosa/EmbeddingsPaperReplication?tab=readme-ov-file) to expand the `shortened_congressional_record_procedural_stems` and `shortened_hansard_procedural_stems` dictionaries.


## Setup


In [10]:
import re
import pandas as pd
import pyreadr
from nltk.stem import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity

# Directory from which the embedding `.rds` files are read later in the notebook.
# Relative path, so it resolves against the kernel's working directory.
DATA_PATH = 'data/'
# Directory holding the procedural-stem dictionary CSVs loaded just below.
DIST_PATH = 'dist/'

# Seed dictionaries to be expanded. Later cells join these frames to the
# embeddings on their `stem` column.
shortened_congressional_record_procedural_stems = pd.read_csv(
    DIST_PATH + 'shortened_congressional_record_procedural_stems.csv')
shortened_hansard_procedural_stems = pd.read_csv(
    DIST_PATH + 'shortened_hansard_procedural_stems.csv')

### Utility function for ensuring that embeddings have been stemmed


In [7]:
# Matches every character that is neither a lowercase ASCII letter nor
# whitespace; `clean_stem` uses it to delete such characters.
regex = re.compile(r'[^a-z\s]')
# Single Porter stemmer instance shared by every call to `clean_stem`.
stemmer = PorterStemmer()


def clean_stem(word):
    """Normalise ``word`` and return its stem.

    Non-string input is first converted with ``str()``. The text is then
    lower-cased, every character other than ``a``-``z`` or whitespace is
    removed via the module-level ``regex``, and the remainder is passed to
    the module-level ``stemmer``.
    """
    text = word if isinstance(word, str) else str(word)
    # Lower-case before filtering: the pattern only keeps lowercase letters,
    # so uppercase characters would otherwise be deleted outright.
    letters_only = regex.sub('', text.lower())
    return stemmer.stem(letters_only)

## Expanding the `shortened_congressional_record_procedural_stems` dictionary


### Loading Rodriguez and Spirling's (2022) Congressional Record embeddings


In [14]:
# Read the Congressional Record embeddings; `read_r` returns a mapping and the
# object stored in an .rds file is looked up under the key `None`.
congressional_record_embeddings = pyreadr.read_r(
    DATA_PATH + 'congressional_record_embeddings_12_300_1.rds'
)[None]

# Relabel every row with the cleaned stem of its original label so the
# vocabulary can be joined against the stemmed dictionary.
# NOTE(review): distinct labels can map to the same stem, so the resulting
# index is not guaranteed to be unique.
congressional_record_embeddings = congressional_record_embeddings.set_axis(
    congressional_record_embeddings.index.map(clean_stem), axis='index'
)

### Calculating the mean embedding for the `shortened_congressional_record_procedural_stems` dictionary


In [17]:
# Attach an embedding row to every dictionary stem that also appears in the
# stemmed embedding index (inner join drops stems with no match). A stem can
# match several embedding rows, so only the first match per stem is kept
# before `stem` becomes the index.
embedded_congressional_record_stems = (
    shortened_congressional_record_procedural_stems
    .merge(congressional_record_embeddings,
           left_on='stem', right_index=True, how='inner')
    .drop_duplicates(subset='stem')
    .set_index('stem')
)

# Average over the embedding dimensions only. Any additional column carried in
# from the dictionary CSV would otherwise end up in the centroid (changing its
# length) or make `.mean()` fail on non-numeric data.
mean_congressional_record_procedural_embedding = (
    embedded_congressional_record_stems[congressional_record_embeddings.columns].mean()
)

### Calculating the cosine similarity between the `shortened_congressional_record_procedural_stems` dictionary and the GloVe embeddings


In [5]:
# Cosine similarity of every embedding row to the dictionary's mean embedding,
# labelled with the stemmed index and ordered from most to least similar.
congressional_record_procedural_cosine_similarities = pd.Series(
    cosine_similarity(
        congressional_record_embeddings,
        [mean_congressional_record_procedural_embedding],
    ).ravel(),
    index=congressional_record_embeddings.index,
).sort_values(ascending=False)

## Expanding the `shortened_hansard_procedural_stems` dictionary


### Loading Rodriguez and Spirling's (2022) Hansard embeddings


In [8]:
# Read the Hansard embeddings; `read_r` returns a mapping and the object
# stored in an .rds file is looked up under the key `None`.
hansard_embeddings = pyreadr.read_r(
    DATA_PATH + 'hansard_embeddings_12_300_1.rds'
)[None]

# Relabel every row with the cleaned stem of its original label so the
# vocabulary can be joined against the stemmed dictionary.
# NOTE(review): distinct labels can map to the same stem, so the resulting
# index is not guaranteed to be unique.
hansard_embeddings = hansard_embeddings.set_axis(
    hansard_embeddings.index.map(clean_stem), axis='index'
)

### Calculating the mean embedding for the `shortened_hansard_procedural_stems` dictionary


In [19]:
# Attach an embedding row to every dictionary stem that also appears in the
# stemmed embedding index (inner join drops stems with no match). A stem can
# match several embedding rows, so only the first match per stem is kept
# before `stem` becomes the index.
embedded_hansard_stems = (
    shortened_hansard_procedural_stems
    .merge(hansard_embeddings,
           left_on='stem', right_index=True, how='inner')
    .drop_duplicates(subset='stem')
    .set_index('stem')
)

# Average over the embedding dimensions only. Any additional column carried in
# from the dictionary CSV would otherwise end up in the centroid (changing its
# length) or make `.mean()` fail on non-numeric data.
mean_hansard_procedural_embedding = (
    embedded_hansard_stems[hansard_embeddings.columns].mean()
)

### Calculating the cosine similarity between the `shortened_hansard_procedural_stems` dictionary and the GloVe embeddings


In [20]:
# Cosine similarity of every embedding row to the dictionary's mean embedding,
# labelled with the stemmed index and ordered from most to least similar.
hansard_procedural_cosine_similarities = pd.Series(
    cosine_similarity(
        hansard_embeddings,
        [mean_hansard_procedural_embedding],
    ).ravel(),
    index=hansard_embeddings.index,
).sort_values(ascending=False)

In [31]:
# Select every similarity score whose index label is exactly 'pollut'.
# The stemmed index can contain the same label more than once, so this may
# return several rows.
filtered_series = hansard_procedural_cosine_similarities.loc[
    lambda scores: scores.index == 'pollut'
]