# Dictionary Expansion


This notebook uses GloVe embeddings created by [Rodriguez and Spirling (2022)](https://github.com/prodriguezsosa/EmbeddingsPaperReplication?tab=readme-ov-file) to expand the `shortened_congressional_record_procedural_stems` and `shortened_hansard_procedural_stems` dictionaries.


## Setup


In [32]:
import re
import pandas as pd
import pyreadr
from nltk.stem import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity

# Folder prefixes used throughout the notebook: the embeddings file is read
# from DATA_PATH, the stem dictionaries from DIST_PATH.
DATA_PATH = 'data/'
DIST_PATH = 'dist/'

# Seed dictionary of procedural stems that the notebook sets out to expand.
cr_stems_csv = DIST_PATH + 'shortened_congressional_record_procedural_stems.csv'
shortened_congressional_record_procedural_stems = pd.read_csv(cr_stems_csv)

## Expanding the `shortened_congressional_record_procedural_stems` dictionary


### Loading Rodriguez and Spirling's (2022) Congressional Record embeddings


In [26]:
# read_r hands back a dict-like result keyed by object name; the embeddings
# frame is looked up under the ``None`` key (the key pyreadr documents for
# the single unnamed object in an .rds file).
cr_embeddings_rds = pyreadr.read_r(DATA_PATH + 'cr_embeddings_12_300_1.rds')
cr_embeddings = cr_embeddings_rds[None]

### Ensuring that the embeddings have been tokenized and stemmed


In [27]:
# Matches any character that is neither a lowercase ASCII letter nor whitespace.
regex = re.compile(r'[^a-z\s]')
stemmer = PorterStemmer()


def clean_stem(word):
    """Normalise a token and return its stem.

    The input is coerced to ``str`` when it is not already one, lower-cased,
    stripped of every character other than ``a``-``z`` and whitespace, and
    finally passed through the module-level ``stemmer``.
    """
    token = word if isinstance(word, str) else str(word)
    letters_only = regex.sub('', token.lower())
    return stemmer.stem(letters_only)


# Relabel every embedding row with the cleaned, stemmed form of its token.
# This overwrites the index in place, so re-running the cell applies
# clean_stem again to labels that have already been cleaned.
stemmed_cr_vocabulary = cr_embeddings.index.map(clean_stem)
cr_embeddings.index = stemmed_cr_vocabulary

### Calculating the mean embedding for the `shortened_congressional_record_procedural_stems` dictionary


In [36]:
# Pair each dictionary stem with the embedding rows whose (stemmed) index
# label matches it; the inner join drops stems that have no embedding.
cr_stem_matches = shortened_congressional_record_procedural_stems.merge(
    cr_embeddings,
    how='inner',
    left_on='stem',
    right_index=True,
)

# A stem can match more than one embedding row; keep only the first match
# per stem and use the stem itself as the row label.
embedded_cr_stems = (
    cr_stem_matches
    .drop_duplicates(subset='stem')
    .set_index('stem')
)

# Swap the column labels for plain positional integers.
embedded_cr_stems.columns = range(embedded_cr_stems.shape[1])

# Column-wise average over all matched stems.
# NOTE(review): this has the embedding dimensionality only if the stems CSV
# contributes no columns other than 'stem' -- confirm against the CSV.
mean_cr_procedural_embedding = embedded_cr_stems.mean()

### Calculating the cosine similarity between the mean `shortened_congressional_record_procedural_stems` embedding and each GloVe embedding


In [37]:
# Compare every embedding row against the single mean dictionary vector;
# the result has one column, which is flattened to 1-D below.
cr_similarity_to_mean = cosine_similarity(
    cr_embeddings, [mean_cr_procedural_embedding])

# Label each score with its embedding's index entry and rank from most to
# least similar.
cr_procedural_cosine_similarities = pd.Series(
    cr_similarity_to_mean.flatten(),
    index=cr_embeddings.index,
).sort_values(ascending=False)