In [1]:
import json
from pathlib import Path
import re

from gensim.models import Word2Vec
import pandas as pd
import plydata as ply

In [2]:
data_folder = Path("data")
tok = "pandemic"

# Extract the frequencies

In [3]:
frequency_table = pd.read_csv(data_folder/Path("all_tok_frequencies.tsv.xz"), sep="\t")
frequency_table >> ply.slice_rows(5)

Unnamed: 0,tok,word_count,year,frequency
0,\,87283.0,2000.0,0.088504
1,the,38877.0,2000.0,0.039421
2,of,35677.0,2000.0,0.036176
3,",",33131.0,2000.0,0.033594
4,\.,32263.0,2000.0,0.032714


In [4]:
frequency_output_df = (
    frequency_table 
    >> ply.query("tok == @tok")
    >> ply.select("year", "frequency")
    >> ply.call(".astype", {"year": int})
)
frequency_output_df >> ply.slice_rows(5)

Unnamed: 0,year,frequency
13538,2000,4e-06
49880,2001,7e-06
98847,2002,5e-06
144813,2003,1.8e-05
195113,2004,3e-05


In [5]:
frequency_output = frequency_output_df >> ply.call('.to_dict', orient='records')
frequency_output

[{'year': 2000, 'frequency': 4.055943630495423e-06},
 {'year': 2001, 'frequency': 6.786389895065446e-06},
 {'year': 2002, 'frequency': 5.1355077072966025e-06},
 {'year': 2003, 'frequency': 1.778797305335538e-05},
 {'year': 2004, 'frequency': 3.0424223005751603e-05},
 {'year': 2005, 'frequency': 3.4601975511106074e-05},
 {'year': 2006, 'frequency': 5.7931767963692365e-05},
 {'year': 2007, 'frequency': 5.675180686513704e-05},
 {'year': 2008, 'frequency': 3.050017611075992e-05},
 {'year': 2009, 'frequency': 3.5469894145988345e-05},
 {'year': 2010, 'frequency': 5.734690646992077e-05},
 {'year': 2011, 'frequency': 5.58168691459375e-05},
 {'year': 2012, 'frequency': 3.0697905868055746e-05},
 {'year': 2013, 'frequency': 2.7573444518154736e-05},
 {'year': 2014, 'frequency': 1.6162967709091305e-05},
 {'year': 2015, 'frequency': 1.2524156165927838e-05},
 {'year': 2016, 'frequency': 1.0692881251864212e-05},
 {'year': 2017, 'frequency': 9.682144714067952e-06},
 {'year': 2018, 'frequency': 9.101056

# Extract Estimated Cutoff Points

In [6]:
cutoff_points = pd.read_csv(data_folder/Path("cusum_changepoint_abstracts.tsv"), sep="\t")
cutoff_points >> ply.slice_rows(5)

Unnamed: 0,tok,changepoint_idx,start_idx,end_idx,value
0,/17,2011-2012,2010-2011,2011-2012,6.923086
1,/17,2012-2013,2011-2012,2012-2013,-6.761565
2,/19,2010-2011,2009-2010,2010-2011,-5.562387
3,/19b,2019-2020,2018-2019,2019-2020,-10.187657
4,/2017,2019-2020,2018-2019,2019-2020,5.784376


In [7]:
changepoint_output = (
    cutoff_points 
    >> ply.query("tok == @tok")
    >> ply.select("changepoint_idx")
    >> ply.call('.to_dict', orient="records")
)
changepoint_output

[{'changepoint_idx': '2019-2020'}]

# Extract the neighbors

In [8]:
word_models = list((data_folder/Path("word2vec_models")).rglob("*/*model"))
word_models[0:2]

[PosixPath('data/word2vec_models/2003/2003_0.model'),
 PosixPath('data/word2vec_models/2010/2010_0.model')]

In [9]:
word_model_map = dict()
for word_model in word_models:
    match_obj = re.search(r"(\d+)_(\d).model", str(word_model))

    year = int(match_obj.group(1))
    if year not in word_model_map:
        word_model_map[year] = list()

    word_model_map[year].append(str(word_model))

In [10]:
word_model_loaded_map = {
    key: Word2Vec.load(sorted(word_model_map[key])[0]) for key in word_model_map
}

In [11]:
word_freq_count_cutoff = 30

In [12]:
word_model_cutoff_map = {
    key: {
        "model": word_model_loaded_map[key],
        "cutoff_index": min(
            map(
                lambda x: 999999
                if word_model_loaded_map[key].wv.get_vecattr(x[1], "count")
                > word_freq_count_cutoff
                else x[0],
                enumerate(word_model_loaded_map[key].wv.index_to_key),
            )
        ),
    }
    for key in word_model_loaded_map
}

In [13]:
neighbors = 25
word_neighbor_map = dict()
for year in word_model_cutoff_map:
    
    # Check to see if token is in the vocab
    vocab = list(word_model_cutoff_map[year]["model"].wv.key_to_index.keys())
    if tok in vocab:
        
        # If it is grab the neighbors
        # Gensim needs to be > 4.0 as they enabled neighbor clipping (remove words from entire vocab)
        word_neighbors = word_model_cutoff_map[year]["model"].wv.most_similar(
            tok, topn=neighbors, clip_end=word_model_cutoff_map[year]["cutoff_index"]
        )
        
        # Append neighbor to word_neighbor_map
        for neighbor in word_neighbors:
            if year not in word_neighbor_map:
                word_neighbor_map[year] = list()

            word_neighbor_map[year].append(neighbor[0])

In [14]:
word_neighbor_map[2000]

['introduction',
 'pathogen',
 'isolation',
 'genomics',
 'materials',
 'objectives',
 'aims',
 'sets',
 'toxins',
 'numerical',
 'background',
 'escherichia_coli',
 'transmission',
 'saudi',
 'middle',
 'profiles',
 'plant',
 'comparative',
 'commercial',
 'debate',
 'give',
 'theory',
 'discussion',
 'center',
 'framework']

# Final Return Object

This object doesn't contain the umap plot needed for visualization.
On my todolist of things to get done.

In [15]:
api_return_obj = {
    "neighbors": word_neighbor_map,
    "frequency": frequency_output,
    "changepoints":changepoint_output,
}
json.dump(api_return_obj, open("../pandemic_demo.json", "w"))