# Generate Tables of the Most Changed Tokens

In [1]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
import re

import numpy as np
import pandas as pd
import plotnine as p9
import plydata as ply
import plydata.tidy as ply_tdy
import tqdm

In [2]:
subsetted_tokens = pd.read_csv("output/subsetted_tokens.tsv", sep="\t")
token_filter_list = subsetted_tokens.tok.tolist()
subsetted_tokens.head()

Unnamed: 0,tok
0,united
1,combining
2,purinergic
3,decomposition
4,60.0


In [3]:
distance_files = list(
    Path("output/combined_inter_intra_distances").rglob("saved_*_distance.tsv")
)
print(len(distance_files))

20


In [4]:
year_distance_map = {
    re.search(r"\d+", str(year_file)).group(0): (pd.read_csv(str(year_file), sep="\t"))
    for year_file in tqdm.tqdm(distance_files)
}

100%|██████████| 20/20 [00:02<00:00,  8.48it/s]


In [5]:
full_token_set_df = pd.concat(
    [
        year_distance_map[year] >> ply.query(f"tok in {token_filter_list}")
        # >>ply.query("year_2-year_1 == 1")
        >> ply.query("year_1 == 2000")
        for year in tqdm.tqdm(year_distance_map)
    ]
)
print(full_token_set_df.shape)
full_token_set_df.head()

100%|██████████| 20/20 [00:12<00:00,  1.58it/s]

(298840, 8)





Unnamed: 0,tok,original_global_distance,global_distance_qst,global_times_distance_qst,local_distance_qst,local_times_distance_qst,year_1,year_2
0,united,0.71087,0.737019,0.523925,0.924041,0.302456,2000,2017
1,combining,0.742832,0.86714,0.64414,0.997836,0.599636,2000,2017
2,purinergic,0.710384,0.652537,0.463552,0.930912,0.232051,2000,2017
3,decomposition,0.886063,0.776245,0.687802,0.966696,0.489906,2000,2017
4,60.0,0.783535,0.650033,0.509323,0.79493,0.376786,2000,2017


# Top Ten Words that has the Greatest Rate of Change in X time

## 20 Years

In [9]:
(
    full_token_set_df
    >> ply.query("year_2==2020")
    >> ply.arrange("-global_distance_qst")
    >> ply.slice_rows(10)
)

Unnamed: 0,tok,original_global_distance,global_distance_qst,global_times_distance_qst,local_distance_qst,local_times_distance_qst,year_1,year_2
76308,evaluate,0.516028,0.93568,0.482837,0.998775,0.266206,2000,2020
82811,lead,0.597169,0.931792,0.556437,0.999522,0.46244,2000,2020
85871,provided,0.757996,0.928921,0.704118,0.999163,0.62159,2000,2020
81768,improve,0.499031,0.928311,0.463256,0.999111,0.243595,2000,2020
78573,analysis,0.519404,0.928257,0.482141,0.998019,0.30406,2000,2020
85062,aims,0.822279,0.928231,0.763265,0.99916,0.694036,2000,2020
86740,using,0.388449,0.927692,0.360361,0.997601,0.105174,2000,2020
76097,provides,0.581117,0.925621,0.537894,0.998985,0.365198,2000,2020
88715,used,0.353814,0.924825,0.327216,0.997529,0.085798,2000,2020
74868,assess,0.465595,0.92465,0.430512,0.997826,0.19838,2000,2020


In [10]:
(
    full_token_set_df
    >> ply.query("year_2==2020")
    >> ply.arrange("-original_global_distance")
    >> ply.slice_rows(10)
)

Unnamed: 0,tok,original_global_distance,global_distance_qst,global_times_distance_qst,local_distance_qst,local_times_distance_qst,year_1,year_2
87260,reintroduction,1.172329,0.749579,0.878753,0.954574,1.3706,2000,2020
88498,authentic,1.163812,0.706494,0.822225,0.932284,1.457587,2000,2020
76956,reception,1.152074,0.743327,0.856367,0.974451,1.524101,2000,2020
87286,hsp90,1.129326,0.805121,0.909244,0.981617,1.392923,2000,2020
87033,galectin-1,1.124502,0.748855,0.842089,0.97034,1.219711,2000,2020
79700,modifiable,1.124492,0.783657,0.881217,0.975328,1.381839,2000,2020
87579,cephalosporin,1.118213,0.744903,0.83296,0.95852,1.271859,2000,2020
83615,302,1.108112,0.674808,0.747762,0.871606,1.055087,2000,2020
78811,warranting,1.100526,0.761106,0.837617,0.986673,1.277611,2000,2020
87869,theta,1.098731,0.800358,0.879378,0.976841,1.1602,2000,2020


In [11]:
(
    full_token_set_df
    >> ply.query("year_2==2020")
    >> ply.arrange("-global_times_distance_qst")
    >> ply.slice_rows(10)
)

Unnamed: 0,tok,original_global_distance,global_distance_qst,global_times_distance_qst,local_distance_qst,local_times_distance_qst,year_1,year_2
84148,inspired,1.080367,0.852893,0.921437,0.996255,1.199118,2000,2020
75215,warts,1.09849,0.838215,0.920771,0.996381,1.21582,2000,2020
80787,warrant,1.033638,0.884563,0.914318,0.998616,1.10696,2000,2020
87286,hsp90,1.129326,0.805121,0.909244,0.981617,1.392923,2000,2020
80800,imbalances,1.076003,0.844781,0.908987,0.997302,1.16957,2000,2020
79268,chronological,1.094599,0.819024,0.896503,0.99475,1.21596,2000,2020
80446,polysaccharide,1.062721,0.836282,0.888735,0.994943,1.223538,2000,2020
78588,meanwhile,1.01927,0.871004,0.887789,0.998546,1.119941,2000,2020
87360,complicating,1.063107,0.833475,0.886073,0.99672,1.205275,2000,2020
79700,modifiable,1.124492,0.783657,0.881217,0.975328,1.381839,2000,2020


## 10 Years

In [12]:
(
    full_token_set_df
    >> ply.query("year_2==2010")
    >> ply.arrange("-global_distance_qst")
    >> ply.slice_rows(10)
)

Unnamed: 0,tok,original_global_distance,global_distance_qst,global_times_distance_qst,local_distance_qst,local_times_distance_qst,year_1,year_2
217343,lead,0.590731,0.916682,0.541513,0.999438,0.439696,2000,2010
222199,aim,0.517852,0.913818,0.473223,0.997871,0.237291,2000,2010
209521,regulates,0.519855,0.912533,0.474385,0.998892,0.160646,2000,2010
210840,evaluate,0.469406,0.912293,0.428236,0.998406,0.21204,2000,2010
220403,provided,0.731986,0.903532,0.661373,0.998727,0.513804,2000,2010
212319,insights,0.696888,0.902792,0.629145,0.998179,0.412491,2000,2010
210629,provides,0.545296,0.902554,0.492159,0.99862,0.29946,2000,2010
216300,improve,0.468035,0.898707,0.420627,0.998611,0.18591,2000,2010
220666,overview,0.581666,0.897467,0.522026,0.997338,0.209528,2000,2010
209400,assess,0.446907,0.897432,0.401069,0.997332,0.170172,2000,2010


In [13]:
(
    full_token_set_df
    >> ply.query("year_2==2010")
    >> ply.arrange("-original_global_distance")
    >> ply.slice_rows(10)
)

Unnamed: 0,tok,original_global_distance,global_distance_qst,global_times_distance_qst,local_distance_qst,local_times_distance_qst,year_1,year_2
223030,authentic,1.232631,0.699925,0.862749,0.934346,1.548255,2000,2010
222111,cephalosporin,1.171434,0.751192,0.879972,0.962236,1.410112,2000,2010
211488,reception,1.158616,0.72713,0.842465,0.972742,1.49947,2000,2010
213343,warranting,1.1586,0.738517,0.855646,0.985463,1.40941,2000,2010
218147,302,1.121088,0.649605,0.728264,0.873451,1.082113,2000,2010
213800,chronological,1.117529,0.790808,0.883751,0.99441,1.303684,2000,2010
214232,modifiable,1.115028,0.763488,0.851311,0.975392,1.398058,2000,2010
209747,warts,1.113944,0.835063,0.930213,0.99612,1.245652,2000,2010
216391,neurophysiological,1.104586,0.761018,0.84061,0.981287,1.113876,2000,2010
212113,cardiorespiratory,1.103337,0.724604,0.799483,0.974697,1.003954,2000,2010


In [14]:
(
    full_token_set_df
    >> ply.query("year_2==2010")
    >> ply.arrange("-global_times_distance_qst")
    >> ply.slice_rows(10)
)

Unnamed: 0,tok,original_global_distance,global_distance_qst,global_times_distance_qst,local_distance_qst,local_times_distance_qst,year_1,year_2
209747,warts,1.113944,0.835063,0.930213,0.99612,1.245652,2000,2010
214978,polysaccharide,1.069924,0.844244,0.903277,0.994854,1.123478,2000,2010
216749,snp,1.079064,0.829347,0.894918,0.996658,1.177985,2000,2010
213800,chronological,1.117529,0.790808,0.883751,0.99441,1.303684,2000,2010
221111,engineer,1.009955,0.874552,0.883259,0.998931,0.990645,2000,2010
222111,cephalosporin,1.171434,0.751192,0.879972,0.962236,1.410112,2000,2010
218680,inspired,1.047393,0.836533,0.876179,0.99506,1.055819,2000,2010
219904,intrinsically,1.099165,0.793209,0.871868,0.993911,1.272971,2000,2010
220463,1q,1.064109,0.818579,0.871057,0.996815,1.025731,2000,2010
215085,urinary_tract_infections,1.092022,0.79138,0.864204,0.978088,1.197908,2000,2010


## 5 Years

In [15]:
(
    full_token_set_df
    >> ply.query("year_2==2005")
    >> ply.arrange("-global_distance_qst")
    >> ply.slice_rows(10)
)

Unnamed: 0,tok,original_global_distance,global_distance_qst,global_times_distance_qst,local_distance_qst,local_times_distance_qst,year_1,year_2
269950,database,0.441206,0.920589,0.406169,0.998532,0.113409,2000,2005
271943,alcoholism,0.807027,0.916653,0.739764,0.998662,0.617099,2000,2005
278407,available,0.454164,0.908269,0.412503,0.997425,0.106403,2000,2005
271988,genes,0.3351,0.904822,0.303206,0.992618,0.033187,2000,2005
274706,2000,0.732539,0.904467,0.662558,0.996325,0.358943,2000,2005
281932,isolates,0.374527,0.903044,0.338215,0.996259,0.101537,2000,2005
277197,plant,0.54453,0.902609,0.491498,0.997452,0.204842,2000,2005
271860,sets,0.560821,0.901533,0.505599,0.997879,0.206959,2000,2005
282372,accuracy,0.423753,0.901511,0.382018,0.997362,0.129582,2000,2005
269117,quality,0.561085,0.900217,0.505098,0.994892,0.119734,2000,2005


In [16]:
(
    full_token_set_df
    >> ply.query("year_2==2005")
    >> ply.arrange("-original_global_distance")
    >> ply.slice_rows(10)
)

Unnamed: 0,tok,original_global_distance,global_distance_qst,global_times_distance_qst,local_distance_qst,local_times_distance_qst,year_1,year_2
270534,rapidity,1.390827,0.832558,1.157944,0.993521,1.862461,2000,2005
271403,multisite,1.384575,0.797236,1.103833,0.984556,1.835448,2000,2005
273599,alleviation,1.371685,0.822157,1.12774,0.991879,1.846013,2000,2005
280086,crucially,1.365927,0.7974,1.089189,0.981266,1.698202,2000,2005
274717,temporarily,1.356807,0.785704,1.066049,0.979975,1.689101,2000,2005
277727,medicated,1.335899,0.758091,1.012733,0.960089,1.718998,2000,2005
276990,cam,1.327111,0.813342,1.079396,0.986237,1.673309,2000,2005
280192,portray,1.30974,0.783175,1.025755,0.983585,1.671946,2000,2005
270466,nn,1.293741,0.777745,1.0062,0.97131,1.590031,2000,2005
283004,escherichia_coli_k-12,1.284659,0.777089,0.998294,0.98662,1.749542,2000,2005


In [17]:
(
    full_token_set_df
    >> ply.query("year_2==2005")
    >> ply.arrange("-global_times_distance_qst")
    >> ply.slice_rows(10)
)

Unnamed: 0,tok,original_global_distance,global_distance_qst,global_times_distance_qst,local_distance_qst,local_times_distance_qst,year_1,year_2
270534,rapidity,1.390827,0.832558,1.157944,0.993521,1.862461,2000,2005
273599,alleviation,1.371685,0.822157,1.12774,0.991879,1.846013,2000,2005
271403,multisite,1.384575,0.797236,1.103833,0.984556,1.835448,2000,2005
280086,crucially,1.365927,0.7974,1.089189,0.981266,1.698202,2000,2005
276990,cam,1.327111,0.813342,1.079396,0.986237,1.673309,2000,2005
274717,temporarily,1.356807,0.785704,1.066049,0.979975,1.689101,2000,2005
279349,tremor,1.270109,0.808374,1.026723,0.9882,1.634273,2000,2005
280192,portray,1.30974,0.783175,1.025755,0.983585,1.671946,2000,2005
276541,snp,1.189061,0.859406,1.021886,0.997166,1.391027,2000,2005
277727,medicated,1.335899,0.758091,1.012733,0.960089,1.718998,2000,2005
