# Look into changepoints detected by CUSUM

In [1]:
import itertools
from pathlib import Path

import numpy as np
import pandas as pd
import plydata as ply
import tqdm

from detecta import detect_cusum

from biovectors_modules.plot_helper import deidentify_concepts, generate_neighbor_table

In [2]:
# Relative folder path kept for this notebook's outputs. No cell visible in
# this part of the notebook reads it.
# NOTE(review): presumably the destination for saved neighbor tables - confirm
# against any later cells that write files.
output_file_folder = Path("output/figure_data_and_figures/neighbor_tables")

In [3]:
# Gather the saved word2vec model files and order them by the integer that
# precedes the first underscore in the file stem (the year, per the printed
# paths such as "2015_0_fulltext.model").
# NOTE(review): the pattern "*0_fulltext.model" matches any name ending in
# "0_fulltext.model", so an index such as "_10_" would be picked up as well -
# confirm only one model per year is intended to match.
word2vec_model_list = list(Path("output/trained_models").rglob("*0_fulltext.model"))
word2vec_model_list.sort(key=lambda model_path: int(model_path.stem.split("_")[0]))

# Quick sanity check: show the first two model paths.
print(word2vec_model_list[:2])

[PosixPath('output/trained_models/2015_model/2015_0_fulltext.model'), PosixPath('output/trained_models/2016_model/2016_0_fulltext.model')]


In [4]:
# Load the table of concept identifiers and their readable names
# (tab-separated; columns `concept_id` and `concept` are used below).
concepts_df = pd.read_csv(
    "../concept_mapper_experiment/output/all_concept_ids.tsv.xz",
    sep="\t",
)

# Lower-case every concept id before using it as a lookup key.
lower_case_concept_id = [
    concept_id.lower() for concept_id in concepts_df.concept_id.tolist()
]

# Map lower-cased concept id -> concept name; if an id repeats, the last
# occurrence wins.
concept_mapper = {
    concept_id: concept_name
    for concept_id, concept_name in zip(
        lower_case_concept_id, concepts_df.concept.tolist()
    )
}

# Preview the first ten rows of the concept table.
concepts_df >> ply.slice_rows(10)

Unnamed: 0,concept_id,concept
0,mesh_d000001,calcimycin
1,mesh_d000002,temefos
2,mesh_d000003,abattoirs
3,mesh_d000004,abbreviations as topic
4,mesh_d000005,abdomen
5,mesh_d000006,"abdomen, acute"
6,mesh_d000007,abdominal injuries
7,mesh_d000008,abdominal neoplasms
8,mesh_d000009,abdominal muscles
9,mesh_d000010,abducens nerve


In [5]:
# Load the changepoint table (tab-separated). The displayed columns are
# tok, changepoint_idx, start_idx, end_idx and value.
changepoints_df = pd.read_table("output/biorxiv_changepoints.tsv")
changepoints_df

Unnamed: 0,tok,changepoint_idx,start_idx,end_idx,value
0,lockdown,2019-2020,2018-2019,2019-2020,1571.734315
1,distancing,2019-2020,2018-2019,2019-2020,951.736126
2,disease_mesh_d045169,2019-2020,2017-2018,2019-2020,341.784167
3,species_227859,2019-2020,2017-2018,2019-2020,281.494336
4,hcq,2019-2020,2018-2019,2019-2020,225.013638
...,...,...,...,...,...
2267,rap2,2020-2021,2019-2020,2020-2021,15.172122
2268,gene_17874,2018-2019,2017-2018,2018-2019,15.169693
2269,ss14,2020-2021,2019-2020,2020-2021,15.163931
2270,clan,2018-2019,2017-2018,2018-2019,15.163350


In [6]:
# Neighbor table for "hydroxychloroquine", requesting 10 neighbors per model.
# NOTE(review): generate_neighbor_table is imported from
# biovectors_modules.plot_helper and its internals are not visible here; the
# recorded output suggests it also prints the token's changepoint row.
query = "hydroxychloroquine"
neighbor_thru_time_df = generate_neighbor_table(
    word2vec_model_list,
    query,
    changepoints_df,
    concept_mapper,
    n_neighbors=10,
)

# Transposed for display (one row per year in the recorded output).
neighbor_thru_time_df.transpose()

100%|██████████| 8/8 [00:10<00:00,  1.30s/it]

                  tok changepoint_idx  start_idx    end_idx       value
9  hydroxychloroquine       2019-2020  2018-2019  2019-2020  160.900513





Unnamed: 0,0,1,2,3,4,5,6,7,8,9
2018,pharmaceutical,qn,gemcitabine,hcq,vatg-027,anticancer,aminoquinoline,olanzapine,substance,atorvastatin
2019,hcq,ritonavir,emtricitabine,mofetil,mycophenolate,immunosuppressant,lopinavir,everolimus,azathioprine,prednisone
2020,hcq,lopinavir,ritonavir,chloroquine,azithromycin,arbidol,favipiravir,remdesivir,tcz,corticosteroid
2021,lopinavir,ritonavir,azithromycin,ribavirin,remdesivir,hcq,baricitinib,favipiravir,corticosteroid,fluvoxamine
2022,lopinavir,ritonavir,azithromycin,favipiravir,tocilizumab,chloroquine,remdesivir,mofetil,corticosteroid,sirolimus


In [7]:
# Neighbor table for "gedi", requesting 10 neighbors per model.
# NOTE(review): generate_neighbor_table is imported from
# biovectors_modules.plot_helper and its internals are not visible here; the
# recorded output suggests it also prints the token's changepoint row.
query = "gedi"
neighbor_thru_time_df = generate_neighbor_table(
    word2vec_model_list,
    query,
    changepoints_df,
    concept_mapper,
    n_neighbors=10,
)

# Transposed for display (one row per year in the recorded output).
neighbor_thru_time_df.transpose()

100%|██████████| 8/8 [00:09<00:00,  1.21s/it]

     tok changepoint_idx  start_idx    end_idx       value
13  gedi       2020-2021  2019-2020  2020-2021  123.188439





Unnamed: 0,0,1,2,3,4,5,6,7,8,9
2017,sang,sanger,ceph,hgr1,asxl1,ellopostomidae,glossina,macrogen,r140q,wes
2019,ssea-5,siq,vlsdi,no.33,rmi,-on,micrografte,fluidic,microfluidics,bgiseq500
2020,ogi1819_3159,ambry,deciphering,invitae,zam,egdp,gahg,barcodesnp,icog,ungenotyped
2021,rgedi,ddaf,ddaa,superhuman,entireaxon,neuritenet,ekar,gc150,gradcam,sr4vh
2022,phenotype-3,phenotype-2,holk,0.05/10,∗∗,pkup,0.0177,-way,-p<0.01,∗∗p


In [8]:
# Neighbor table for "postdoc", requesting 10 neighbors per model.
# NOTE(review): generate_neighbor_table is imported from
# biovectors_modules.plot_helper and its internals are not visible here; the
# recorded output suggests it also prints the token's changepoint row.
query = "postdoc"
neighbor_thru_time_df = generate_neighbor_table(
    word2vec_model_list,
    query,
    changepoints_df,
    concept_mapper,
    n_neighbors=10,
)

# Transposed for display (one row per year in the recorded output).
neighbor_thru_time_df.transpose()

100%|██████████| 8/8 [00:09<00:00,  1.22s/it]

        tok changepoint_idx  start_idx    end_idx      value
70  postdoc       2016-2017  2015-2016  2016-2017  55.010458





Unnamed: 0,0,1,2,3,4,5,6,7,8,9
2015,graduate,journal,career,reviewer,publishing,arxiv,postdoctoral,investigator,scientist,trainee
2016,award,grant,preprint,college,graduate,faculty,professor,your,you,november
2017,institution,gss,science,1980,government,professional,trustworthiness,academy,journal,publishing
2018,mentor,postdoctoral,staff,academic,mentorship,trainee,graduate,faculty,funding,professional
2019,graduate,mentor,fellowship,trainee,professor,award,alumnus,applicant,faculty,lecturer
2020,postdoctoral,stipend,scholarship,fellowship,wage,stufe,phd,graduation,trainee,christian
2021,career,trainee,alumnus,postdoctoral,academic,respondent,doctoral,applicant,faculty,employer
2022,achievement,academic,faculty,academia,mistreatment,assistant,ophthalmology,graduate,career,campus


In [9]:
# Neighbor table for "mir-130a", requesting 10 neighbors per model.
# NOTE(review): generate_neighbor_table is imported from
# biovectors_modules.plot_helper and its internals are not visible here; the
# recorded output suggests it also prints the token's changepoint row.
query = "mir-130a"
neighbor_thru_time_df = generate_neighbor_table(
    word2vec_model_list,
    query,
    changepoints_df,
    concept_mapper,
    n_neighbors=10,
)

# Transposed for display (one row per year in the recorded output).
neighbor_thru_time_df.transpose()

100%|██████████| 8/8 [00:09<00:00,  1.18s/it]

         tok changepoint_idx  start_idx    end_idx      value
88  mir-130a       2020-2021  2019-2020  2020-2021  51.811613





Unnamed: 0,0,1,2,3,4,5,6,7,8,9
2019,mir-33,mir-10b,Mir497 (gene_751537),soat1,elavl1,as1,mir-27b,gadd45b,mir-,foxo3a
2020,cruzain,mz31,DHODH (gene_1723),mmu-,Mir762 (gene_791073),gk921,mir-762,SLC5A2 (gene_6524),multikinase,PDE4A (gene_5141)
2021,tbl1xr1,t(8;21,-130a,130a,aml1,cbfβ,eto,runx1,hottip,molm13


In [10]:
# Neighbor table for "ecuador", requesting 10 neighbors per model.
# NOTE(review): generate_neighbor_table is imported from
# biovectors_modules.plot_helper and its internals are not visible here; the
# recorded output suggests it also prints the token's changepoint row.
query = "ecuador"
neighbor_thru_time_df = generate_neighbor_table(
    word2vec_model_list,
    query,
    changepoints_df,
    concept_mapper,
    n_neighbors=10,
)

# Transposed for display (one row per year in the recorded output).
neighbor_thru_time_df.transpose()

100%|██████████| 8/8 [00:09<00:00,  1.22s/it]

         tok changepoint_idx  start_idx    end_idx      value
141  ecuador       2016-2017  2015-2016  2016-2017  44.907687





Unnamed: 0,0,1,2,3,4,5,6,7,8,9
2015,peru,bolivia,peninsula,iran,southwest,denmark,china,brazil,sweden,province
2016,rica,costa,goniomma,toscani,deromyrma,norway,p.p,oxyopomyrmex,panama,southwest
2017,brazil,peru,colombia,province,china,machala,brazilian,america,city,latin
2018,colombia,peru,mexico,venezuela,argentina,chile,southeastern,northeast,bolivia,northwestern
2019,peru,argentina,colombia,kenya,senegal,panama,thailand,province,greece,venezuela
2020,peru,colombia,argentina,bolivia,venezuela,chile,uruguay,panama,northeastern,brazil
2021,colombia,peru,venezuela,argentina,egypt,gabon,cameroon,paraguay,uruguay,tunisia
2022,colombia,peru,venezuela,ghana,malaysia,kenya,greece,guatemala,panama,indonesia
