# Look into changepoints detected by CUSUM

In [1]:
import itertools
from pathlib import Path

from gensim.models import Word2Vec
import numpy as np
import pandas as pd
import plydata as ply
import tqdm

from detecta import detect_cusum

from biovectors_modules.plot_helper import deidentify_concepts, generate_neighbor_table

In [2]:
# Destination folder for neighbor tables; passed as `output_file_folder=` to the
# generate_neighbor_table calls below that also set `save_file=True`.
output_file_folder = Path("output/figure_data_and_figures/neighbor_tables")

In [3]:
# Collect the word2vec models under output/models and order them
# chronologically by the integer prefix of the file stem
# (e.g. "2000_0_fulltext" -> 2000).
#
# The pattern is anchored on "_0_" so that only index-0 models match: the
# looser "*0_fulltext.model" would also accept stems such as
# "2000_10_fulltext". sorted() consumes the rglob generator directly, so no
# intermediate list is needed.
word2vec_model_list = sorted(
    Path("output/models").rglob("*_0_fulltext.model"),
    key=lambda model_path: int(model_path.stem.split("_")[0]),
)
# Quick sanity check: show the first two (earliest) model paths.
print(word2vec_model_list[0:2])

[PosixPath('output/models/2000_model/2000_0_fulltext.model'), PosixPath('output/models/2001_model/2001_0_fulltext.model')]


In [4]:
# Load the concept table (columns used here: concept_id, concept).
concepts_df = pd.read_csv(
    "../concept_mapper_experiment/output/all_concept_ids.tsv.xz", sep="\t"
)

# Lower-cased concept ids, in the same order as the rows of concepts_df.
lower_case_concept_id = [
    concept_id.lower() for concept_id in concepts_df.concept_id.tolist()
]

# Lookup from lower-cased concept id to concept name; if an id repeats, the
# last occurrence wins.
concept_mapper = {
    concept_id: concept_name
    for concept_id, concept_name in zip(
        lower_case_concept_id, concepts_df.concept.tolist()
    )
}

# Preview the first ten rows.
concepts_df >> ply.slice_rows(10)

Unnamed: 0,concept_id,concept
0,mesh_d000001,calcimycin
1,mesh_d000002,temefos
2,mesh_d000003,abattoirs
3,mesh_d000004,abbreviations as topic
4,mesh_d000005,abdomen
5,mesh_d000006,"abdomen, acute"
6,mesh_d000007,abdominal injuries
7,mesh_d000008,abdominal neoplasms
8,mesh_d000009,abdominal muscles
9,mesh_d000010,abducens nerve


In [5]:
# Load the tab-separated changepoint table. Per the rendered output its columns
# are tok, changepoint_idx, start_idx, end_idx and value; it is handed to every
# generate_neighbor_table call below.
changepoints_df = pd.read_csv("output/pubtator_changepoints.tsv", sep="\t")
changepoints_df

Unnamed: 0,tok,changepoint_idx,start_idx,end_idx,value
0,species_2697049,2019-2020,2008-2009,2019-2020,1894.444406
1,gene_43740578,2019-2020,2016-2017,2019-2020,1616.904268
2,lockdown,2019-2020,2012-2013,2019-2020,589.494584
3,species_333278,2012-2013,2011-2012,2012-2013,533.319955
4,typeset,2007-2008,2006-2007,2007-2008,453.520680
...,...,...,...,...,...
44835,algan,2010-2011,2009-2010,2010-2011,8.815154
44836,gene_100314169,2019-2020,2018-2019,2019-2020,8.815005
44837,gene_55915,2020-2021,2014-2015,2020-2021,8.814831
44838,sadrs,2012-2013,2009-2010,2012-2013,8.814770


In [6]:
# Neighbors of the token "reviewer(s" across the yearly models.
# No output_file_folder / save_file arguments are passed in this call
# (presumably nothing is written to disk -- confirm against the helper's defaults).
query = "reviewer(s"
neighbor_thru_time_df = generate_neighbor_table(
    word2vec_model_list,
    query,
    changepoints_df,
    concept_mapper,
    n_neighbors=10,
)
# Transpose for display: in the rendered output each row is a year and each
# column a neighbor rank (0-9).
neighbor_thru_time_df.transpose()

100%|██████████| 22/22 [04:43<00:00, 12.91s/it]


            tok changepoint_idx  start_idx    end_idx     value
105  reviewer(s       2018-2019  2017-2018  2018-2019  65.54713


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
2013,coauthore,favazza,coauthored,unf,jmcr,crommelin,refereed,wrote,reports,naish
2014,uavi,csbj,7483,4150,n=265,mdoc,1328,392582,"auc(0,2.5h",ipdma
2015,biostatistician,statistician,duly,electroencephalographer,proofreader,informatician,5.0.24,electrocardiographer,adjudication,abnm
2016,sundquist,monaci,j.-j,mirjana,y.-c.,marcelo,fursch,svetlana,konstantinos,mehrabi
2017,heiko,irina,zachary,cynthia,kenneth,johanna,kristin,nicholas,christina,vanessa
2018,matthias,maryam,katharina,mohamed,jens,cristina,evgeny,dmitry,tatiana,juliane
2019,communications,thanks,suppiah,michal,friederike,katharina,melanie,yoshihiro,cristian,kerstin
2020,communications,informationnature,thanks,yukiko,koichiro,aurelien,verena,ionut,maziar,wataru
2021,informationnature,communications,thanks,informationcommunication,justyna,kerstin,dorothee,mickael,mihaela,alberto


In [7]:
# Neighbors of the token "medrxiv" across the yearly models; this call also
# passes the output folder together with save_file=True.
query = "medrxiv"
neighbor_thru_time_df = generate_neighbor_table(
    word2vec_model_list, query, changepoints_df, concept_mapper,
    n_neighbors=10,
    output_file_folder=output_file_folder,
    save_file=True,
)
# Transpose for display: in the rendered output each row is a year and each
# column a neighbor rank (0-9).
neighbor_thru_time_df.transpose()

100%|██████████| 22/22 [03:11<00:00,  8.69s/it]


        tok changepoint_idx  start_idx    end_idx      value
89  medrxiv       2019-2020  2018-2019  2019-2020  69.958309


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
2018,preprint,biorxiv,hackathon,ajhp,directorship,tuveson,agbiodata,hhmi,defunct,foldit
2019,informatics,guardant,arxiv,allina,avalere,f1000research,f1000,uconn,elsevi,omada
2020,biorxiv,medrxiv.org,preprint,arxiv,chemrxiv,biorxiv.org,https://www.medrxiv.org/,chinaxiv,researchgate,litcovid
2021,biorxiv,arxiv,preprint,medrxiv.org,litcovid,psyarxiv,chemrxiv,ssrn,biorxiv.org,https://www.medrxiv.org/


In [8]:
# Neighbors of the token "pandemic" across the yearly models; this call also
# passes the output folder together with save_file=True.
query = "pandemic"
neighbor_thru_time_df = generate_neighbor_table(
    word2vec_model_list, query, changepoints_df, concept_mapper,
    n_neighbors=10,
    output_file_folder=output_file_folder,
    save_file=True,
)
# Transpose for display: in the rendered output each row is a year and each
# column a neighbor rank (0-9).
neighbor_thru_time_df.transpose()

100%|██████████| 22/22 [01:52<00:00,  5.11s/it]


           tok changepoint_idx  start_idx    end_idx      value
3697  pandemic       2019-2020  2011-2012  2019-2020  24.110216


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
2000,epidemic,epizootic,influenza b virus (species_11520),h3n2 subtype (species_119210),1918,zoonosis,h1n1 subtype (species_114727),japanese encephalitis virus (species_11072),outbreak,poliomyelitis (mesh_d011051)
2001,epidemic,poliomyelitis (mesh_d011051),variola virus (species_10255),influenza a virus (species_11320),plague,epizootic,h3n2 subtype (species_119210),polio,1918,"encephalitis, arbovirus (mesh_d004671)"
2002,epidemic,variola virus (species_10255),1918,h5n1 subtype (species_102793),epizootic,resurgence,influenza a virus (species_11320),plague,flavivirus,h1n1 subtype (species_114727)
2003,epidemic,influenza b virus (species_11520),h5n1 subtype (species_102793),1918,reemergence,outbreak,variola,bioterrorism,h3n2 subtype (species_119210),influenza a virus (species_11320)
2004,epidemic,1918,variola virus (species_10255),influenza a virus (a/wsn/1933(h1n1)) (species_...,h5n1 subtype (species_102793),reemergence,outbreak,unidentified influenza virus (species_11309),influenza a virus (species_11320),plague
2005,epidemic,flu,variola virus (species_10255),outbreak,unidentified influenza virus (species_11309),bioterrorism,h5n1 subtype (species_102793),1918,influenza a virus (species_11320),influenza
2006,epidemic,outbreak,influenza a virus (species_11320),unidentified influenza virus (species_11309),SARS1 (gene_6301),disaster,influenza,1918,flu,variola virus (species_10255)
2007,epidemic,outbreak,influenza,bioterrorism,influenza a virus (species_11320),h5n1 subtype (species_102793),severe acute respiratory syndrome (mesh_d045169),unidentified influenza virus (species_11309),disaster,flu
2008,epidemic,outbreak,influenza,variola virus (species_10255),disaster,severe acute respiratory syndrome (mesh_d045169),influenza a virus (species_11320),unidentified influenza virus (species_11309),bioterrorism,1918
2009,epidemic,outbreak,influenza,h1n1 subtype (species_114727),influenza a virus (species_11320),unidentified influenza virus (species_11309),severe acute respiratory syndrome (mesh_d045169),disaster,"influenza, human (mesh_d007251)",variola virus (species_10255)


In [9]:
# Neighbors of the token "cas9" across the yearly models; this call also
# passes the output folder together with save_file=True.
query = "cas9"
neighbor_thru_time_df = generate_neighbor_table(
    word2vec_model_list, query, changepoints_df, concept_mapper,
    n_neighbors=10,
    output_file_folder=output_file_folder,
    save_file=True,
)
# Transpose for display: in the rendered output each row is a year and each
# column a neighbor rank (0-9).
neighbor_thru_time_df.transpose()

100%|██████████| 22/22 [01:32<00:00,  4.20s/it]


      tok changepoint_idx  start_idx    end_idx      value
188  cas9       2012-2013  2011-2012  2012-2013  56.171279


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
2011,cas2,crispr3,cas3,cas1,cse3,cas4,csn1,crispr1,csn2,crispr
2012,cas2,crispr1,cas3,cas1,cas10,crispr3,tracrrna,crispr,csn1,crispr4
2013,sgrna,talen,spcas9,zfns,grna,zfn,dcas9,nickase,pcocas9,crispr
2014,talen,sgrna,grna,zfn,dcas9,zfns,crispr,rgen,grnas,sgrnas
2015,talen,sgrna,zfn,zfns,grna,crispr,spcas9,dcas9,cas9n,sgrnas
2016,crispr,talen,sgrna,grna,zfn,cripsr,sacas9,spcas9,zfns,Cnbd2 (gene_70873)
2017,spcas9,crispr,sacas9,talen,sgrna,grna,Cnbd2 (gene_70873),gene_46806597,cas9n,cripsr
2018,crispr,sgrna,grna,talen,gene_46806597,cripsr,spcas9,cas9n,Cnbd2 (gene_70873),zfn
2019,gene_46806597,crispr,cripsr,grna,Cnbd2 (gene_70873),cas9n,sgrna,zfn,talen,sacas9
2020,gene_46806597,crispr,gene_52200466,Cnbd2 (gene_70873),grna,cripsr,zfn,sgrna,cas-9,talen


In [10]:
# Neighbors of the concept token "cellline_cvcl_1698" across the yearly models.
# No output_file_folder / save_file arguments are passed in this call
# (presumably nothing is written to disk -- confirm against the helper's defaults).
# NOTE(review): the changepoint lookup printed for this run is an empty
# DataFrame, i.e. no row for this token was shown from changepoints_df.
query = "cellline_cvcl_1698"
neighbor_thru_time_df = generate_neighbor_table(
    word2vec_model_list,
    query,
    changepoints_df,
    concept_mapper,
    n_neighbors=10,
)
# Transpose for display: in the rendered output each row is a year and each
# column a neighbor rank (0-9).
neighbor_thru_time_df.transpose()

100%|██████████| 22/22 [01:23<00:00,  3.81s/it]


Empty DataFrame
Columns: [tok, changepoint_idx, start_idx, end_idx, value]
Index: []


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
2008,snmg,mglow,mghigh,leite,jo-1,meibomian gland dysfunction (mesh_d000080343),eqigg4,myasthenogenic,achr,myasthenia gravis (mesh_d009157)
2009,myasthenia gravis (mesh_d009157),MUSK (gene_4593),thymomatous,irmd,achr,achr+,evoli,mg/25,u-87,edrophonium (mesh_d004491)
2010,lghmf,tcid50/100,d2159,product_n,240/80,qol15,300/25,u-251,2x1000,135k/25na/50hepes/10
2011,heteromphrale blanca (species_1611588),morrocoyensis,erythrothalassia,levantia,iranotrichia insolita (species_1611591),strigocis vicosensis (species_1541867),domitius baeticus (species_2175142),asiphonipponaphis,glabriusculus,dasyscutum
2012,u251/,u251,LN-229 (cellline_cvcl_0393),uw228,osc-19,u138,a172,skov3,LoVo (cellline_cvcl_0399),MTRNR2L5 (gene_100463289)
2013,IGHG3 (gene_3502),achr,iggs,predsg1,IGG (gene_102658792),MUSK (gene_4593),igg2,Ighv1-62 (gene_668542),sera,-colocalize
2014,Musk (gene_18198),dysferlinopathy (mesh_c537995),meibomian gland dysfunction (mesh_d000080343),LRP4 (gene_4038),"myasthenic syndromes, congenital (mesh_d020294)",muscular dystrophies (mesh_d009136),"polyradiculoneuropathy, chronic inflammatory d...",Dok7 (gene_231134),Chrna2 (gene_170945),"muscular dystrophy, emery-dreifuss (mesh_d020389)"
2015,MUSK (gene_4593),snmg,myasthenia gravis (mesh_d009157),achr,"polyradiculoneuropathy, chronic inflammatory d...",meibomian gland dysfunction (mesh_d000080343),LRP4 (gene_4038),eamg,Musk (gene_18198),RAPSN (gene_5913)
2016,achr,MUSK (gene_4593),LRP4 (gene_4038),Chrna2 (gene_170945),myasthenia gravis (mesh_d009157),meibomian gland dysfunction (mesh_d000080343),Musk (gene_81725),"myasthenic syndromes, congenital (mesh_d020294)",thymus hyperplasia (mesh_d013952),IGHG3 (gene_3502)
2017,achr,MUSK (gene_4593),myasthenia gravis (mesh_d009157),qol15,resistanceplus,iv-150,mir90,wt(3,pogue)(nmnh,wt(2
