# Look into changepoints detected by CUSUM

In [1]:
import itertools
from pathlib import Path

from gensim.models import Word2Vec
import numpy as np
import pandas as pd
import plydata as ply
import tqdm

from detecta import detect_cusum

from biovectors_modules.plot_helper import deidentify_concepts, generate_neighbor_table

In [2]:
output_file_folder = Path("output/figure_data_and_figures/neighbor_tables")

In [3]:
word2vec_model_list = sorted(
    list(Path("output/models").rglob("*0_fulltext.model")),
    key=lambda x: int(x.stem.split("_")[0]),
)
print(word2vec_model_list[0:2])

[PosixPath('output/models/2000_model/2000_0_fulltext.model'), PosixPath('output/models/2001_model/2001_0_fulltext.model')]


In [4]:
concepts_df = pd.read_csv(
    "../concept_mapper_experiment/output/all_concept_ids.tsv.xz", sep="\t"
)
lower_case_concept_id = list(map(lambda x: x.lower(), concepts_df.concept_id.tolist()))
concept_mapper = dict(zip(lower_case_concept_id, concepts_df.concept.tolist()))
concepts_df >> ply.slice_rows(10)

Unnamed: 0,concept_id,concept
0,mesh_d000001,calcimycin
1,mesh_d000002,temefos
2,mesh_d000003,abattoirs
3,mesh_d000004,abbreviations as topic
4,mesh_d000005,abdomen
5,mesh_d000006,"abdomen, acute"
6,mesh_d000007,abdominal injuries
7,mesh_d000008,abdominal neoplasms
8,mesh_d000009,abdominal muscles
9,mesh_d000010,abducens nerve


In [5]:
changepoints_df = pd.read_csv("output/pubtator_updated_changepoints.tsv", sep="\t")
changepoints_df

Unnamed: 0,tok,changepoint_idx,start_idx,end_idx,value
0,species_2697049,2019-2020,2008-2009,2019-2020,1894.444406
1,gene_43740578,2019-2020,2016-2017,2019-2020,1616.904268
2,lockdown,2019-2020,2012-2013,2019-2020,589.494584
3,species_333278,2012-2013,2011-2012,2012-2013,533.319955
4,typeset,2007-2008,2006-2007,2007-2008,453.520680
...,...,...,...,...,...
44835,algan,2010-2011,2009-2010,2010-2011,8.815154
44836,gene_100314169,2019-2020,2018-2019,2019-2020,8.815005
44837,gene_55915,2020-2021,2014-2015,2020-2021,8.814831
44838,sadrs,2012-2013,2009-2010,2012-2013,8.814770


In [6]:
query = "reviewer(s"
neighbor_thru_time_df = generate_neighbor_table(
    word2vec_model_list, query, changepoints_df, concept_mapper, n_neighbors=10
)
neighbor_thru_time_df.T

100%|██████████| 22/22 [04:55<00:00, 13.45s/it]

            tok changepoint_idx  start_idx    end_idx     value
105  reviewer(s       2018-2019  2017-2018  2018-2019  65.54713





Unnamed: 0,0,1,2,3,4,5,6,7,8,9
2013,coauthore,favazza,coauthored,unf,jmcr,crommelin,refereed,wrote,reports,naish
2014,uavi,csbj,7483,4150,n=265,mdoc,1328,392582,"auc(0,2.5h",ipdma
2015,biostatistician,statistician,duly,electroencephalographer,proofreader,informatician,5.0.24,electrocardiographer,adjudication,abnm
2016,sundquist,monaci,j.-j,mirjana,y.-c.,marcelo,fursch,svetlana,konstantinos,mehrabi
2017,heiko,irina,zachary,cynthia,kenneth,johanna,kristin,nicholas,christina,vanessa
2018,matthias,maryam,katharina,mohamed,jens,cristina,evgeny,dmitry,tatiana,juliane
2019,communications,thanks,suppiah,michal,friederike,katharina,melanie,yoshihiro,cristian,kerstin
2020,communications,informationnature,thanks,yukiko,koichiro,aurelien,verena,ionut,maziar,wataru
2021,informationnature,communications,thanks,informationcommunication,justyna,kerstin,dorothee,mickael,mihaela,alberto


In [7]:
query = "medrxiv"
neighbor_thru_time_df = generate_neighbor_table(
    word2vec_model_list,
    query,
    changepoints_df,
    concept_mapper,
    n_neighbors=10,
    save_file=True,
)
neighbor_thru_time_df.T

100%|██████████| 22/22 [05:30<00:00, 15.01s/it]


        tok changepoint_idx  start_idx    end_idx      value
89  medrxiv       2019-2020  2018-2019  2019-2020  69.958309


NameError: name 'output_file_folder' is not defined

In [None]:
query = "pandemic"
neighbor_thru_time_df = generate_neighbor_table(
    word2vec_model_list,
    query,
    changepoints_df,
    concept_mapper,
    n_neighbors=10,
    save_file=True,
)
neighbor_thru_time_df.T

In [None]:
query = "cas9"
neighbor_thru_time_df = generate_neighbor_table(
    word2vec_model_list,
    query,
    changepoints_df,
    concept_mapper,
    n_neighbors=10,
    save_file=True,
)
neighbor_thru_time_df.T

In [None]:
query = "cellline_cvcl_1698"
neighbor_thru_time_df = generate_neighbor_table(
    word2vec_model_list, query, changepoints_df, concept_mapper, n_neighbors=10
)
neighbor_thru_time_df.T