In [1]:
import pandas as pd

In [2]:
# Original GISAID metadata file (downloaded by David Chen): 24252 samples, not including
# the reference strain & 4 error rows found at index labels [0,47,48,50,51].
bad_metadata = pd.read_csv(
    '/Users/dolteanu/local_documents/Coding/Ontario covid data/ontario_metadata.tsv',
    sep='\t',
)
# Strip the "hCoV-19/" prefix from the strain names so they match the thesis data,
# after discarding the reference-strain / error rows listed above.
strain_fixed = (
    bad_metadata
    .drop(labels=[0, 47, 48, 50, 51], axis='index')
    .loc[:, 'strain']
    .str.removeprefix("hCoV-19/")
)
metadata = (
    bad_metadata
    .drop(labels=[0, 47, 48, 50, 51], axis='index')
    .assign(strain=strain_fixed)
)

In [3]:
# De-duplicate as done for the thesis dataset: rows repeating an already-seen value in the
# 'strain' column are dropped (first occurrence kept), 24249 remaining
metadata = metadata.drop_duplicates(subset=['strain'])


In [5]:
# Strain IDs of the Ontario covid dataset used in the JMIR publication.
# The csv has no header row; only its first column is kept, as a Series.
david_24244 = pd.read_csv(
    '/Users/dolteanu/local_documents/Coding/Ontario covid data/GISAID_Ontario_ID.csv',
    header=None,
).iloc[:, 0]

In [6]:
# Ontario covid dataset used in the thesis.
# The csv has no header row, so the two columns are named here; the result is kept as a
# DataFrame (not reduced to its first column), and 'strain' is selected where needed.
daniel_24249 = pd.read_csv(
    '/Users/dolteanu/local_documents/Coding/Ontario covid data/Ontario_covid/gisaid_clade_metadata.csv',
    names=['strain', 'GISAID_clade'],
    header=None,
)

In [8]:
# Thesis Ontario covid dataset after removing Gisaid clades having < 10 samples,
# i.e. Clade L (7 samples). No header row; only the first column is kept, as a Series.
daniel_cladeL = pd.read_csv(
    '/Users/dolteanu/local_documents/Coding/Ontario covid data/Ontario_covid/gisaid_filtered<10.csv',
    header=None,
).iloc[:, 0]

In [9]:
# Strains used in the thesis but absent from the JMIR paper, shown as their EPI_ISL IDs
not_JMIR_set = set(daniel_24249['strain']) - set(david_24244)
metadata.loc[
    metadata['strain'].isin(not_JMIR_set),
    ['gisaid_epi_isl'],
]

Unnamed: 0,gisaid_epi_isl
71,EPI_ISL_538336
72,EPI_ISL_538338
73,EPI_ISL_538339
1086,EPI_ISL_755880
1665,EPI_ISL_933618


In [10]:
# Sanity check: every JMIR publication sample is also in the thesis dataset
# (an empty set means none are missing)
set(david_24244) - set(daniel_24249['strain'])

set()

In [11]:
# Samples that the gisaid clade <10 samples filter removed from the thesis Ontario covid
# dataset (n=7); these are present in the JMIR paper. Shown as their EPI_ISL IDs.
cladeL_set = set(david_24244) - set(daniel_cladeL)
metadata.loc[
    metadata['strain'].isin(cladeL_set),
    ['gisaid_epi_isl'],
]

Unnamed: 0,gisaid_epi_isl
6,EPI_ISL_418328
36,EPI_ISL_418379
507,EPI_ISL_591219
646,EPI_ISL_413014
647,EPI_ISL_413015
1027,EPI_ISL_418327
1652,EPI_ISL_933605


In [7]:
# Confirm the 'metadata' set from David Chen and the thesis dataset hold exactly the same strains.
# The previous check (thesis minus metadata) could only reveal thesis strains missing from
# 'metadata'; it could not show strains of 'metadata' lost in thesis filtering.
# The symmetric difference lists strains found in only one of the two sets, so an empty
# result confirms both directions at once.
set(metadata['strain']).symmetric_difference(daniel_24249['strain'])

set()

In [22]:
# Restrict the Ontario covid metadata to the strains kept after the gisaid clade filter
# (clades > 10 samples) and take their EPI_ISL IDs for csv export
filtered = metadata.loc[metadata['strain'].isin(daniel_cladeL)]
ontario_list = filtered.loc[:, 'gisaid_epi_isl']

In [None]:
# The cells below generate the EPI_SET csv files for the datasets used; run the cell for the dataset you need (they can be run sequentially, but the csv name must be changed in the last cell, which is then re-run)

In [21]:
# Export the EPI_ISL_IDs of the Ontario covid dataset (index column omitted)
metadata.loc[:, 'gisaid_epi_isl'].to_csv(
    path_or_buf='./Ontario_covid_EPI.csv',
    index=False,
)

In [14]:
# Load the Nextstrain dataset metadata and export its EPI_ISL_IDs (index column omitted)
test_metadata = pd.read_csv(
    '/Users/dolteanu/local_documents/Coding/Gisaid data 01:11:22/hcov_global_2022-01-09_23-30/hcov_global.tsv',
    sep='\t',
)
nextstrain_metadata = test_metadata.loc[:, 'gisaid_epi_isl']
nextstrain_metadata.to_csv(path_or_buf='./Nextstrain_EPI.csv', index=False)

In [15]:
# Keep only the Nextstrain-dataset samples whose gisaid clade has > 20 samples, for csv export
# `gisaid` flags each clade by whether its sample count exceeds 20
gisaid = test_metadata['GISAID_clade'].value_counts().gt(20)
# Names of the clades that pass the threshold
Clades_to_pick = gisaid[gisaid].index
nextstrain_gisaid_filtered = test_metadata.loc[
    test_metadata['GISAID_clade'].isin(Clades_to_pick),
    ['gisaid_epi_isl'],
]

In [18]:
# Samples dropped from the thesis Nextstrain dataset by the gisaid clade filter, i.e. clades
# with 20 or fewer samples (n=36)(Clades GV, L, V); these are present in the JMIR paper
set(nextstrain_metadata) - set(nextstrain_gisaid_filtered['gisaid_epi_isl'])

{'EPI_ISL_1073625',
 'EPI_ISL_1199027',
 'EPI_ISL_1714048',
 'EPI_ISL_1714696',
 'EPI_ISL_2232189',
 'EPI_ISL_3366469',
 'EPI_ISL_402125',
 'EPI_ISL_403930',
 'EPI_ISL_406798',
 'EPI_ISL_410301',
 'EPI_ISL_419558',
 'EPI_ISL_426629',
 'EPI_ISL_434534',
 'EPI_ISL_451958',
 'EPI_ISL_456187',
 'EPI_ISL_456201',
 'EPI_ISL_4771900',
 'EPI_ISL_482679',
 'EPI_ISL_512092',
 'EPI_ISL_541018',
 'EPI_ISL_547451',
 'EPI_ISL_574607',
 'EPI_ISL_579489',
 'EPI_ISL_582019',
 'EPI_ISL_582617',
 'EPI_ISL_621282',
 'EPI_ISL_636492',
 'EPI_ISL_649124',
 'EPI_ISL_678479',
 'EPI_ISL_718143',
 'EPI_ISL_744521',
 'EPI_ISL_766041',
 'EPI_ISL_794614',
 'EPI_ISL_802543',
 'EPI_ISL_827780',
 'EPI_ISL_830256'}

In [19]:
# Nextstrain dataset only: keep the samples whose nextstrain clade has > 20 samples
# `nextstrain` flags each clade by whether its sample count exceeds 20
nextstrain = test_metadata['Nextstrain_clade'].value_counts().gt(20)
# Names of the clades that pass the threshold
next_to_pick = nextstrain[nextstrain].index
nextstrain_clade_filtered = test_metadata.loc[
    test_metadata['Nextstrain_clade'].isin(next_to_pick),
    ['gisaid_epi_isl'],
]

In [20]:
# Samples dropped from the thesis Nextstrain dataset by the nextstrain clade filter, i.e. clades
# with 20 or fewer samples (n=103)(Clades 20D, 21H (Mu), 20E (EU1), 20F, 20G, 21G (Lambda),
# 21D (Eta), 21F (Iota), 21L (Omicron), 21C (Epsilon))
set(nextstrain_metadata) - set(nextstrain_clade_filtered['gisaid_epi_isl'])

{'EPI_ISL_1037197',
 'EPI_ISL_1073625',
 'EPI_ISL_1098645',
 'EPI_ISL_1134769',
 'EPI_ISL_1167702',
 'EPI_ISL_1199027',
 'EPI_ISL_1253633',
 'EPI_ISL_1258014',
 'EPI_ISL_1527039',
 'EPI_ISL_1593727',
 'EPI_ISL_1734233',
 'EPI_ISL_1790091',
 'EPI_ISL_2230700',
 'EPI_ISL_2232189',
 'EPI_ISL_2276073',
 'EPI_ISL_2301629',
 'EPI_ISL_2346429',
 'EPI_ISL_2391364',
 'EPI_ISL_2447843',
 'EPI_ISL_2501709',
 'EPI_ISL_2604870',
 'EPI_ISL_2610471',
 'EPI_ISL_2610683',
 'EPI_ISL_2691896',
 'EPI_ISL_2788186',
 'EPI_ISL_2802859',
 'EPI_ISL_2928027',
 'EPI_ISL_3020128',
 'EPI_ISL_3026018',
 'EPI_ISL_3160715',
 'EPI_ISL_3185575',
 'EPI_ISL_3266859',
 'EPI_ISL_3274366',
 'EPI_ISL_3366469',
 'EPI_ISL_3547003',
 'EPI_ISL_4273978',
 'EPI_ISL_4273995',
 'EPI_ISL_4372793',
 'EPI_ISL_4497058',
 'EPI_ISL_4771900',
 'EPI_ISL_498544',
 'EPI_ISL_498546',
 'EPI_ISL_512092',
 'EPI_ISL_5146279',
 'EPI_ISL_5201410',
 'EPI_ISL_521862',
 'EPI_ISL_522705',
 'EPI_ISL_5301577',
 'EPI_ISL_530239',
 'EPI_ISL_530339',
 'EPI_I

In [23]:
# Generate the csv files holding the EPI_ISL_IDs of each filtered dataset (index column omitted).
# NOTE: the '>' in these file names is not accepted by Windows file systems.
nextstrain_gisaid_filtered.to_csv(
    path_or_buf='./Nextstrain_gisaid>20_EPI.csv',
    index=False,
)
nextstrain_clade_filtered.to_csv(
    path_or_buf='./Nextstrain_clade>20_EPI.csv',
    index=False,
)
ontario_list.to_csv(
    path_or_buf='./Ontario_covid_gisaid>10_EPI.csv',
    index=False,
)