# Create Nodes and Relationships for SARS-CoV-2 Strain Data
Data are provided by [Nextstrain.org](https://nextstrain.org), a resource for real-time tracking of pathogen evolution in their [Git repository](https://github.com/nextstrain/ncov).

This notebook creates Node and Relationship .csv data files to create the following strain subgraph

![title](../docs/strains.png)

In [2]:
import pandas as pd

In [3]:
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columsns

In [4]:
nextstrain = pd.read_csv("https://github.com/nextstrain/ncov/raw/master/data/metadata.tsv", sep = '\t')

In [5]:
nextstrain.head()

Unnamed: 0,strain,virus,gisaid_epi_isl,genbank_accession,date,region,country,division,division_exposure,location,segment,length,host,age,sex,originating_lab,submitting_lab,authors,url,title,date_submitted
0,/Belgium/SN-03031/2020,ncov,EPI_ISL_416469,?,2020-03-03,Europe,Belgium,Kessel-Lo,Kessel-Lo,,genome,29845,Human,?,,"KU Leuven, Clinical and Epidemiological Virology","KU Leuven, Clinical and Epidemiological Virology",Vanmechelen et al,https://www.gisaid.org,?,2020-03-22
1,Anhui/SZ005/2020,ncov,EPI_ISL_413485,?,2020-01-24,Asia,China,Anhui,Anhui,Suzhou,genome,29860,Human,?,,"Department of microbiology laboratory,Anhui Pr...","Department of microbiology laboratory,Anhui Pr...",Li et al,https://www.gisaid.org,?,2020-03-05
2,Australia/NSW01/2020,ncov,EPI_ISL_407893,?,2020-01-24,Oceania,Australia,New South Wales,Hubei,Sydney,genome,29782,Human,?,,Centre for Infectious Diseases and Microbiolog...,NSW Health Pathology - Institute of Clinical P...,Eden et al,https://www.gisaid.org,?,2020-02-06
3,Australia/NSW02/2020,ncov,EPI_ISL_408976,?,2020-01-22,Oceania,Australia,New South Wales,New South Wales,Sydney,genome,29741,Human,?,,Centre for Infectious Diseases and Microbiolog...,NSW Health Pathology - Institute of Clinical P...,Rockett et al,https://www.gisaid.org,?,2020-02-11
4,Australia/NSW03/2020,ncov,EPI_ISL_408977,?,2020-01-25,Oceania,Australia,New South Wales,New South Wales,Sydney,genome,29782,Human,?,,"Serology, Virology and OTDS Laboratories (SAVi...",NSW Health Pathology - Institute of Clinical P...,Eden et al,https://www.gisaid.org,?,2020-02-11


## Transform and standardize data

Graph databases don't have "null" values. By setting missing values to '', they will not be represented in the graph.

In [6]:
nextstrain.replace('?', '', inplace=True)
nextstrain.replace('Unknown', '', inplace=True)

Taxonomy

In [7]:
# TODO replace with NCBI taxonomy lookup,
taxonomy_to_id = {'Human': '9606', 
                  'Rhinolophus affinis': '59477', 
                  'Manis javanica': '9974', 
                  'Canine': '9611'}
taxonomy_to_name = {'Human': 'Homo sapiens', 
                    'Rhinolophus affinis': 'Rhinolophus affinis',
                    'Manis javanica': 'Manis javanica',
                    'Canine': 'Canis'}
host_type = {'Human': 'Person', 
             'Rhinolophus affinis': 'Animal',
             'Manis javanica': 'Animal',
             'Canine': 'Animal'}

In [8]:
nextstrain['taxonomy_id'] = nextstrain['host'].apply(lambda s: taxonomy_to_id.get(s, ''))
nextstrain['taxonomy_name'] = nextstrain['host'].apply(lambda s: taxonomy_to_name.get(s, ''))
nextstrain['host_type'] = nextstrain['host'].apply(lambda s: host_type.get(s, ''))
nextstrain.head()

Unnamed: 0,strain,virus,gisaid_epi_isl,genbank_accession,date,region,country,division,division_exposure,location,segment,length,host,age,sex,originating_lab,submitting_lab,authors,url,title,date_submitted,taxonomy_id,taxonomy_name,host_type
0,/Belgium/SN-03031/2020,ncov,EPI_ISL_416469,,2020-03-03,Europe,Belgium,Kessel-Lo,Kessel-Lo,,genome,29845,Human,,,"KU Leuven, Clinical and Epidemiological Virology","KU Leuven, Clinical and Epidemiological Virology",Vanmechelen et al,https://www.gisaid.org,,2020-03-22,9606,Homo sapiens,Person
1,Anhui/SZ005/2020,ncov,EPI_ISL_413485,,2020-01-24,Asia,China,Anhui,Anhui,Suzhou,genome,29860,Human,,,"Department of microbiology laboratory,Anhui Pr...","Department of microbiology laboratory,Anhui Pr...",Li et al,https://www.gisaid.org,,2020-03-05,9606,Homo sapiens,Person
2,Australia/NSW01/2020,ncov,EPI_ISL_407893,,2020-01-24,Oceania,Australia,New South Wales,Hubei,Sydney,genome,29782,Human,,,Centre for Infectious Diseases and Microbiolog...,NSW Health Pathology - Institute of Clinical P...,Eden et al,https://www.gisaid.org,,2020-02-06,9606,Homo sapiens,Person
3,Australia/NSW02/2020,ncov,EPI_ISL_408976,,2020-01-22,Oceania,Australia,New South Wales,New South Wales,Sydney,genome,29741,Human,,,Centre for Infectious Diseases and Microbiolog...,NSW Health Pathology - Institute of Clinical P...,Rockett et al,https://www.gisaid.org,,2020-02-11,9606,Homo sapiens,Person
4,Australia/NSW03/2020,ncov,EPI_ISL_408977,,2020-01-25,Oceania,Australia,New South Wales,New South Wales,Sydney,genome,29782,Human,,,"Serology, Virology and OTDS Laboratories (SAVi...",NSW Health Pathology - Institute of Clinical P...,Eden et al,https://www.gisaid.org,,2020-02-11,9606,Homo sapiens,Person


country

In [9]:
nextstrain['country'] = nextstrain['country'].apply(lambda c: str(c).replace('China', 'Mainland China'))

admin1

In [10]:
nextstrain['admin1'] = nextstrain['division']

admin2 (TODO check this data)

In [11]:
def get_admin2(location):
    loc = str(location)
    admin2 = ''
    if "County" in loc or "District" in loc:
        admin2 = loc
    return admin2

In [12]:
nextstrain['admin2'] = nextstrain['location'].apply(get_admin2)

City

In [13]:
def get_city(location):
    loc = str(location)
    city = ''
    if not ("County" in loc or "District" in loc):
        city = loc
    return city

In [14]:
nextstrain['city'] = nextstrain['location'].apply(get_city)

### Read Clade information
Clade info is missing in the file downloaded above. The file with clade info can only be downloaded manually from the Nextstrain.org web site. Furthermore, the file has fewer strains and even less clade assignments. File Github issues (https://github.com/nextstrain/ncov/issues/207, https://github.com/nextstrain/ncov/issues/208

In [15]:
clade = pd.read_csv("../cached_data/nextstrain_ncov_metadata.tsv", sep = '\t')

In [16]:
clade.head()

Unnamed: 0,Strain,Admin Division,Originating Lab,Submitting Lab,gisaid_epi_isl,url,Country,Region,Host,Submission Date,Location,Clade,Collection Data,Author,Exposure History,genbank_accession
0,Wuhan/HBCDC-HB-04/2020,Hubei,"Union Hospital of Tongji Medical College, Huaz...",Hubei Provincial Center for Disease Control an...,EPI_ISL_412980,,China,Asia,Human,One month ago,Wuhan,B,2020-01-18,Fang et al,,
1,China/Shanghai/SH0010,Shanghai,"Shanghai Public Health Clinical Center, Shangh...",National Research Center for Translational Med...,EPI_ISL_416323,,China,Asia,Human,1-2 days ago,,B,2020-01-29,Wang et al,,
2,Wuhan/HBCDC-HB-02/2020,Hubei,The Central Hospital Of Wuhan,Hubei Provincial Center for Disease Control an...,EPI_ISL_412978,,China,Asia,Human,One month ago,Wuhan,B,2020-01-17,Fang et al,,
3,Wuhan/HBCDC-HB-06/2020,Hubei,Wuhan Lung Hospital,Hubei Provincial Center for Disease Control an...,EPI_ISL_412982,,China,Asia,Human,One month ago,Wuhan,B,2020-02-07,Fang et al,,
4,China/Shanghai/SH0075,Shanghai,"Shanghai Public Health Clinical Center, Shangh...",National Research Center for Translational Med...,EPI_ISL_416378,,China,Asia,Human,1-2 days ago,,B,2020-01-30,Wang et al,,


### Merge strain data with clade assignments

In [17]:
clade = clade[['Strain', 'Clade']]
clade.rename(columns={'Strain': 'strain', 'Clade': 'clade'}, inplace=True)
nextstrain = nextstrain.merge(clade, on='strain', how='left')
nextstrain.fillna('', inplace=True)

In [18]:
nextstrain.head()

Unnamed: 0,strain,virus,gisaid_epi_isl,genbank_accession,date,region,country,division,division_exposure,location,segment,length,host,age,sex,originating_lab,submitting_lab,authors,url,title,date_submitted,taxonomy_id,taxonomy_name,host_type,admin1,admin2,city,clade
0,/Belgium/SN-03031/2020,ncov,EPI_ISL_416469,,2020-03-03,Europe,Belgium,Kessel-Lo,Kessel-Lo,,genome,29845,Human,,,"KU Leuven, Clinical and Epidemiological Virology","KU Leuven, Clinical and Epidemiological Virology",Vanmechelen et al,https://www.gisaid.org,,2020-03-22,9606,Homo sapiens,Person,Kessel-Lo,,,
1,Anhui/SZ005/2020,ncov,EPI_ISL_413485,,2020-01-24,Asia,Mainland China,Anhui,Anhui,Suzhou,genome,29860,Human,,,"Department of microbiology laboratory,Anhui Pr...","Department of microbiology laboratory,Anhui Pr...",Li et al,https://www.gisaid.org,,2020-03-05,9606,Homo sapiens,Person,Anhui,,Suzhou,B
2,Australia/NSW01/2020,ncov,EPI_ISL_407893,,2020-01-24,Oceania,Australia,New South Wales,Hubei,Sydney,genome,29782,Human,,,Centre for Infectious Diseases and Microbiolog...,NSW Health Pathology - Institute of Clinical P...,Eden et al,https://www.gisaid.org,,2020-02-06,9606,Homo sapiens,Person,New South Wales,,Sydney,B
3,Australia/NSW02/2020,ncov,EPI_ISL_408976,,2020-01-22,Oceania,Australia,New South Wales,New South Wales,Sydney,genome,29741,Human,,,Centre for Infectious Diseases and Microbiolog...,NSW Health Pathology - Institute of Clinical P...,Rockett et al,https://www.gisaid.org,,2020-02-11,9606,Homo sapiens,Person,New South Wales,,Sydney,
4,Australia/NSW03/2020,ncov,EPI_ISL_408977,,2020-01-25,Oceania,Australia,New South Wales,New South Wales,Sydney,genome,29782,Human,,,"Serology, Virology and OTDS Laboratories (SAVi...",NSW Health Pathology - Institute of Clinical P...,Eden et al,https://www.gisaid.org,,2020-02-11,9606,Homo sapiens,Person,New South Wales,,Sydney,


## Create and save nodes

Node: Strain

In [19]:
strain = nextstrain[['strain','clade','gisaid_epi_isl','genbank_accession','date']].copy()
strain.drop_duplicates(inplace=True)
strain.rename(columns={"strain": "name:ID(strain_id)"}, inplace=True)
strain[':LABEL'] = "Strain"
strain.to_csv("../data/nodes/Strain.csv", index=False)
strain.head()

Unnamed: 0,name:ID(strain_id),clade,gisaid_epi_isl,genbank_accession,date,:LABEL
0,/Belgium/SN-03031/2020,,EPI_ISL_416469,,2020-03-03,Strain
1,Anhui/SZ005/2020,B,EPI_ISL_413485,,2020-01-24,Strain
2,Australia/NSW01/2020,B,EPI_ISL_407893,,2020-01-24,Strain
3,Australia/NSW02/2020,,EPI_ISL_408976,,2020-01-22,Strain
4,Australia/NSW03/2020,,EPI_ISL_408977,,2020-01-25,Strain


Node: Host

In [20]:
host = nextstrain[['taxonomy_id','taxonomy_name']].copy()
host.query("taxonomy_id != ''", inplace=True)
host.drop_duplicates(inplace=True) 
host.rename(columns={'taxonomy_id': 'taxonomy_id:ID(host_taxonomy_id)', 'taxonomy_name': 'name'}, inplace=True)
host[':LABEL'] = "Host"
host.to_csv("../data/nodes/Host.csv", index=False)
host.head()

Unnamed: 0,taxonomy_id:ID(host_taxonomy_id),name,:LABEL
0,9606,Homo sapiens,Host
1065,59477,Rhinolophus affinis,Host
1066,9611,Canis,Host
1067,9974,Manis javanica,Host


Node: Person/Animal

In [21]:
person_animal = nextstrain[['strain','age','sex','division_exposure','taxonomy_id','date_submitted','host_type']].copy()
person_animal.query("taxonomy_id != ''", inplace=True)
person_animal.drop_duplicates(inplace=True)
# use strain as a unique id for the host
person_animal.rename(columns={'strain':'id:ID(host_id)', 'division_exposure':'exposure_location','host_type': ':LABEL'}, inplace=True)
person_animal.to_csv("../data/nodes/PersonAnimal.csv", index=False)
person_animal.head()

Unnamed: 0,id:ID(host_id),age,sex,exposure_location,taxonomy_id,date_submitted,:LABEL
0,/Belgium/SN-03031/2020,,,Kessel-Lo,9606,2020-03-22,Person
1,Anhui/SZ005/2020,,,Anhui,9606,2020-03-05,Person
2,Australia/NSW01/2020,,,Hubei,9606,2020-02-06,Person
3,Australia/NSW02/2020,,,New South Wales,9606,2020-02-11,Person
4,Australia/NSW03/2020,,,New South Wales,9606,2020-02-11,Person


Node: Country

In [22]:
country = nextstrain[['country']].copy()
country.drop_duplicates(inplace=True)
country.query("country != ''", inplace=True)
country.rename(columns={'country': 'name:ID(strain_country)'}, inplace=True)
country[':LABEL'] = "Country"
country.to_csv("../data/nodes/Country.csv", index=False)
country.head()

Unnamed: 0,name:ID(strain_country),:LABEL
0,Belgium,Country
1,Mainland China,Country
2,Australia,Country
52,Brazil,Country
69,Cambodia,Country


Node: Admin1

In [23]:
admin1 = nextstrain[['admin1']].copy()
admin1.drop_duplicates(inplace=True)
admin1.query("admin1 != ''", inplace=True)
admin1.rename(columns={'admin1': 'name:ID(strain_admin1)'}, inplace=True)
admin1[':LABEL'] = 'Admin1'
admin1.to_csv("../data/nodes/Admin1.csv", index=False)
admin1.head()

Unnamed: 0,name:ID(strain_admin1),:LABEL
0,Kessel-Lo,Admin1
1,Anhui,Admin1
2,New South Wales,Admin1
15,Queensland,Admin1
20,Victoria,Admin1


Node: Admin2

In [24]:
admin2 = nextstrain[['admin2']].copy()
admin2.drop_duplicates(inplace=True)
admin2.query("admin2 != ''", inplace=True)
admin2.rename(columns={'admin2': 'name:ID(strain_admin2)'}, inplace=True)
admin2[':LABEL'] = 'Admin2'
admin2.to_csv("../data/nodes/Admin2.csv", index=False)
admin2.head()

Unnamed: 0,name:ID(strain_admin2),:LABEL
338,Heinsberg District,Admin2
817,Sonoma County,Admin2
818,Solano County,Admin2
826,San Diego County,Admin2
829,Orange County,Admin2


Node: City

In [25]:
city = nextstrain[['city']].copy()
city.drop_duplicates(inplace=True)
city.query("city != ''", inplace=True)
city.rename(columns={'city': 'name:ID(strain_city)'}, inplace=True)
city[':LABEL'] = "City"
city.to_csv("../data/nodes/City.csv", index=False)
city.head()

Unnamed: 0,name:ID(strain_city),:LABEL
0,,City
1,Suzhou,City
2,Sydney,City
15,Gold Coast,City
20,Clayton,City


## Create and save relationships

Relationship: Person/Animal-CARRIES-Strain (note, strain is also used as a key for a person/animal)

In [26]:
pCs = nextstrain[['taxonomy_id','strain']].copy()
pCs.drop_duplicates(inplace=True)
pCs.query("taxonomy_id != ''", inplace=True)
pCs = pCs[['strain']]
pCs['host_id'] = pCs['strain']
pCs.rename(columns={'host_id': ':START_ID(host_id)', 'strain': ':END_ID(strain_id)'}, inplace=True)
pCs[':TYPE'] = "CARRIES"
pCs.to_csv("../data/relationships/PersonAnimal-CARRIES-Strain.csv", index=False)
pCs.head()

Unnamed: 0,:END_ID(strain_id),:START_ID(host_id),:TYPE
0,/Belgium/SN-03031/2020,/Belgium/SN-03031/2020,CARRIES
1,Anhui/SZ005/2020,Anhui/SZ005/2020,CARRIES
2,Australia/NSW01/2020,Australia/NSW01/2020,CARRIES
3,Australia/NSW02/2020,Australia/NSW02/2020,CARRIES
4,Australia/NSW03/2020,Australia/NSW03/2020,CARRIES


Relationship: Strain-FOUND_IN-Country

In [27]:
sFc = nextstrain[['strain', 'country']].copy()
sFc.drop_duplicates(inplace=True)
sFc.query("country != ''", inplace=True)
sFc.rename(columns={'strain': ':START_ID(strain_id)', 'country': ':END_ID(strain_country)'}, inplace=True)
sFc[':TYPE'] = "FOUND_IN"
sFc.to_csv("../data/relationships/Strain-FOUND_IN-Country.csv", index=False)
sFc.head()

Unnamed: 0,:START_ID(strain_id),:END_ID(strain_country),:TYPE
0,/Belgium/SN-03031/2020,Belgium,FOUND_IN
1,Anhui/SZ005/2020,Mainland China,FOUND_IN
2,Australia/NSW01/2020,Australia,FOUND_IN
3,Australia/NSW02/2020,Australia,FOUND_IN
4,Australia/NSW03/2020,Australia,FOUND_IN


Relationship: Person/Animal-LOCATED_IN-Country

In [28]:
iLc = nextstrain[['strain', 'taxonomy_id', 'country']].copy()
iLc.drop_duplicates(inplace=True)
iLc.query("taxonomy_id != ''", inplace=True)
iLc.query("country != ''", inplace=True)
iLc = iLc[['strain', 'country']]
iLc.rename(columns={'strain': ':START_ID(host_id)', 'country': ':END_ID(strain_country)'}, inplace=True)
iLc[':TYPE'] = "LOCATED_IN"
iLc.to_csv("../data/relationships/PersonAnimal-LOCATED_IN-Country.csv", index=False)
iLc.head()

Unnamed: 0,:START_ID(host_id),:END_ID(strain_country),:TYPE
0,/Belgium/SN-03031/2020,Belgium,LOCATED_IN
1,Anhui/SZ005/2020,Mainland China,LOCATED_IN
2,Australia/NSW01/2020,Australia,LOCATED_IN
3,Australia/NSW02/2020,Australia,LOCATED_IN
4,Australia/NSW03/2020,Australia,LOCATED_IN


Relationship: Strain-FOUND_IN-Admin1

In [29]:
sFa1 = nextstrain[['strain', 'admin1']].copy()
sFa1.drop_duplicates(inplace=True)
sFa1.query("admin1 != ''", inplace=True)
sFa1.rename(columns={'strain': ':START_ID(strain_id)', 'admin1': ':END_ID(strain_admin1)'}, inplace=True)
sFa1[':TYPE'] = "FOUND_IN"
sFa1.to_csv("../data/relationships/Strain-FOUND_IN-Admin1.csv", index=False)
sFa1.head()

Unnamed: 0,:START_ID(strain_id),:END_ID(strain_admin1),:TYPE
0,/Belgium/SN-03031/2020,Kessel-Lo,FOUND_IN
1,Anhui/SZ005/2020,Anhui,FOUND_IN
2,Australia/NSW01/2020,New South Wales,FOUND_IN
3,Australia/NSW02/2020,New South Wales,FOUND_IN
4,Australia/NSW03/2020,New South Wales,FOUND_IN


Relationship: Person/Animal-LOCATED_IN-Admin1

In [30]:
pLa1 = nextstrain[['strain', 'taxonomy_id', 'admin1']].copy()
pLa1.drop_duplicates(inplace=True)
pLa1.query("taxonomy_id != ''", inplace=True) # exclude strains that don't occure in organisms
pLa1.query("admin1 != ''", inplace=True)
pLa1 = pLa1[['strain', 'admin1']]
pLa1.rename(columns={'strain': ':START_ID(host_id)', 'admin1': ':END_ID(strain_admin1)'}, inplace=True)
pLa1[':TYPE'] = "LOCATED_IN"
pLa1.to_csv("../data/relationships/PersonAnimal-LOCATED_IN-Admin1.csv", index=False)
pLa1.head()

Unnamed: 0,:START_ID(host_id),:END_ID(strain_admin1),:TYPE
0,/Belgium/SN-03031/2020,Kessel-Lo,LOCATED_IN
1,Anhui/SZ005/2020,Anhui,LOCATED_IN
2,Australia/NSW01/2020,New South Wales,LOCATED_IN
3,Australia/NSW02/2020,New South Wales,LOCATED_IN
4,Australia/NSW03/2020,New South Wales,LOCATED_IN


Relationship: Strain-FOUND_IN-Admin2

In [31]:
sFa2 = nextstrain[['strain', 'admin2']].copy()
sFa2.drop_duplicates(inplace=True)
sFa2.query("admin2 != ''", inplace=True)
sFa2.rename(columns={'strain': ':START_ID(strain_id)', 'admin2': ':END_ID(strain_admin2)'}, inplace=True)
sFa2[':TYPE'] = "FOUND_IN"
sFa2.to_csv("../data/relationships/Strain-FOUND_IN-Admin2.csv", index=False)
sFa2.head()

Unnamed: 0,:START_ID(strain_id),:END_ID(strain_admin2),:TYPE
338,Germany/NRW-01/2020,Heinsberg District,FOUND_IN
340,Germany/NRW-02-1/2020,Heinsberg District,FOUND_IN
341,Germany/NRW-03/2020,Heinsberg District,FOUND_IN
342,Germany/NRW-04/2020,Heinsberg District,FOUND_IN
343,Germany/NRW-05/2020,Heinsberg District,FOUND_IN


Relationship: Person/Animal-LOCATED_IN-Admin2

In [32]:
pLa2 = nextstrain[['strain', 'taxonomy_id', 'admin2']].copy()
pLa2.drop_duplicates(inplace=True)
pLa2.query("taxonomy_id != ''", inplace=True) # exclude strains that don't occure in organisms
pLa2.query("admin2 != ''", inplace=True)
pLa2 = pLa2[['strain', 'admin2']]
pLa2.rename(columns={'strain': ':START_ID(host_id)', 'admin2': ':END_ID(strain_admin2)'}, inplace=True)
pLa2[':TYPE'] = "LOCATED_IN"
pLa2.to_csv("../data/relationships/PersonAnimal-LOCATED_IN-Admin2.csv", index=False)
pLa2.head()

Unnamed: 0,:START_ID(host_id),:END_ID(strain_admin2),:TYPE
338,Germany/NRW-01/2020,Heinsberg District,LOCATED_IN
340,Germany/NRW-02-1/2020,Heinsberg District,LOCATED_IN
341,Germany/NRW-03/2020,Heinsberg District,LOCATED_IN
342,Germany/NRW-04/2020,Heinsberg District,LOCATED_IN
343,Germany/NRW-05/2020,Heinsberg District,LOCATED_IN


Relationship: Strain-FOUND_IN-City

In [33]:
sFct = nextstrain[['strain', 'city']].copy()
sFct.drop_duplicates(inplace=True)
sFct.query("city != ''", inplace=True)
sFct.rename(columns={'strain': ':START_ID(strain_id)', 'city': ':END_ID(strain_city)'}, inplace=True)
sFct[':TYPE'] = "FOUND_IN"
sFct.to_csv("../data/relationships/Strain-FOUND_IN-City.csv", index=False)
sFct.head()

Unnamed: 0,:START_ID(strain_id),:END_ID(strain_city),:TYPE
0,/Belgium/SN-03031/2020,,FOUND_IN
1,Anhui/SZ005/2020,Suzhou,FOUND_IN
2,Australia/NSW01/2020,Sydney,FOUND_IN
3,Australia/NSW02/2020,Sydney,FOUND_IN
4,Australia/NSW03/2020,Sydney,FOUND_IN


Relationship: Person/Animal-LOCATED_IN-City

In [34]:
paLc = nextstrain[['strain', 'taxonomy_id', 'city']].copy()
paLc.drop_duplicates(inplace=True)
paLc.query("taxonomy_id != ''", inplace=True) # exclude strains that don't occure in organisms
paLc.query("city != ''", inplace=True)
paLc = paLc[['strain', 'city']]
paLc.rename(columns={'strain': ':START_ID(host_id)', 'city': ':END_ID(strain_city)'}, inplace=True)
paLc[':TYPE'] = "LOCATED_IN"
paLc.to_csv("../data/relationships/PersonAnimal-LOCATED_IN-City.csv", index=False)
paLc.head()

Unnamed: 0,:START_ID(host_id),:END_ID(strain_city),:TYPE
0,/Belgium/SN-03031/2020,,LOCATED_IN
1,Anhui/SZ005/2020,Suzhou,LOCATED_IN
2,Australia/NSW01/2020,Sydney,LOCATED_IN
3,Australia/NSW02/2020,Sydney,LOCATED_IN
4,Australia/NSW03/2020,Sydney,LOCATED_IN


Relationship: Pathogen-HAS-Strain

In [35]:
pHs = nextstrain[['strain']].copy()
pHs.drop_duplicates(inplace=True)
pHs.query("strain != ''", inplace=True)
pHs['taxonomy_id'] = '2697049'
pHs.rename(columns={'taxonomy_id': ':START_ID(pathogen_taxonomy_id)', 'strain': ':END_ID(strain_id)'}, inplace=True)
pHs[':TYPE'] = "HAS"
pHs.to_csv("../data/relationships/Pathogen-HAS-Strain.csv", index=False)
pHs.head()

Unnamed: 0,:END_ID(strain_id),:START_ID(pathogen_taxonomy_id),:TYPE
0,/Belgium/SN-03031/2020,2697049,HAS
1,Anhui/SZ005/2020,2697049,HAS
2,Australia/NSW01/2020,2697049,HAS
3,Australia/NSW02/2020,2697049,HAS
4,Australia/NSW03/2020,2697049,HAS


Relationship: Person/Animal-IS_A-Host

In [36]:
pIh = nextstrain[['strain', 'taxonomy_id']].copy()
pIh.drop_duplicates(inplace=True)
pIh.query("taxonomy_id != ''", inplace=True) # exclude strains that don't occure in organisms
pIh.rename(columns={'strain': ':START_ID(host_id)', 'taxonomy_id': ':END_ID(host_taxonomy_id)'}, inplace=True)
pIh[':TYPE'] = "IS_A"
pIh.to_csv("../data/relationships/PersonAnimal-IS_A-Host.csv", index=False)
pIh.head()

Unnamed: 0,:START_ID(host_id),:END_ID(host_taxonomy_id),:TYPE
0,/Belgium/SN-03031/2020,9606,IS_A
1,Anhui/SZ005/2020,9606,IS_A
2,Australia/NSW01/2020,9606,IS_A
3,Australia/NSW02/2020,9606,IS_A
4,Australia/NSW03/2020,9606,IS_A
