# Create NT files from CTD csvs
<b>Author</b>: Ian Coleman <br/>
<b>Function</b>: Takes local CTD csvs and turns each into a .nt file

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import subprocess
import math

## Functions

In [2]:
def convert_df_nt (df, output_file, subj_url, subj_col, obj_url, obj_col, pred_col, odd_url=0):
    """
    Input:
        DF: some rows and columns of a dataframe
        STR: name for the output file, include filetype .nt
        STR: subj_url is the url to be used for all subjects
        STR: subj_col is the column from which to get the id for the subj
        STR: obj_url is the url to be used for all objects
        STR: obj_col is the column from which to get the id for the obj
        STR: OPTIONAL odd_url is for when a subset obj/subj of require a different url 
    Output:
        NT file
    """
    f = open(output_file,'w')
    for index, row in df.iterrows():
        subj = '<' + subj_url + str(row[subj_col]) + '> '
        pred = '<' + 'http://ian.ie/' + str(row[pred_col]) + '> '
        if 'OMIM' in str(row[obj_col]):
            row[obj_col] = str(row[obj_col]).replace('OMIM:', '')
            obj = '<' + odd_url + str(row[obj_col]) + '> '
        else:
            obj = '<' + obj_url + str(row[obj_col]) + '> '
        f.write(subj + pred + obj + '.' + '\n')
    f.close()

## Download Databases

In [3]:
subprocess.call('pip3 install wget', shell=True)
import wget
wget.download('http://ctdbase.org/reports/CTD_chem_gene_ixns.csv.gz')
wget.download('http://ctdbase.org/reports/CTD_chemicals_diseases.csv.gz')
wget.download('http://ctdbase.org/reports/CTD_chem_pathways_enriched.csv.gz')
wget.download('http://ctdbase.org/reports/CTD_genes_diseases.csv.gz')
wget.download('http://ctdbase.org/reports/CTD_genes_pathways.csv.gz')
wget.download('http://ctdbase.org/reports/CTD_diseases_pathways.csv.gz')
wget.download('http://ctdbase.org/reports/CTD_pheno_term_ixns.csv.gz')

'CTD_pheno_term_ixns.csv.gz'

In [5]:
# Move all the csvs to a subfolder and unzip them
subprocess.call('mkdir csvs', shell=True)
subprocess.call('mv *.gz csvs/', shell=True)
subprocess.call('gunzip csvs/*.gz', shell=True)

0

In [6]:
# too ambitious?? 
# TODO attempt to make one func to import and process all ctd databases
# def ctd_to_rdf(csv, output_file, subj_url, subj_col, obj_url, obj_col, pred_col):
#     """
#     """
#     df = pd.read_csv(csv, skiprows=27)
#     df = df.drop(0)
#     convert_df_nt(df, output_file, subj_url, subj_col, obj_url, obj_col, pred_col)

## CHEM-GENE 

In [38]:
# Read in CTD sample, skipping the intro rows8888
df_cg = pd.read_csv('csvs/CTD_chem_gene_ixns.csv', skiprows=27)
df_cg = df_cg.drop(0)
df_cg = df_cg.rename(columns={'# ChemicalName': 'ChemicalName'}) # rename of a column

In [None]:
# Split the interactionActions into separate predicates RUN THIS ONLY ONCE
s = df_cg['InteractionActions'].str.split('|').apply(pd.Series, 1).stack()
s.index = s.index.droplevel(-1)
s.name = 'InteractionActions'
df_cg = df_cg.join(s.apply(lambda x: pd.Series(x.split('|'))))

In [None]:
# Make the new column prettier
df_cg = df_cg.rename(columns={0: 'Predicate'})
df_cg['Predicate'] = df_cg.Predicate.str.replace('^', '_')
df_cg['Predicate'] = df_cg.Predicate.str.replace(' ', '_')

In [None]:
# Need to change float to int for the url to work
df_cg['GeneID'] = df_cg.GeneID.astype(int)

In [None]:
subj_url = 'http://identifiers.org/ctd.chemical/' 
subj_col = 'ChemicalID'
obj_url = 'http://identifiers.org/ctd.gene/' 
obj_col = 'GeneID'
pred_col = 'Predicate'

convert_df_nt(df_cg, 'output_cg.nt', subj_url, subj_col, obj_url, obj_col, pred_col)

## Chem-Disease

In [14]:
# Read in CTD sample, skipping the intro rows
df_cd = pd.read_csv('csvs/CTD_chemicals_diseases.csv', skiprows=27, nrows = 100)
df_cd = df_cd.drop(0)

In [15]:
'OMIM' in (df_cd.loc[65,'DiseaseID'])

True

In [16]:
# Process DiseaseID so as to be usable in url
df_cd['DiseaseID'] = df_cd['DiseaseID'].str.replace('MESH:', '')

In [17]:
# Create Predicate Column
def cd_predicate(r):
    """
    Create predicate from directevidence if available
    """
    df_cd['DirectEvidence'] = df_cd.DirectEvidence.astype(str) 
    if r['DirectEvidence'] == "nan":
        return 'associated_by_inference_via_' + str(r.InferenceGeneSymbol)
    else:
        return 'associated_directly_with'
    
df_cd['Predicate'] = df_cd.apply(cd_predicate, axis=1)

In [18]:
subj_url = 'http://identifiers.org/ctd.chemical/' 
subj_col = 'ChemicalID'
obj_url = 'http://identifiers.org/mesh/'
obj_url_2 = 'http://identifiers.org/omim/' # to use when CTD gives an omim disease id
obj_col = 'DiseaseID'
pred_col = 'Predicate'

convert_df_nt(df_cd, 'output_cd.nt', subj_url, subj_col, obj_url, obj_col, pred_col, obj_url_2)

In [37]:
df_cd

Unnamed: 0,# ChemicalName,ChemicalID,CasRN,DiseaseName,DiseaseID,DirectEvidence,InferenceGeneSymbol,InferenceScore,OmimIDs,PubMedIDs,Predicate
1,06-Paris-LA-66 protocol,C046983,,Precursor Cell Lymphoblastic Leukemia-Lymphoma,D054198,therapeutic,,,,4519131,associated_directly_with
2,10074-G5,C534883,,Adenocarcinoma,D000230,,MYC,4.40,,26432044,associated_directly_with
3,10074-G5,C534883,,Adenocarcinoma of lung,C538231,,MYC,4.64,,26656844|27602772,associated_directly_with
4,10074-G5,C534883,,Burkitt Lymphoma,D002051,,MYC,5.44,113970.0,,associated_directly_with
5,10074-G5,C534883,,Carcinoma,D002277,,MYC,4.41,,2228319,associated_directly_with
6,10074-G5,C534883,,"Carcinoma, Hepatocellular",D006528,,MYC,4.15,,12029619|15565109|29698666|9029167,associated_directly_with
7,10074-G5,C534883,,"Carcinoma, Merkel Cell",D015266,,MYC,6.46,,25277525,associated_directly_with
8,10074-G5,C534883,,"Carcinoma, Non-Small-Cell Lung",D002289,,MYC,4.60,,24688052,associated_directly_with
9,10074-G5,C534883,,"Carcinoma, Squamous Cell",D002294,,MYC,4.54,,26432044,associated_directly_with
10,10074-G5,C534883,,"Cardiomyopathy, Hypertrophic",D002312,,MYC,5.09,,22000973,associated_directly_with


## Gene Disease

In [19]:
# Read in CTD sample, skipping the intro rows
df_gd = pd.read_csv('csvs/CTD_genes_diseases.csv', skiprows=27, nrows = 100)
df_gd = df_gd.drop(0)


In [20]:
df_gd[-10:]

Unnamed: 0,# GeneSymbol,GeneID,DiseaseName,DiseaseID,DirectEvidence,InferenceChemicalName,InferenceScore,OmimIDs,PubMedIDs
90,14-3-3ZETA,36059.0,Edema,MESH:D004487,,lead acetate,3.53,,25031709
91,14-3-3ZETA,36059.0,Encephalocele,MESH:D004677,,lead acetate,5.58,,11402670
92,14-3-3ZETA,36059.0,Epilepsy,MESH:D004827,,Pentylenetetrazole,4.35,,12632110|16122723|17604186|17969896|20002063|2...
93,14-3-3ZETA,36059.0,"Epilepsy, Absence",MESH:D004832,,Pentylenetetrazole,5.2,,12467714|6426943
94,14-3-3ZETA,36059.0,Fetal Death,MESH:D005313,,lead acetate,4.6,,11402670
95,14-3-3ZETA,36059.0,Fetal Growth Retardation,MESH:D005317,,lead acetate,4.24,,19769572
96,14-3-3ZETA,36059.0,Fetal Resorption,MESH:D005327,,lead acetate,5.02,,11402670
97,14-3-3ZETA,36059.0,Hearing Disorders,MESH:D006311,,lead acetate,4.64,,12062763
98,14-3-3ZETA,36059.0,Hematologic Diseases,MESH:D006402,,lead acetate,4.44,,25800560
99,14-3-3ZETA,36059.0,Hemolysis,MESH:D006461,,lead acetate,4.24,,19428946


In [21]:
# Must make some quick refinements to ensure resulting URLs work
df_gd['GeneID'] = df_gd.GeneID.astype(int) 
df_gd['DirectEvidence'] = df_gd.DirectEvidence.astype(str) 
df_gd['DiseaseID'] = df_gd['DiseaseID'].str.replace('MESH:', '')

# Create Predicate Column
def gd_predicate(r):
    """
    Create predicate
    """
    if r['DirectEvidence'] == "nan":
        return 'associated_by_inference_via_' + str(r.InferenceChemicalName).replace(' ', '_')
    else:
        return 'associated_directly_with'
    
df_gd['Predicate'] = df_gd.apply(gd_predicate, axis=1)

In [22]:
df_gd.Predicate.unique()

array(['associated_by_inference_via_Endocrine_Disruptors',
       'associated_by_inference_via_Water_Pollutants,_Chemical',
       'associated_by_inference_via_zinc_chloride',
       'associated_by_inference_via_Succimer',
       'associated_by_inference_via_Magnetite_Nanoparticles',
       'associated_by_inference_via_Pentylenetetrazole',
       'associated_by_inference_via_lead_acetate'], dtype=object)

In [23]:
df_gd.head()

Unnamed: 0,# GeneSymbol,GeneID,DiseaseName,DiseaseID,DirectEvidence,InferenceChemicalName,InferenceScore,OmimIDs,PubMedIDs,Predicate
1,11-BETA-HSD3,100174880,"Abnormalities, Drug-Induced",D000014,,Endocrine Disruptors,5.15,,22659286,associated_by_inference_via_Endocrine_Disruptors
2,11-BETA-HSD3,100174880,Anemia,D000740,,"Water Pollutants, Chemical",4.19,,26546277,"associated_by_inference_via_Water_Pollutants,_..."
3,11-BETA-HSD3,100174880,"Anemia, Hemolytic",D000743,,"Water Pollutants, Chemical",4.48,,22425172,"associated_by_inference_via_Water_Pollutants,_..."
4,11-BETA-HSD3,100174880,Asthenozoospermia,D053627,,"Water Pollutants, Chemical",5.11,,25179371,"associated_by_inference_via_Water_Pollutants,_..."
5,11-BETA-HSD3,100174880,Birth Weight,D001724,,Endocrine Disruptors,5.82,,27152464|29518214,associated_by_inference_via_Endocrine_Disruptors


In [24]:
subj_url = 'http://identifiers.org/ctd.gene/' 
subj_col = 'GeneID'
obj_url = 'http://identifiers.org/mesh/' 
obj_col = 'DiseaseID'
pred_col = 'Predicate'

convert_df_nt(df_gd, 'output_gd.nt', subj_url, subj_col, obj_url, obj_col, pred_col)

## Gene Pathway

In [25]:
# Read in CTD sample, skipping the intro rows
df_gp = pd.read_csv('csvs/CTD_genes_pathways.csv', skiprows=27, nrows = 100)
df_gp = df_gp.drop(0)

In [26]:
df_gp.head()

Unnamed: 0,# GeneSymbol,GeneID,PathwayName,PathwayID
1,A1BG,1.0,Hemostasis,REACT:R-HSA-109582
2,A1BG,1.0,Immune System,REACT:R-HSA-168256
3,A1BG,1.0,Innate Immune System,REACT:R-HSA-168249
4,A1BG,1.0,Neutrophil degranulation,REACT:R-HSA-6798695
5,A1BG,1.0,"Platelet activation, signaling and aggregation",REACT:R-HSA-76002


In [27]:
# Must make some quick refinements to ensure resulting URLs work
df_gp['GeneID'] = df_gp.GeneID.astype(int) 
df_gp['PathwayID'] = df_gp['PathwayID'].str.replace('REACT:', '')

# Create Predicate Column
def gp_predicate(r):
    return 'associated_directly_with'
    
df_gp['Predicate'] = df_gp.apply(gp_predicate, axis=1)

In [28]:
subj_url = 'http://identifiers.org/ctd.gene/' 
subj_col = 'GeneID'
obj_url = 'http://identifiers.org/reactome/' 
obj_col = 'PathwayID'
pred_col = 'Predicate'

convert_df_nt(df_gp, 'output_gp.nt', subj_url, subj_col, obj_url, obj_col, pred_col)

## Disease Pathway

In [29]:
# Read in CTD sample, skipping the intro rows
df_dp = pd.read_csv('csvs/CTD_diseases_pathways.csv', skiprows=27, nrows = 100)
df_dp = df_dp.drop(0)

In [30]:
df_dp.head()

Unnamed: 0,# DiseaseName,DiseaseID,PathwayName,PathwayID,InferenceGeneSymbol
1,17-Hydroxysteroid Dehydrogenase Deficiency,MESH:C537805,Androgen biosynthesis,REACT:R-HSA-193048,HSD17B3
2,17-Hydroxysteroid Dehydrogenase Deficiency,MESH:C537805,"Fatty acid, triacylglycerol, and ketone body m...",REACT:R-HSA-535734,HSD17B3
3,17-Hydroxysteroid Dehydrogenase Deficiency,MESH:C537805,Fatty Acyl-CoA Biosynthesis,REACT:R-HSA-75105,HSD17B3
4,17-Hydroxysteroid Dehydrogenase Deficiency,MESH:C537805,Metabolic pathways,KEGG:hsa01100,HSD17B3
5,17-Hydroxysteroid Dehydrogenase Deficiency,MESH:C537805,Metabolism,REACT:R-HSA-1430728,HSD17B3


In [31]:
# Must make some quick refinements to ensure resulting URLs work
df_dp['PathwayID'] = df_dp['PathwayID'].str.replace('REACT:', '')
df_dp['DiseaseID'] = df_dp['DiseaseID'].str.replace('MESH:', '')


# Create Predicate Column
def dp_predicate(r):
    return 'associated_by_inference_via_' + str(r.InferenceGeneSymbol)
    
df_dp['Predicate'] = df_dp.apply(dp_predicate, axis=1)

In [32]:
subj_url = 'http://identifiers.org/mesh/' 
subj_col = 'DiseaseID'
obj_url = 'http://identifiers.org/reactome/' 
obj_col = 'PathwayID'
pred_col = 'Predicate'

convert_df_nt(df_dp, 'output_dp.nt', subj_url, subj_col, obj_url, obj_col, pred_col)

## Phenotype Chemical
I'm going to comment this section out as this is the Y data and shouldn't be in KG ( I think )

In [33]:
# Read in CTD sample, skipping the intro rows
# df_pc = pd.read_csv('csvs/CTD_pheno_term_ixns.csv', skiprows=27, nrows = 100)
# df_pc = df_pc.drop(0)
# df_pc[10:20]

Unnamed: 0,# chemicalname,chemicalid,casrn,phenotypename,phenotypeid,comentionedterms,organism,organismid,interaction,interactionactions,anatomyterms,inferencenetworkterms,pubmedids,Unnamed: 13
11,"10,10-bis(4-pyridinylmethyl)-9(10H)-anthracenone",C112297,,transmission of nerve impulse,GO:0019226,,Rattus norvegicus,10116.0,"10,10-bis(4-pyridinylmethyl)-9(10H)-anthraceno...",increases^phenotype,1^Hippocampus^D006624|2^Neurons^D009474|3^Cell...,KCNQ2^3785,26348896.0,
12,"10,12-octadecadienoic acid",C094849,,regulation of fatty acid metabolic process,GO:0019217,lard^C029310^MESH,Rattus norvegicus,10116.0,[lard affects regulation of fatty acid metabol...,affects^phenotype|increases^abundance,"1^Mammary Glands, Animal^D008321",,26115784.0,
13,"10-(2-(5H-(1,2,4)triazino(5,6-b)indol-3-ylthio...",C573921,,cell proliferation,GO:0008283,resveratrol^C059514^MESH|Sodium Fluoride^D0129...,Mus musculus,10090.0,"10-(2-(5H-(1,2,4)triazino(5,6-b)indol-3-ylthio...",decreases^phenotype|decreases^reaction,1^Ameloblasts^D000565|2^Cell Line^D002460,,24296261.0,
14,10-((3-hydroxy-4-methoxybenzylidene))-9(10H)-a...,C476444,,release from viral latency,GO:0019046,,Homo sapiens,9606.0,10-((3-hydroxy-4-methoxybenzylidene))-9(10H)-a...,increases^phenotype,"1^Cell Line, Transformed^D002461",,26225566.0,
15,10-((3-hydroxy-4-methoxybenzylidene))-9(10H)-a...,C476444,,release from viral latency,GO:0019046,bryostatin 1^C046785^MESH,Homo sapiens,9606.0,bryostatin 1 promotes the reaction [10-((3-hyd...,increases^phenotype|increases^reaction,"1^Cell Line, Transformed^D002461",,26225566.0,
16,10-((3-hydroxy-4-methoxybenzylidene))-9(10H)-a...,C476444,,release from viral latency,GO:0019046,prostratin^C070999^MESH,Homo sapiens,9606.0,prostratin promotes the reaction [10-((3-hydro...,increases^phenotype|increases^reaction,"1^Cell Line, Transformed^D002461",,26225566.0,
17,10-(4'-(N-diethylamino)butyl)-2-chlorophenoxazine,C553100,,cell proliferation,GO:0008283,cobaltous chloride^C018021^MESH|HCRT^3060^GENE,Rattus norvegicus,10116.0,10-(4'-(N-diethylamino)butyl)-2-chlorophenoxaz...,decreases^phenotype|decreases^reaction,1^Embryonic Structures^D004628|2^Neurons^D0094...,THPO^7066,24243084.0,
18,10-(4'-(N-diethylamino)butyl)-2-chlorophenoxazine,C553100,,positive regulation of apoptotic process,GO:0043065,Tetradecanoylphorbol Acetate^D013755^MESH,Homo sapiens,9606.0,[10-(4'-(N-diethylamino)butyl)-2-chlorophenoxa...,affects^cotreatment|increases^phenotype,"1^Erythroblasts^D004900|2^Cell Line, Tumor^D04...",,20523355.0,
19,10-(4'-(N-diethylamino)butyl)-2-chlorophenoxazine,C553100,,positive regulation of megakaryocyte different...,GO:0045654,THPO^7066^GENE,Homo sapiens,9606.0,10-(4'-(N-diethylamino)butyl)-2-chlorophenoxaz...,decreases^reaction|increases^phenotype,"1^Hematopoietic Stem Cells^D006412|2^Cells, Cu...",THPO^7066,20523355.0,
20,10-(6'-ubiquinonyl)decyltriphenylphosphonium b...,C476756,,cellular respiration,GO:0045333,SOD2^6648^GENE,Rattus norvegicus,10116.0,10-(6'-ubiquinonyl)decyltriphenylphosphonium b...,decreases^reaction|increases^phenotype,"1^Kidney^D007668|2^Cells, Cultured^D002478|3^M...",CYCS^54205|PPARGC1A^10891,24563852.0,


In [34]:
# Split the interactionActions into separate predicates RUN THIS ONLY ONCE
# s = df_pc['interactionactions'].str.split('|').apply(pd.Series, 1).stack()
# s.index = s.index.droplevel(-1)
# s.name = 'interactionactions'
# df_pc = df_pc.join(s.apply(lambda x: pd.Series(x.split('|'))))
# df_pc = df_pc.rename(columns={0: 'Predicate'})
# df_pc['Predicate'] = df_pc.Predicate.str.replace('^', '_')
# df_pc['Predicate'] = df_pc.Predicate.str.replace(' ', '_')

In [35]:
# subj_url = 'http://identifiers.org/ctd.chemical/'  
# subj_col = 'chemicalid'
# obj_url = 'http://identifiers.org/go/' 
# obj_col = 'phenotypeid'
# pred_col = 'Predicate'

# convert_df_nt(df_pc, 'output_pc.nt', subj_url, subj_col, obj_url, obj_col, pred_col)

## Merge NT files

In [36]:
subprocess.call('cat *.nt > master.nt', shell=True)

0