# Create NT files from CTD csvs
<b>Author</b>: Ian Coleman <br/>
<b>Function</b>: Takes local CTD csvs and turns each into a .nt file

<b>Notes:</b>
To run this for visualising and investigating the data you'll need to ensure that each pd.read_csv line takes the argument `nrows=1000` or some other small number <br>
To run it for the intended purpose of converting the DBs to RDF make sure that the nrows argument is not being passed anywhere

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import subprocess
import math
import pickle

In [2]:
# Begin log file (progress etc so can be checked from ssh)
# '>' overwrites log.txt, so each run starts a fresh log headed by the current date
subprocess.call('echo "Date: " `date` > log.txt', shell=True)

0

## Functions

In [13]:
# def convert_df_nt (df, output_file, subj_url, subj_col, obj_url, obj_col, pred_col, odd_url=0):
#     """
#     Input:
#         DF: some rows and columns of a dataframe
#         STR: name for the output file, include filetype .nt
#         STR: subj_url is the url to be used for all subjects
#         STR: subj_col is the column from which to get the id for the subj
#         STR: obj_url is the url to be used for all objects
#         STR: obj_col is the column from which to get the id for the obj
#         STR: OPTIONAL odd_url is for when a subset obj/subj of require a different url 
#     Output:
#         NT file
#     """
#     f = open(output_file,'w')
#     for index, row in df.iterrows():
#         subj = '<' + subj_url + row[subj_col] + '> '
#         pred = '<' + 'http://ian.ie/' + row[pred_col] + '> '
#         if 'OMIM' in row[obj_col]:
#             row[obj_col] = row[obj_col].replace('OMIM:', '')
#             obj = '<' + odd_url + row[obj_col] + '> '
#         else:
#             obj = '<' + obj_url + row[obj_col] + '> '
#         f.write(subj + pred + obj + '.' + '\n')
#     f.close()

In [2]:
# Version 2 of this function. It aims to include predicate code if provided, preferentially
# to handmade URI. ALSO now accounts for kegg in the pathway columns

def convert_df_nt(df, output_file, subj_url, subj_col, obj_url, obj_col, pred_col, odd_url=None):
    """
    Write one N-Triples line per row of a dataframe.

    Input:
        DF: df, some rows and columns of a dataframe; the id columns must hold strings
        STR: output_file, name for the output file, include filetype .nt
        STR: subj_url is the url to be used for all subjects
        STR: subj_col is the column from which to get the id for the subj
        STR: obj_url is the url to be used for all objects
        STR: obj_col is the column from which to get the id for the obj
        STR: pred_col is the column holding the hand-made predicate label; it is
             only read for rows that have no usable predicate code
        List: OPTIONAL odd_url is for when a subset of obj ids require a different url.
              It should be a list of [substr to remove, URL]. When omitted (or
              shorter than two items) every object uses obj_url.
    Output:
        NT file (overwritten if it already exists)

    If the dataframe has a 'pred_code' column, its value is used preferentially
    as the predicate (under http://purl.obolibrary.org/obo/). Rows whose code is
    the string 'nan' or not a string at all (e.g. a real NaN) fall back to the
    hand-made http://ian.ie/ predicate built from pred_col.
    """
    # Only look up predicate codes when the column actually exists
    has_pred_code = 'pred_code' in df.columns

    if odd_url is not None and len(odd_url) >= 2:
        odd_prefix, odd_base = odd_url[0], odd_url[1]
    else:
        odd_prefix = odd_base = None

    # 'with' guarantees the file is closed even if a row raises part-way through
    with open(output_file, 'w') as f:
        for _, row in df.iterrows():
            subj = '<' + subj_url + row[subj_col] + '> '

            code = row['pred_code'] if has_pred_code else None
            if isinstance(code, str) and code != 'nan':
                pred = '<' + 'http://purl.obolibrary.org/obo/' + code + '> '
            else:
                pred = '<' + 'http://ian.ie/' + row[pred_col] + '> '

            # Work on a local copy of the id rather than writing back into the row
            obj_id = row[obj_col]
            if odd_prefix is not None and odd_prefix in obj_id:
                obj = '<' + odd_base + obj_id.replace(odd_prefix, '') + '> '
            else:
                obj = '<' + obj_url + obj_id + '> '

            f.write(subj + pred + obj + '.' + '\n')

In [15]:
#TODO rework this to vectorise it
# def convert_df_nt (df, output_file, subj_url, subj_col, obj_url, obj_col, pred_col, odd_url=0):
#     """
#     Input:
#         DF: some rows and columns of a dataframe
#         STR: name for the output file, include filetype .nt
#         STR: subj_url is the url to be used for all subjects
#         STR: subj_col is the column from which to get the id for the subj
#         STR: obj_url is the url to be used for all objects
#         STR: obj_col is the column from which to get the id for the obj
#         STR: OPTIONAL odd_url is for when a subset obj/subj of require a different url 
#     Output:
#         NT file
#     """
#     f = open(output_file,'w')
#     for index, row in df.iterrows():
#         subj = '<' + subj_url + row[subj_col] + '> '
#         pred = '<' + 'http://ian.ie/' + row[pred_col] + '> '
#         if 'OMIM' in row[obj_col]:
#             row[obj_col] = row[obj_col].replace('OMIM:', '')
#             obj = '<' + odd_url + row[obj_col] + '> '
#         else:
#             obj = '<' + obj_url + row[obj_col] + '> '
#         f.write(subj + pred + obj + '.' + '\n')
#     f.close()

## Download Databases

In [16]:
# # subprocess.call('pip3 install wget', shell=True)
# subprocess.call('wget http://ctdbase.org/reports/CTD_chem_gene_ixns.csv.gz', shell=True)
# subprocess.call('wget http://ctdbase.org/reports/CTD_chemicals_diseases.csv.gz', shell = True)
# subprocess.call('wget http://ctdbase.org/reports/CTD_chem_pathways_enriched.csv.gz', shell = True)
# subprocess.call('wget http://ctdbase.org/reports/CTD_genes_diseases.csv.gz', shell = True)
# subprocess.call('wget http://ctdbase.org/reports/CTD_genes_pathways.csv.gz', shell = True)
# subprocess.call('wget http://ctdbase.org/reports/CTD_diseases_pathways.csv.gz', shell = True)
# subprocess.call('wget http://ctdbase.org/reports/CTD_pheno_term_ixns.csv.gz', shell = True)

In [17]:
# # Move all the csvs to a subfolder and unzip them
# subprocess.call('mkdir csvs', shell=True)
# subprocess.call('mv *.gz csvs/', shell=True)
# subprocess.call('gunzip csvs/*.gz', shell=True)

In [18]:
# too ambitious?? 
# TODO attempt to make one func to import and process all ctd databases
# def ctd_to_rdf(csv, output_file, subj_url, subj_col, obj_url, obj_col, pred_col):
#     """
#     """
#     df = pd.read_csv(csv, skiprows=27 )
#     df = df.drop(0)
#     convert_df_nt(df, output_file, subj_url, subj_col, obj_url, obj_col, pred_col)

## CHEM-GENE 

In [19]:
# Log progress: append a section marker to log.txt ('>>' keeps earlier entries)
subprocess.call('echo "Begin Chem-Gene" >> log.txt', shell=True)

0

In [None]:
# Read in CTD sample, skipping the intro rows
# NOTE: nrows=1000 loads only a sample; remove it for a full conversion run
df_cg = pd.read_csv('csvs/CTD_chem_gene_ixns.csv', skiprows=27, nrows=1000)
# Drop the row labelled 0 -- presumably the '#' separator row that follows the
# header in CTD exports (TODO confirm against the raw file)
df_cg = df_cg.drop(0)
df_cg = df_cg.rename(columns={'# ChemicalName': 'ChemicalName'}) # rename of a column

In [13]:
# Sanity check: (rows, columns) of the loaded chem-gene frame
print(df_cg.shape)

(1678128, 11)


In [None]:
# Split the interactionActions into separate predicates RUN THIS ONLY ONCE
# (a second run would operate on the already-expanded frame)
# 1. split the '|'-delimited string, spread the parts over columns, then stack
#    them into a single series indexed by (original row label, part number)
s = df_cg['InteractionActions'].str.split('|').apply(pd.Series, 1).stack()
# 2. drop the part-number level so the index lines up with df_cg's row labels
s.index = s.index.droplevel(-1)
s.name = 'InteractionActions'
# 3. index join: each df_cg row is repeated once per action; the new column is
#    named 0 (it is renamed to 'Predicate' in a later cell)
df_cg = df_cg.join(s.apply(lambda x: pd.Series(x.split('|'))))

In [None]:
# Preview the first rows after the split/join
df_cg.head()

In [None]:
# Make the new column prettier
df_cg = df_cg.rename(columns={0: 'Predicate'})
# regex=False: '^' must be replaced as a literal caret, not interpreted as the
# regex start-of-string anchor (which would prepend '_' instead of replacing)
df_cg['Predicate'] = df_cg.Predicate.str.replace('^', '_', regex=False)
df_cg['Predicate'] = df_cg.Predicate.str.replace(' ', '_', regex=False)

In [None]:
# GeneID must pass through int before becoming a string, otherwise the float
# representation would end up in the URL
df_cg['GeneID'] = df_cg['GeneID'].astype(int).astype(str)

# Specify type to optimise
df_cg['ChemicalID'] = df_cg['ChemicalID'].astype(str)

In [None]:
# Map the clearest chem-gene interaction labels onto Relation Ontology terms.
# Deliberately partial: proof-of-concept stage and the ontology may change.
#   RO_0002213 = positively regulates
#   RO_0002212 = negatively regulates
# Full IRI form: http://purl.obolibrary.org/obo/RO_0002213
map_to_ro = dict(
    increases_expression='RO_0002213',
    decreases_activity='RO_0002212',
    decreases_expression='RO_0002212',
    increases_activity='RO_0002213',
    increases_stability='RO_0002213',
    decreases_stability='RO_0002212',
    increases_abundance='RO_0002213',
    decreases_abundance='RO_0002212',
    increases_degradation='RO_0002212',
    decreases_degradation='RO_0002213',
)

# Labels without a mapping come out of .map() as NaN; astype(str) turns those
# into the string 'nan'
df_cg['pred_code'] = df_cg['Predicate'].map(map_to_ro).astype(str)

In [None]:
# Frequency of each predicate label
print(df_cg.Predicate.value_counts())

In [None]:
# Row count after expansion into one row per interaction action
len(df_cg)
# len(df_cg[df_cg.OrganismID == 9606.0])

In [None]:
# This cell makes the association file for opa2vec, limiting to humans and limiting to positive regulation
# of a gene


In [None]:
# Eyeball rows at positions 10-29
df_cg[10:30]

In [None]:
# (rows, columns) of the chem-gene frame just before export
print(df_cg.shape)

In [None]:
# Export map of Chems:Genes
# NOTE(review): dict(zip(...)) keeps a single GeneID per ChemicalID (the last
# one seen); chemicals with several genes lose the others -- confirm that is
# what the consumer of the pickle expects
chem_gene_map = dict(zip(df_cg.ChemicalID, df_cg.GeneID))

# Written to CHEM_GENE_map.pkl in the working directory
with open('CHEM_GENE_map'+ '.pkl', 'wb') as f:
        pickle.dump(chem_gene_map, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# Chemical -> gene triples: write output_cg.nt
subj_url = 'http://identifiers.org/ctd.chemical/'
subj_col = 'ChemicalID'
obj_url = 'http://identifiers.org/ctd.gene/'
obj_col = 'GeneID'
pred_col = 'Predicate'

convert_df_nt(
    df=df_cg,
    output_file='output_cg.nt',
    subj_url=subj_url,
    subj_col=subj_col,
    obj_url=obj_url,
    obj_col=obj_col,
    pred_col=pred_col,
)

## Chem-Disease

In [None]:
# Log progress: append a section marker to log.txt ('>>' keeps earlier entries)
subprocess.call('echo "Begin Chem-Dis" >> log.txt', shell=True)

In [3]:
# Load the chem-disease table, skipping the intro rows. Only the three needed
# columns are read, each as a categorical.
cols = ['ChemicalID', 'DiseaseID', 'DirectEvidence']
col_types = dict.fromkeys(cols, 'category')
df_cd = (
    pd.read_csv('csvs/CTD_chemicals_diseases.csv', skiprows=27, usecols=cols, dtype=col_types)
    .drop(0)
    .dropna(subset=['DirectEvidence'])  # drop if it doesn't have direct evidence
)

In [4]:
# Preview the directly-evidenced chem-disease rows
df_cd.head()

Unnamed: 0,ChemicalID,DiseaseID,DirectEvidence
1,C046983,MESH:D054198,therapeutic
71,C112297,MESH:D006948,marker/mechanism
86,C112297,MESH:D012640,marker/mechanism
135,C039775,MESH:D004827,therapeutic
189,C425777,MESH:D006948,marker/mechanism


In [16]:
# Collect every distinct chemical id and disease id (chemicals first) and dump
# them one per line to a text file for use elsewhere
allChems = [*df_cd.ChemicalID.unique()]
allDis = [*df_cd.DiseaseID.unique()]
all_chems_all_dis = np.asarray([*allChems, *allDis])
np.savetxt(r'all-ctd_chems_dis.txt', all_chems_all_dis, fmt='%s')

In [None]:
# Chem-dis file with only positive, directly-evidenced associations (for use in opa-nn):
# keep the 'marker/mechanism' rows and just the two id columns
df1 = df_cd.loc[df_cd.DirectEvidence == 'marker/mechanism', ['ChemicalID', 'DiseaseID']]
df1.to_csv('chem-dis-pos-assocs.csv', index=False)

In [None]:
# Process DiseaseID so as to be usable in url
# Only the 'MESH:' prefix is stripped here; an 'OMIM:' prefix is left in place
# and handled by the odd_url argument in the convert_df_nt call further down
df_cd['DiseaseID'] = df_cd['DiseaseID'].str.replace('MESH:', '')

#Specify type to optimise
df_cd['ChemicalID'] = df_cd.ChemicalID.astype(str)
# df_cd['InferenceGeneSymbol'] = df_cd.InferenceGeneSymbol.astype(str)

In [None]:
# Create Predicate Column
def cd_predicate(r):
    """
    Return the hand-made predicate label for a chem-disease row.

    r is accepted for compatibility with DataFrame.apply but is not used:
    every row gets the same label.
    """
    return 'associated_directly_with'

# df_cd['DirectEvidence'] = df_cd.DirectEvidence.astype(str)
# The label is constant, so broadcast it instead of calling the function once
# per row; this is far cheaper on the full table and also works on an empty frame
df_cd['Predicate'] = cd_predicate(None)

In [None]:
# #Specify type to optimise
# df_cd['ChemicalID'] = df_cd.ChemicalID.astype(str)
# df_cd['DiseaseID'] = df_cd.DiseaseID.astype(str)

# List the distinct DirectEvidence labels present in the data
df_cd.DirectEvidence.unique()


In [None]:
# Painstakingly split the predicates into the available RO/GO predicates... will do only a
# partial job as am in proof of concept stage and Ontology may change [ only clearest effects]
# RO_0002213 = positively regulates
# RO_0002212 = negatively regulates 
# RO_0003308 = correlated with condition
# http://purl.obolibrary.org/obo/RO_0002213

# DirectEvidence label -> RO term (this rebinds the global map_to_ro)
map_to_ro = {
    'therapeutic': 'RO_0002212',
    'marker/mechanism': 'RO_0003308' 
}

# Apply map to create ro_predicate value for applicable predicates
# Labels missing from the map become NaN and then, via astype(str), the string 'nan'
df_cd['pred_code'] = df_cd.DirectEvidence.map(map_to_ro).astype(str)

In [None]:
# # Exporting diseaseID and Disease Name for mapping DisGeNet UMLS codes to MESH
# df_cd_mesh = df_cd.loc[:, ('DiseaseName', 'DiseaseID')]
# df_cd_mesh.to_csv('CTDdis-names.csv', index=False)

In [None]:
# Chemical -> disease triples: write output_cd.nt
subj_url = 'http://identifiers.org/ctd.chemical/'
subj_col = 'ChemicalID'
obj_url = 'http://identifiers.org/mesh/'
obj_url_2 = 'http://identifiers.org/omim/'  # to use when CTD gives an omim disease id
obj_col = 'DiseaseID'
pred_col = 'Predicate'

convert_df_nt(
    df=df_cd,
    output_file='output_cd.nt',
    subj_url=subj_url,
    subj_col=subj_col,
    obj_url=obj_url,
    obj_col=obj_col,
    pred_col=pred_col,
    odd_url=['OMIM:', obj_url_2],
)

## Gene Disease

In [None]:
# Log progress: append a section marker to log.txt ('>>' keeps earlier entries)
subprocess.call('echo "Begin Gene-Dis" >> log.txt', shell=True)

In [None]:
# Load the gene-disease table, skipping the intro rows. Only the three needed
# columns are read, each as a categorical.
# TODO probably faster to pre-process out rows without direct evidence (e.g. with awk)
# NOTE(review): such a filter must write to a *different* file; redirecting the
# output onto the input file would truncate it before it is read
cols = ['GeneID', 'DiseaseID', 'DirectEvidence']
col_types = dict.fromkeys(cols, 'category')
df_gd = (
    pd.read_csv('csvs/CTD_genes_diseases.csv', skiprows=27, usecols=cols, dtype=col_types)
    .drop(0)
    .dropna(subset=['DirectEvidence'])  # drop if it doesn't have direct evidence
)

In [None]:
# # Export a list of all diseases that have genes
# ctd_diseases = df_gd.DiseaseID.unique()
# np.savetxt(r'all-diseases-w-genes-ctd.txt', ctd_diseases, fmt='%s')

In [None]:
#Specify type to optimise
# Both id columns were loaded as categoricals; convert them to plain strings
df_gd['GeneID'] = df_gd.GeneID.astype(str)
df_gd['DiseaseID'] = df_gd.DiseaseID.astype(str)

In [None]:
# Export map of Diseases:Genes
# Both columns must come from the gene-disease frame; pairing DiseaseID with the
# chem-gene frame's GeneID would zip unrelated rows together
# NOTE(review): dict(zip(...)) keeps a single GeneID per DiseaseID (the last one seen)
dis_gene_map = dict(zip(df_gd.DiseaseID, df_gd.GeneID))

with open('DIS_GENE_map'+ '.pkl', 'wb') as f:
    pickle.dump(dis_gene_map, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# Gene-dis file with only positive, directly-evidenced associations (for use in opa-nn):
# keep the 'marker/mechanism' rows and just the two id columns
df1 = df_gd.loc[df_gd.DirectEvidence == 'marker/mechanism', ['GeneID', 'DiseaseID']]
df1.to_csv('gene-dis-pos-assocs.csv', index=False)

In [None]:
# Preview the directly-evidenced gene-disease rows
df_gd.head()

In [None]:
# (rows, columns) of the 'marker/mechanism' subset
df_gd[df_gd.DirectEvidence == 'marker/mechanism'].shape

In [None]:
# Preview again before the id clean-up below
df_gd.head()

In [None]:
# Must make some quick refinements to ensure resulting URLs work
# GeneID goes float -> int -> str: missing ids become '0' (fillna) and the
# integer step keeps a float representation out of the URL
df_gd['GeneID'] = df_gd.GeneID.astype(float).fillna(0).astype(int).astype(str)
df_gd['DirectEvidence'] = df_gd.DirectEvidence.astype(str)
# Only 'MESH:' is stripped; 'OMIM:' ids are handled via odd_url at conversion time
df_gd['DiseaseID'] = df_gd['DiseaseID'].str.replace('MESH:', '')

# Create Predicate Column
def gd_predicate(r):
    """
    Return the hand-made predicate label for a gene-disease row.

    r is accepted for compatibility with DataFrame.apply but is not used:
    every row gets the same label.
    """
    return 'associated_directly_with'

# The label is constant, so broadcast it instead of calling the function once
# per row; this is far cheaper on the full table and also works on an empty frame
df_gd['Predicate'] = gd_predicate(None)

In [None]:
# Only the last expression of a notebook cell is displayed, so the head() result
# is discarded and the distinct DirectEvidence labels are what is shown
df_gd.head()
df_gd.DirectEvidence.unique()

In [None]:
# Painstakingly split the predicates into the available RO/GO predicates... will do only a
# partial job as am in proof of concept stage and Ontology may change [ only clearest effects]
# RO_0002213 = positively regulates
# RO_0002212 = negatively regulates 
# RO_0003308 = correlated with condition
# http://purl.obolibrary.org/obo/RO_0002213

# DirectEvidence label -> RO term (this rebinds the global map_to_ro)
map_to_ro = {
    'therapeutic': 'RO_0002212',
    'marker/mechanism': 'RO_0003308' 
}

# Apply map to create ro_predicate value for applicable predicates
# Labels missing from the map become NaN and then, via astype(str), the string 'nan'
df_gd['pred_code'] = df_gd.DirectEvidence.map(map_to_ro).astype(str)

In [None]:
# Gene -> disease triples: write output_gd.nt
subj_url = 'http://identifiers.org/ctd.gene/'
subj_col = 'GeneID'
obj_url = 'http://identifiers.org/mesh/'
obj_url_2 = 'http://identifiers.org/omim/'  # to use when CTD gives an omim disease id
obj_col = 'DiseaseID'
pred_col = 'Predicate'

convert_df_nt(
    df=df_gd,
    output_file='output_gd.nt',
    subj_url=subj_url,
    subj_col=subj_col,
    obj_url=obj_url,
    obj_col=obj_col,
    pred_col=pred_col,
    odd_url=['OMIM:', obj_url_2],
)

## Gene Pathway

In [None]:
# Log progress: append a section marker to log.txt ('>>' keeps earlier entries)
subprocess.call('echo "Begin Gene-Path" >> log.txt', shell=True)

In [2]:
# Read in CTD sample, skipping the intro rows
# NOTE: nrows=1000 loads only a sample; remove it for a full conversion run
df_gp = pd.read_csv('csvs/CTD_genes_pathways.csv', skiprows=27, nrows=1000 )
# Drop the row labelled 0 -- presumably the '#' separator row that follows the
# header in CTD exports (TODO confirm against the raw file)
df_gp = df_gp.drop(0)

In [9]:
# Random sample of 14 rows for a visual check
df_gp.sample(14)

Unnamed: 0,# GeneSymbol,GeneID,PathwayName,PathwayID
998,ACOT4,122970.0,Metabolism,REACT:R-HSA-1430728
952,ACO2,50.0,Citric acid cycle (TCA cycle),REACT:R-HSA-71403
573,ABR,29.0,Signaling by Rho GTPases,REACT:R-HSA-194315
430,ABCG5,64240.0,Transmembrane transport of small molecules,REACT:R-HSA-382551
169,ABAT,18.0,Metabolic pathways,KEGG:hsa01100
668,ACACB,32.0,Pyruvate metabolism,KEGG:hsa00620
987,ACOT2,10965.0,Fatty Acyl-CoA Biosynthesis,REACT:R-HSA-75105
242,ABCB10,23456.0,ABC transporters,KEGG:hsa02010
736,ACADSB,36.0,Fatty acid degradation,KEGG:hsa00071
23,A2M,2.0,"Platelet activation, signaling and aggregation",REACT:R-HSA-76002


In [None]:
# Must make some quick refinements to ensure resulting URLs work
# Missing GeneIDs become 0; the int cast keeps a float representation out of the URL
df_gp['GeneID'] = df_gp['GeneID'].fillna(0).astype(int)
# Only 'REACT:' is stripped; 'KEGG:' ids are handled via odd_url at conversion time
df_gp['PathwayID'] = df_gp['PathwayID'].str.replace('REACT:', '')

# Create Predicate Column
def gp_predicate(r):
    """
    Return the RO predicate code used for every gene-pathway row.

    r is accepted for compatibility with DataFrame.apply but is not used.
    """
    return 'RO_0002213'

# The code is constant, so broadcast it instead of calling the function once
# per row; this is far cheaper on the full table and also works on an empty frame
df_gp['pred_code'] = gp_predicate(None)

In [None]:
#Specify type to optimise
# convert_df_nt builds URLs by string concatenation, so all three columns must be str
df_gp['GeneID'] = df_gp.GeneID.astype(str)
df_gp['PathwayID'] = df_gp.PathwayID.astype(str)
df_gp['pred_code'] = df_gp.pred_code.astype(str)

In [None]:
# Eyeball rows at positions 80-99 after the clean-up
df_gp.iloc[80:100]

In [None]:
# # Painstakingly split the predicates into the available RO/GO predicates... will do only a
# # partial job as am in proof of concept stage and Ontology may change [ only clearest effects]
# # RO_0002213 = positively regulates
# # RO_0002212 = negatively regulates 
# # RO_0003308 = correlated with condition
# # http://purl.obolibrary.org/obo/RO_0002213

# map_to_ro = {
#     'therapeutic': 'RO_0002212',
#     'marker/mechanism': 'RO_0003308' 
# }

# # Apply map to create ro_predicate value for applicable predicates
# df_gd['pred_code'] = df_gd.DirectEvidence.map(map_to_ro).astype(str)

In [None]:
# Gene -> pathway triples: write output_gp.nt
# Reactome is the default object namespace; ids carrying 'KEGG:' go to kegg.pathway
subj_url = 'http://identifiers.org/ctd.gene/'
subj_col = 'GeneID'
obj_url = 'http://identifiers.org/reactome/'
obj_col = 'PathwayID'
pred_col = 'pred_code'
obj_url_2 = 'http://identifiers.org/kegg.pathway/'

convert_df_nt(
    df=df_gp,
    output_file='output_gp.nt',
    subj_url=subj_url,
    subj_col=subj_col,
    obj_url=obj_url,
    obj_col=obj_col,
    pred_col=pred_col,
    odd_url=['KEGG:', obj_url_2],
)

### Chem Pathway Enriched

In [7]:
# Load a 1000-row sample of the chem-pathway enrichment table for inspection
# NOTE(review): unlike the other cells this reads the '.gz' file directly
# (pandas infers the compression from the extension) -- confirm the gzipped
# file still exists if the csvs were unzipped in place
df_cpath = pd.read_csv('csvs/CTD_chem_pathways_enriched.csv.gz', skiprows=27, nrows=1000)
df_cpath = df_cpath.drop(0)

In [8]:
# Preview the chem-pathway sample
df_cpath.head()

Unnamed: 0,# ChemicalName,ChemicalID,CasRN,PathwayName,PathwayID,PValue,CorrectedPValue,TargetMatchQty,TargetTotalQty,BackgroundMatchQty,BackgroundTotalQty
0,#,,,,,,,,,,
1,10074-G5,C534883,,"Cell Cycle, Mitotic",REACT:R-HSA-69278,0.000142,0.00924,2.0,2.0,514.0,43067.0
2,10074-G5,C534883,,Cyclin A:Cdk2-associated events at S phase entry,REACT:R-HSA-69656,3e-06,0.000174,2.0,2.0,71.0,43067.0
3,10074-G5,C534883,,Cyclin E associated events during G1/S transition,REACT:R-HSA-69202,3e-06,0.000179,2.0,2.0,72.0,43067.0
4,10074-G5,C534883,,G1/S Transition,REACT:R-HSA-69206,8e-06,0.000492,2.0,2.0,119.0,43067.0


## Disease Pathway
Commenting out as is inferred

In [None]:
# # Log progress
# subprocess.call('echo "Begin Dis-Path" >> log.txt', shell=True)

In [4]:
# Read in CTD sample, skipping the intro rows
# Loaded for preview only (nrows=1000); the conversion cells for this table are commented out
df_dp = pd.read_csv('csvs/CTD_diseases_pathways.csv', skiprows=27, nrows=1000 )
df_dp = df_dp.drop(0)

In [5]:
# Random sample of 13 rows for a visual check
df_dp.sample(13
            )

Unnamed: 0,# DiseaseName,DiseaseID,PathwayName,PathwayID,InferenceGeneSymbol
260,3-methylcrotonyl CoA carboxylase 1 deficiency,MESH:C535308,Metabolism of water-soluble vitamins and cofac...,REACT:R-HSA-196849,MCCC1
937,"Abnormalities, Multiple",MESH:D000015,Downstream signaling events of B Cell Receptor...,REACT:R-HSA-1168372,PTEN
925,"Abnormalities, Multiple",MESH:D000015,DNA Damage Bypass,REACT:R-HSA-73893,POLD1
648,Aberrant Crypt Foci,MESH:D058739,Signaling by WNT in cancer,REACT:R-HSA-4791275,CTNNB1
467,ABCD syndrome,MESH:C535334,Gastrin-CREB signalling pathway via PKC and MAPK,REACT:R-HSA-881907,EDNRB
952,"Abnormalities, Multiple",MESH:D000015,Endometrial cancer,KEGG:hsa05213,CTNNB1
548,Aberrant Crypt Foci,MESH:D058739,AXIN missense mutants destabilize the destruct...,REACT:R-HSA-5467340,APC
107,22q11 Deletion Syndrome,MESH:D058165,SHC1 events in EGFR signaling,REACT:R-HSA-180336,FGF8
791,"Abnormalities, Drug-Induced",MESH:D000014,Signalling by NGF,REACT:R-HSA-166520,RHOA
766,"Abnormalities, Drug-Induced",MESH:D000014,Rap1 signaling pathway,KEGG:hsa04015,RHOA


In [None]:
# #Specify type to optimise
# df_dp['DiseaseID'] = df_dp.DiseaseID.astype(str)
# df_dp['PathwayID'] = df_dp.PathwayID.astype(str)
# df_dp['InferenceGeneSymbol'] = df_dp.InferenceGeneSymbol.astype(str)

In [None]:
# # Must make some quick refinements to ensure resulting URLs work
# df_dp['PathwayID'] = df_dp['PathwayID'].str.replace('REACT:', '')
# df_dp['DiseaseID'] = df_dp['DiseaseID'].str.replace('MESH:', '')


# # Create Predicate Column
# def dp_predicate(r):
#     return 'associated_directly_with'
    
# df_dp['Predicate'] = df_dp.apply(dp_predicate, axis=1)
# df_dp['Predicate'] = df_dp.Predicate.astype(str)

In [None]:
# df_dp.head()

In [None]:
# subj_url = 'http://identifiers.org/mesh/' 
# subj_col = 'DiseaseID'
# obj_url = 'http://identifiers.org/reactome/' 
# obj_col = 'PathwayID'
# pred_col = 'Predicate'

# convert_df_nt(df_dp, 'output_dp.nt', subj_url, subj_col, obj_url, obj_col, pred_col)

In [None]:
# df_dp

## Phenotype Chemical
I'm going to comment this section out as this is the Y data and shouldn't be in KG ( I think )

In [None]:
# Read in CTD sample, skipping the intro rows
# df_pc = pd.read_csv('csvs/CTD_pheno_term_ixns.csv', skiprows=27  )
# df_pc = df_pc.drop(0)
# df_pc[10:20]

In [None]:
# Split the interactionActions into separate predicates RUN THIS ONLY ONCE
# s = df_pc['interactionactions'].str.split('|').apply(pd.Series, 1).stack()
# s.index = s.index.droplevel(-1)
# s.name = 'interactionactions'
# df_pc = df_pc.join(s.apply(lambda x: pd.Series(x.split('|'))))
# df_pc = df_pc.rename(columns={0: 'Predicate'})
# df_pc['Predicate'] = df_pc.Predicate.str.replace('^', '_')
# df_pc['Predicate'] = df_pc.Predicate.str.replace(' ', '_')

In [None]:
# subj_url = 'http://identifiers.org/ctd.chemical/'  
# subj_col = 'chemicalid'
# obj_url = 'http://identifiers.org/go/' 
# obj_col = 'phenotypeid'
# pred_col = 'Predicate'

# convert_df_nt(df_pc, 'output_pc.nt', subj_url, subj_col, obj_url, obj_col, pred_col)

## Merge NT files

In [None]:
# Log progress: append a section marker to log.txt ('>>' keeps earlier entries)
# import subprocess
subprocess.call('echo "Begin Merging" >> log.txt', shell=True)

In [None]:
# Concatenate every .nt file in the working directory into master.nt.
# master.nt itself is excluded from the inputs: with a plain `cat *.nt > master.nt`
# a re-run would feed the previous master.nt back into its own output.
import glob
import shutil

nt_parts = sorted(path for path in glob.glob('*.nt') if path != 'master.nt')
with open('master.nt', 'wb') as merged:
    for nt_path in nt_parts:
        with open(nt_path, 'rb') as part:
            shutil.copyfileobj(part, merged)

In [None]:
# Log progress: append a section marker to log.txt ('>>' keeps earlier entries)
subprocess.call('echo "Finished Merging" >> log.txt', shell=True)

Note that Jena doesn't accept a percent sign unless it's followed by two hexadecimal digits, so you can run the following to replace each % sign with the word "percent"

In [None]:
# subprocess.call("sed -i '/%/ s//percent/g' master.nt", shell=True)

In [None]:
# # You'll need to install Apache Jena for this
# subprocess.call('riot --output=RDFXML master.nt > master.rdf', shell=True)