#### Create NT files from CTD csvs
<b>Author</b>: Ian Coleman <br/>
<b>Function</b>: Takes local CTD csvs and turns each into a .nt file

<b>Notes:</b>
To run this for visualising and investigating the data you'll need to ensure that each pd.read_csv call takes the argument `nrows=10000` or some other small number <br>
To run it for the intended purpose of converting the DBs to RDF make sure that the nrows argument is not being passed anywhere

In [1]:
# %reset

In [2]:
import pandas as pd
import numpy as np
import scipy as sp
import subprocess
import math

In [3]:
# Begin log file (progress etc. so the run can be checked over ssh).
# '>' overwrites log.txt with a "Date: " line; the backticks make the shell
# substitute the output of the `date` command.
subprocess.call('echo "Date: " `date` > log.txt', shell=True)

0

## Functions

In [29]:
# def convert_df_nt (df, output_file, subj_url, subj_col, obj_url, obj_col, pred_col, odd_url=0):
#     """
#     Input:
#         DF: some rows and columns of a dataframe
#         STR: name for the output file, include filetype .nt
#         STR: subj_url is the url to be used for all subjects
#         STR: subj_col is the column from which to get the id for the subj
#         STR: obj_url is the url to be used for all objects
#         STR: obj_col is the column from which to get the id for the obj
#         STR: OPTIONAL odd_url is for when a subset obj/subj of require a different url 
#     Output:
#         NT file
#     """
#     f = open(output_file,'w')
#     for index, row in df.iterrows():
#         subj = '<' + subj_url + row[subj_col] + '> '
#         pred = '<' + 'http://ian.ie/' + row[pred_col] + '> '
#         if 'OMIM' in row[obj_col]:
#             row[obj_col] = row[obj_col].replace('OMIM:', '')
#             obj = '<' + odd_url + row[obj_col] + '> '
#         else:
#             obj = '<' + obj_url + row[obj_col] + '> '
#         f.write(subj + pred + obj + '.' + '\n')
#     f.close()

In [30]:
# Version 2 of this function. It aims to include predicate code if provided, preferentially
# to handmade URI. ALSO now accounts for kegg in the pathway columns

def convert_df_nt(df, output_file, subj_url, subj_col, obj_url, obj_col, pred_col, odd_url=None):
    """
    Write one N-Triples line per dataframe row to ``output_file``.

    Each row becomes ``<subject> <predicate> <object> .``. The predicate is
    built from the row's ``pred_code`` value (under
    http://purl.obolibrary.org/obo/) when the dataframe has a ``pred_code``
    column holding a usable code for that row; otherwise it falls back to a
    handmade http://ian.ie/ URI built from ``pred_col``.

    Input:
        DF: df, some rows and columns of a dataframe. The id and predicate
            cells are concatenated into URLs, so they must already be str.
        STR: output_file, name for the output file, include filetype .nt
        STR: subj_url is the url to be used for all subjects
        STR: subj_col is the column from which to get the id for the subj
        STR: obj_url is the url to be used for all objects
        STR: obj_col is the column from which to get the id for the obj
        STR: pred_col is the column holding the handmade predicate name
        List: OPTIONAL odd_url is for when a subset of the objects require a
            different url. It should be a list of [substr to remove, URL].
            When omitted, every object uses obj_url.
    Output:
        NT file (an existing file of the same name is overwritten)
    """
    # Only read predicate codes when the column exists. The original computed
    # this flag but then accessed row.pred_code unconditionally, which raised
    # AttributeError for frames without a pred_code column.
    has_pred_code = 'pred_code' in df.columns
    odd_marker = odd_url[0] if odd_url else None

    # 'with' guarantees the file is closed even if a row fails part-way through
    with open(output_file, 'w') as f:
        for _, row in df.iterrows():
            subj = '<' + subj_url + row[subj_col] + '> '

            # A code is usable only if it is a real string; unmapped values
            # arrive either as NaN or, after astype(str), as the text 'nan'.
            code = row['pred_code'] if has_pred_code else None
            if isinstance(code, str) and code not in ('', 'nan'):
                pred = '<' + 'http://purl.obolibrary.org/obo/' + code + '> '
            else:
                pred = '<' + 'http://ian.ie/' + row[pred_col] + '> '

            # Work on a local copy of the id instead of writing into the row
            obj_id = row[obj_col]
            if odd_marker is not None and odd_marker in obj_id:
                obj = '<' + odd_url[1] + obj_id.replace(odd_marker, '') + '> '
            else:
                obj = '<' + obj_url + obj_id + '> '

            f.write(subj + pred + obj + '.' + '\n')

In [31]:
#TODO rework this to vectorise it
# def convert_df_nt (df, output_file, subj_url, subj_col, obj_url, obj_col, pred_col, odd_url=0):
#     """
#     Input:
#         DF: some rows and columns of a dataframe
#         STR: name for the output file, include filetype .nt
#         STR: subj_url is the url to be used for all subjects
#         STR: subj_col is the column from which to get the id for the subj
#         STR: obj_url is the url to be used for all objects
#         STR: obj_col is the column from which to get the id for the obj
#         STR: OPTIONAL odd_url is for when a subset obj/subj of require a different url 
#     Output:
#         NT file
#     """
#     f = open(output_file,'w')
#     for index, row in df.iterrows():
#         subj = '<' + subj_url + row[subj_col] + '> '
#         pred = '<' + 'http://ian.ie/' + row[pred_col] + '> '
#         if 'OMIM' in row[obj_col]:
#             row[obj_col] = row[obj_col].replace('OMIM:', '')
#             obj = '<' + odd_url + row[obj_col] + '> '
#         else:
#             obj = '<' + obj_url + row[obj_col] + '> '
#         f.write(subj + pred + obj + '.' + '\n')
#     f.close()

## Download Databases

In [32]:
# # subprocess.call('pip3 install wget', shell=True)
# subprocess.call('wget http://ctdbase.org/reports/CTD_chem_gene_ixns.csv.gz', shell=True)
# subprocess.call('wget http://ctdbase.org/reports/CTD_chemicals_diseases.csv.gz', shell = True)
# subprocess.call('wget http://ctdbase.org/reports/CTD_chem_pathways_enriched.csv.gz', shell = True)
# subprocess.call('wget http://ctdbase.org/reports/CTD_genes_diseases.csv.gz', shell = True)
# subprocess.call('wget http://ctdbase.org/reports/CTD_genes_pathways.csv.gz', shell = True)
# subprocess.call('wget http://ctdbase.org/reports/CTD_diseases_pathways.csv.gz', shell = True)
# subprocess.call('wget http://ctdbase.org/reports/CTD_pheno_term_ixns.csv.gz', shell = True)

In [33]:
# # Move all the csvs to a subfolder and unzip them
# subprocess.call('mkdir csvs', shell=True)
# subprocess.call('mv *.gz csvs/', shell=True)
# subprocess.call('gunzip csvs/*.gz', shell=True)

In [34]:
# too ambitious?? 
# TODO attempt to make one func to import and process all ctd databases
# def ctd_to_rdf(csv, output_file, subj_url, subj_col, obj_url, obj_col, pred_col):
#     """
#     """
#     df = pd.read_csv(csv, skiprows=27 )
#     df = df.drop(0)
#     convert_df_nt(df, output_file, subj_url, subj_col, obj_url, obj_col, pred_col)

## CHEM-GENE 

In [35]:
# Log progress: append a "Begin Chem-Gene" marker line to log.txt
subprocess.call('echo "Begin Chem-Gene" >> log.txt', shell=True)

0

In [2]:
# Load a 10000-row sample of the chem-gene CSV: skip the 27 intro lines, drop
# the row labelled 0 and strip the '# ' prefix from the first column's name
df_cg = (
    pd.read_csv('csvs/CTD_chem_gene_ixns.csv', skiprows=27, nrows=10000)
    .drop(0)
    .rename(columns={'# ChemicalName': 'ChemicalName'})
)

In [3]:
# Split the interactionActions into separate predicates RUN THIS ONLY ONCE
# Split each '|'-separated InteractionActions string, expand the pieces into
# columns and stack them into one long Series (one entry per action).
s = df_cg['InteractionActions'].str.split('|').apply(pd.Series, 1).stack()
# Drop the innermost index level added by stack() so each entry is labelled
# with the index of the row it came from.
s.index = s.index.droplevel(-1)
s.name = 'InteractionActions'
# Join the actions back onto df_cg by index, so a row with several actions is
# repeated once per action. The lambda wraps each (already single) action in a
# one-element Series, which is why the joined column is labelled 0.
df_cg = df_cg.join(s.apply(lambda x: pd.Series(x.split('|'))))

In [4]:
# Make the new column prettier: name it 'Predicate' and turn '^' and spaces
# into underscores. regex=False forces literal replacement: '^' is a regex
# metacharacter (start-of-string anchor) and the default value of the regex
# flag differs between pandas versions.
df_cg = df_cg.rename(columns={0: 'Predicate'})
df_cg['Predicate'] = (
    df_cg['Predicate']
    .str.replace('^', '_', regex=False)
    .str.replace(' ', '_', regex=False)
)

In [5]:
# GeneID: go through int first so the float suffix never reaches the URL,
# then store as str
df_cg['GeneID'] = df_cg['GeneID'].astype(int).astype(str)

# ChemicalID is stored as str as well
df_cg['ChemicalID'] = df_cg['ChemicalID'].astype(str)

In [6]:
df_cg.head()

Unnamed: 0,ChemicalName,ChemicalID,CasRN,GeneSymbol,GeneID,GeneForms,Organism,OrganismID,Interaction,InteractionActions,PubMedIDs,Predicate
1,10074-G5,C534883,,MAX,4149,protein,,,10074-G5 affects the folding of and results in...,affects^binding|affects^folding|decreases^acti...,26474287,affects_binding
1,10074-G5,C534883,,MAX,4149,protein,,,10074-G5 affects the folding of and results in...,affects^binding|affects^folding|decreases^acti...,26474287,affects_folding
1,10074-G5,C534883,,MAX,4149,protein,,,10074-G5 affects the folding of and results in...,affects^binding|affects^folding|decreases^acti...,26474287,decreases_activity
2,10074-G5,C534883,,MAX,4149,protein,,,10074-G5 inhibits the reaction [MYC protein bi...,affects^binding|decreases^reaction,26474287,affects_binding
2,10074-G5,C534883,,MAX,4149,protein,,,10074-G5 inhibits the reaction [MYC protein bi...,affects^binding|decreases^reaction,26474287,decreases_reaction


In [40]:
# Painstakingly split the predicates into the available RO/GO predicates... will do only a
# partial job as am in proof of concept stage and Ontology may change [ only clearest effects]
# RO_0002213 = positively regulates
# RO_0002212 = negatively regulates 
# http://purl.obolibrary.org/obo/RO_0002213

# Relation Ontology codes, grouped by the direction of the effect
map_to_ro = {
    # RO_0002213 = positively regulates
    **dict.fromkeys(
        (
            'increases_expression',
            'increases_activity',
            'increases_stability',
            'increases_abundance',
            'decreases_degradation',
        ),
        'RO_0002213',
    ),
    # RO_0002212 = negatively regulates
    **dict.fromkeys(
        (
            'decreases_activity',
            'decreases_expression',
            'decreases_stability',
            'decreases_abundance',
            'increases_degradation',
        ),
        'RO_0002212',
    ),
}

# Predicates without an entry map to NaN, which astype(str) stores as 'nan'
df_cg['pred_code'] = df_cg['Predicate'].map(map_to_ro).astype(str)

In [44]:
print(df_cg.Predicate.value_counts())

increases_expression               4442
decreases_expression               3059
decreases_reaction                 1784
increases_reaction                  768
affects_binding                     727
increases_activity                  667
affects_cotreatment                 613
increases_phosphorylation           288
decreases_activity                  284
affects_expression                  271
affects_reaction                    191
increases_cleavage                  105
affects_response_to_substance       102
increases_metabolic_processing       90
increases_chemical_synthesis         87
affects_localization                 84
decreases_phosphorylation            79
increases_abundance                  69
increases_response_to_substance      60
decreases_response_to_substance      60
increases_secretion                  58
affects_metabolic_processing         40
increases_oxidation                  31
increases_degradation                21
affects_activity                     20


In [50]:
len(df_cg)
# len(df_cg[df_cg.OrganismID == 9606.0])

14204

In [None]:
# This cell makes the association file for opa2vec, limiting to humans and limiting to positive regulation
# of a gene


In [48]:
df_cg[10:30]

Unnamed: 0,ChemicalName,ChemicalID,CasRN,GeneSymbol,GeneID,GeneForms,Organism,OrganismID,Interaction,InteractionActions,PubMedIDs,Predicate,pred_code
6,10074-G5,C534883,,MYC,4609,protein,,,10074-G5 affects the folding of and results in...,affects^binding|affects^folding|decreases^acti...,26474287,decreases_activity,RO_0002212
7,10074-G5,C534883,,MYC,4609,protein,,,10074-G5 inhibits the reaction [MYC protein bi...,affects^binding|decreases^reaction,26474287,affects_binding,
7,10074-G5,C534883,,MYC,4609,protein,,,10074-G5 inhibits the reaction [MYC protein bi...,affects^binding|decreases^reaction,26474287,decreases_reaction,
8,"10,10-bis(4-pyridinylmethyl)-9(10H)-anthracenone",C112297,,FOS,2353,protein,Mus musculus,10090.0,"10,10-bis(4-pyridinylmethyl)-9(10H)-anthraceno...",decreases^reaction|increases^expression,26348896,decreases_reaction,
8,"10,10-bis(4-pyridinylmethyl)-9(10H)-anthracenone",C112297,,FOS,2353,protein,Mus musculus,10090.0,"10,10-bis(4-pyridinylmethyl)-9(10H)-anthraceno...",decreases^reaction|increases^expression,26348896,increases_expression,RO_0002213
9,"10,10-bis(4-pyridinylmethyl)-9(10H)-anthracenone",C112297,,KCNQ1,3784,protein,,,"10,10-bis(4-pyridinylmethyl)-9(10H)-anthraceno...",decreases^activity,18568022,decreases_activity,RO_0002212
10,"10,10-bis(4-pyridinylmethyl)-9(10H)-anthracenone",C112297,,KCNQ2,3785,protein,Mus musculus,10090.0,"10,10-bis(4-pyridinylmethyl)-9(10H)-anthraceno...",affects^reaction|increases^activity|increases^...,15634793,affects_reaction,
10,"10,10-bis(4-pyridinylmethyl)-9(10H)-anthracenone",C112297,,KCNQ2,3785,protein,Mus musculus,10090.0,"10,10-bis(4-pyridinylmethyl)-9(10H)-anthraceno...",affects^reaction|increases^activity|increases^...,15634793,increases_activity,RO_0002213
10,"10,10-bis(4-pyridinylmethyl)-9(10H)-anthracenone",C112297,,KCNQ2,3785,protein,Mus musculus,10090.0,"10,10-bis(4-pyridinylmethyl)-9(10H)-anthraceno...",affects^reaction|increases^activity|increases^...,15634793,increases_import,
11,"10,10-bis(4-pyridinylmethyl)-9(10H)-anthracenone",C112297,,KCNQ2,3785,protein,,,"10,10-bis(4-pyridinylmethyl)-9(10H)-anthraceno...",affects^binding|decreases^reaction|increases^t...,20208034,affects_binding,


In [190]:
# Chem-gene triples: CTD chemical -> CTD gene, written to output_cg.nt
subj_url, subj_col = 'http://identifiers.org/ctd.chemical/', 'ChemicalID'
obj_url, obj_col = 'http://identifiers.org/ctd.gene/', 'GeneID'
pred_col = 'Predicate'

convert_df_nt(
    df_cg,
    'output_cg.nt',
    subj_url=subj_url,
    subj_col=subj_col,
    obj_url=obj_url,
    obj_col=obj_col,
    pred_col=pred_col,
)

## Chem-Disease

In [12]:
# Log progress: append a "Begin Chem-Dis" marker line to log.txt
subprocess.call('echo "Begin Chem-Dis" >> log.txt', shell=True)

0

In [3]:
# Load the chemical-disease CSV: skip the 27 intro lines, drop the row
# labelled 0 and keep only the rows that have a DirectEvidence value
df_cd = (
    pd.read_csv('csvs/CTD_chemicals_diseases.csv', skiprows=27)
    .drop(0)
    .dropna(subset=['DirectEvidence'])
)

In [4]:
print('Number of diseases:')
# Distinct diseases among rows whose evidence is marker/mechanism or therapeutic
df_cd.loc[df_cd['DirectEvidence'].isin(['marker/mechanism', 'therapeutic']), 'DiseaseID'].nunique()

Number of diseases:


3191

In [5]:
# ## Chem-dis associations that carry direct evidence, saved for use in opa-nn
df1 = df_cd.loc[
    df_cd['DirectEvidence'].isin(['marker/mechanism', 'therapeutic']),
    ['ChemicalID', 'DiseaseID', 'DirectEvidence'],
]
df1.to_csv('chem-dis-pos-assocs.csv', index=False)

In [6]:
df1.shape

(96086, 3)

In [8]:
df1.head()

Unnamed: 0,ChemicalID,DiseaseID,DirectEvidence
1,C046983,MESH:D054198,therapeutic
71,C112297,MESH:D006948,marker/mechanism
86,C112297,MESH:D012640,marker/mechanism
135,C039775,MESH:D004827,therapeutic
189,C425777,MESH:D006948,marker/mechanism


In [5]:
# Store ChemicalID as str and strip the 'MESH:' prefix so DiseaseID can be
# appended to a url
df_cd = df_cd.assign(
    ChemicalID=df_cd['ChemicalID'].astype(str),
    DiseaseID=df_cd['DiseaseID'].str.replace('MESH:', ''),
)
# df_cd['InferenceGeneSymbol'] = df_cd.InferenceGeneSymbol.astype(str)

In [6]:
df_cd.head()

Unnamed: 0,# ChemicalName,ChemicalID,CasRN,DiseaseName,DiseaseID,DirectEvidence,InferenceGeneSymbol,InferenceScore,OmimIDs,PubMedIDs
1,06-Paris-LA-66 protocol,C046983,,Precursor Cell Lymphoblastic Leukemia-Lymphoma,D054198,therapeutic,,,,4519131
71,"10,10-bis(4-pyridinylmethyl)-9(10H)-anthracenone",C112297,,Hyperkinesis,D006948,marker/mechanism,,,,19098162
86,"10,10-bis(4-pyridinylmethyl)-9(10H)-anthracenone",C112297,,Seizures,D012640,marker/mechanism,,,,26348896
135,"10,11-dihydro-10-hydroxycarbamazepine",C039775,,Epilepsy,D004827,therapeutic,,,,17516704
189,"10,11-dihydroxy-N-n-propylnorapomorphine",C425777,,Hyperkinesis,D006948,marker/mechanism,,,,15765258


In [7]:
len(df_cd[df_cd.DirectEvidence =='therapeutic'])

34071

In [6]:
# Create Predicate Column
def cd_predicate(r):
    """
    Return the fixed predicate label used for every chem-disease row.

    ``r`` (the row handed over by ``DataFrame.apply``) is accepted but not
    inspected.
    """
    predicate = 'associated_directly_with'
    return predicate

# df_cd['DirectEvidence'] = df_cd.DirectEvidence.astype(str)
# Build the Predicate column row by row and store it as str
df_cd['Predicate'] = df_cd.apply(cd_predicate, axis=1).astype(str)

In [7]:
# #Specify type to optimise
# df_cd['ChemicalID'] = df_cd.ChemicalID.astype(str)
# df_cd['DiseaseID'] = df_cd.DiseaseID.astype(str)

df_cd.DirectEvidence.unique()


array(['therapeutic', 'marker/mechanism'], dtype=object)

In [8]:
# Painstakingly split the predicates into the available RO/GO predicates... will do only a
# partial job as am in proof of concept stage and Ontology may change [ only clearest effects]
# RO_0002213 = positively regulates
# RO_0002212 = negatively regulates 
# RO_0003308 = correlated with condition
# http://purl.obolibrary.org/obo/RO_0002213

map_to_ro = dict([
    ('therapeutic', 'RO_0002212'),
    ('marker/mechanism', 'RO_0003308'),
])

# Translate DirectEvidence into its RO code; values without an entry map to
# NaN, which astype(str) stores as 'nan'
df_cd['pred_code'] = df_cd['DirectEvidence'].map(map_to_ro).astype(str)

In [9]:
# # Exporting diseaseID and Disease Name for mapping DisGeNet UMLS codes to MESH
# df_cd_mesh = df_cd.loc[:, ('DiseaseName', 'DiseaseID')]
# df_cd_mesh.to_csv('CTDdis-names.csv', index=False)

In [10]:
# Chem-disease triples: CTD chemical -> MeSH disease, written to output_cd.nt
subj_url, subj_col = 'http://identifiers.org/ctd.chemical/', 'ChemicalID'
obj_url, obj_col = 'http://identifiers.org/mesh/', 'DiseaseID'
obj_url_2 = 'http://identifiers.org/omim/'  # to use when CTD gives an omim disease id
pred_col = 'Predicate'

convert_df_nt(
    df_cd,
    'output_cd.nt',
    subj_url=subj_url,
    subj_col=subj_col,
    obj_url=obj_url,
    obj_col=obj_col,
    pred_col=pred_col,
    odd_url=['OMIM:', obj_url_2],
)

## Gene Disease

In [1]:
# Log progress: append a "Begin Gene-Dis" marker line to log.txt.
# Needs the imports cell to have been run first; the recorded output below is
# a NameError because subprocess was not defined when this cell was executed.
subprocess.call('echo "Begin Gene-Dis" >> log.txt', shell=True)

NameError: name 'subprocess' is not defined

In [None]:
# Load the gene-disease CSV: skip the 27 intro lines, drop the row labelled 0
# and keep only the rows that have a DirectEvidence value
# TODO probably faster to pre-process out rows without direct evidence
df_gd = (
    pd.read_csv('csvs/CTD_genes_diseases.csv', skiprows=27)
    .drop(0)
    .dropna(subset=['DirectEvidence'])
)

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
## Create gene-dis file directly-evidenced associations (for use in opa-nn)


# Gene/disease id pairs whose evidence is exactly 'marker/mechanism'
df1 = df_gd.loc[df_gd['DirectEvidence'] == 'marker/mechanism', ['GeneID', 'DiseaseID']]
df1.to_csv('gene-dis-pos-assocs.csv', index=False)

In [None]:
df_gd.head()

In [None]:
df_gd[(df_gd.DirectEvidence == 'marker/mechanism')].shape

In [6]:
# Store both id columns as str
df_gd = df_gd.astype({'GeneID': str, 'DiseaseID': str})

In [7]:
df_gd.head()

Unnamed: 0,# GeneSymbol,GeneID,DiseaseName,DiseaseID,DirectEvidence,InferenceChemicalName,InferenceScore,OmimIDs,PubMedIDs
3323,A,50518.0,Diabetes Mellitus,MESH:D003920,marker/mechanism,,,,1473152
3326,A,50518.0,"Diabetes Mellitus, Type 2",MESH:D003924,marker/mechanism,,,,8146154
3545,A,50518.0,Liver Neoplasms,MESH:D008113,marker/mechanism,,,,15175105
3652,A,50518.0,Neoplasms,MESH:D009369,marker/mechanism,,,,1473152
3702,A,50518.0,Obesity,MESH:D009765,marker/mechanism,,,,1473152|25447408|25448685|8146154


In [6]:
# Quick refinements so the resulting URLs work.
# GeneID: float -> (missing becomes 0) -> int -> str, so no '.0' ends up in the id
df_gd['GeneID'] = df_gd['GeneID'].astype(float).fillna(0).astype(int).astype(str)
df_gd['DirectEvidence'] = df_gd['DirectEvidence'].astype(str)
# DiseaseID: strip the 'MESH:' prefix
df_gd['DiseaseID'] = df_gd['DiseaseID'].str.replace('MESH:', '')

# Create Predicate Column
def gd_predicate(r):
    """
    Return the fixed predicate label used for every gene-disease row.

    ``r`` (the row handed over by ``DataFrame.apply``) is accepted but not
    inspected.
    """
    predicate = 'associated_directly_with'
    return predicate
    
# Build the Predicate column row by row and store it as str
df_gd['Predicate'] = df_gd.apply(gd_predicate, axis=1).astype(str)

In [7]:
df_gd.head()
df_gd.DirectEvidence.unique()

array(['marker/mechanism', 'therapeutic', 'marker/mechanism|therapeutic'],
      dtype=object)

In [8]:
# Painstakingly split the predicates into the available RO/GO predicates... will do only a
# partial job as am in proof of concept stage and Ontology may change [ only clearest effects]
# RO_0002213 = positively regulates
# RO_0002212 = negatively regulates 
# RO_0003308 = correlated with condition
# http://purl.obolibrary.org/obo/RO_0002213

map_to_ro = dict([
    ('therapeutic', 'RO_0002212'),
    ('marker/mechanism', 'RO_0003308'),
])

# Translate DirectEvidence into its RO code. Values without an entry (the
# unique() output above also lists 'marker/mechanism|therapeutic') map to NaN,
# which astype(str) stores as 'nan'
df_gd['pred_code'] = df_gd['DirectEvidence'].map(map_to_ro).astype(str)

In [9]:
# Gene-disease triples: CTD gene -> MeSH disease, written to output_gd.nt
subj_url, subj_col = 'http://identifiers.org/ctd.gene/', 'GeneID'
obj_url, obj_col = 'http://identifiers.org/mesh/', 'DiseaseID'
obj_url_2 = 'http://identifiers.org/omim/'  # to use when CTD gives an omim disease id
pred_col = 'Predicate'

convert_df_nt(
    df_gd,
    'output_gd.nt',
    subj_url=subj_url,
    subj_col=subj_col,
    obj_url=obj_url,
    obj_col=obj_col,
    pred_col=pred_col,
    odd_url=['OMIM:', obj_url_2],
)

## Gene Pathway

In [205]:
# Log progress: append a "Begin Gene-Path" marker line to log.txt
subprocess.call('echo "Begin Gene-Path" >> log.txt', shell=True)

0

In [206]:
# Load the gene-pathway CSV: skip the 27 intro lines and drop the row labelled 0
df_gp = pd.read_csv('csvs/CTD_genes_pathways.csv', skiprows=27).drop(0)

In [207]:
# Quick refinements so the resulting URLs work: GeneID becomes an int (missing
# ids become 0) and the 'REACT:' prefix is stripped from PathwayID
df_gp = df_gp.assign(
    GeneID=df_gp['GeneID'].fillna(0).astype(int),
    PathwayID=df_gp['PathwayID'].str.replace('REACT:', ''),
)

# Create Predicate Column
def gp_predicate(r):
    """Return the predicate code used for every gene-pathway row; ``r`` is not inspected."""
    code = 'RO_0002213'
    return code
    
# Every row receives the same constant code returned by gp_predicate
df_gp['pred_code'] = df_gp.apply(gp_predicate, axis=1)

In [208]:
# Store the three columns used to build the triples as str
df_gp = df_gp.astype({'GeneID': str, 'PathwayID': str, 'pred_code': str})

In [209]:
df_gp.iloc[80:100]

Unnamed: 0,# GeneSymbol,GeneID,PathwayName,PathwayID,pred_code
81,AAAS,8086,RNA transport,KEGG:hsa03013,RO_0002213
82,AAAS,8086,SLC-mediated transmembrane transport,R-HSA-425407,RO_0002213
83,AAAS,8086,snRNP Assembly,R-HSA-191859,RO_0002213
84,AAAS,8086,SUMO E3 ligases SUMOylate target proteins,R-HSA-3108232,RO_0002213
85,AAAS,8086,SUMOylation,R-HSA-2990846,RO_0002213
86,AAAS,8086,SUMOylation of DNA damage response and repair ...,R-HSA-3108214,RO_0002213
87,AAAS,8086,SUMOylation of DNA replication proteins,R-HSA-4615885,RO_0002213
88,AAAS,8086,SUMOylation of RNA binding proteins,R-HSA-4570464,RO_0002213
89,AAAS,8086,Transcriptional regulation by small RNAs,R-HSA-5578749,RO_0002213
90,AAAS,8086,Transmembrane transport of small molecules,R-HSA-382551,RO_0002213


In [210]:
# # Painstakingly split the predicates into the available RO/GO predicates... will do only a
# # partial job as am in proof of concept stage and Ontology may change [ only clearest effects]
# # RO_0002213 = positively regulates
# # RO_0002212 = negatively regulates 
# # RO_0003308 = correlated with condition
# # http://purl.obolibrary.org/obo/RO_0002213

# map_to_ro = {
#     'therapeutic': 'RO_0002212',
#     'marker/mechanism': 'RO_0003308' 
# }

# # Apply map to create ro_predicate value for applicable predicates
# df_gd['pred_code'] = df_gd.DirectEvidence.map(map_to_ro).astype(str)

In [211]:
# Gene-pathway triples: CTD gene -> Reactome pathway (KEGG ids get their own
# base url), written to output_gp.nt
subj_url, subj_col = 'http://identifiers.org/ctd.gene/', 'GeneID'
obj_url, obj_col = 'http://identifiers.org/reactome/', 'PathwayID'
pred_col = 'pred_code'
obj_url_2 = 'http://identifiers.org/kegg.pathway/'

convert_df_nt(
    df_gp,
    'output_gp.nt',
    subj_url=subj_url,
    subj_col=subj_col,
    obj_url=obj_url,
    obj_col=obj_col,
    pred_col=pred_col,
    odd_url=['KEGG:', obj_url_2],
)

## Disease Pathway
Commenting out as is inferred

In [212]:
# # Log progress
# subprocess.call('echo "Begin Dis-Path" >> log.txt', shell=True)

In [213]:
# # Read in CTD sample, skipping the intro rows
# df_dp = pd.read_csv('csvs/CTD_diseases_pathways.csv', skiprows=27 )
# df_dp = df_dp.drop(0)

In [214]:
# #Specify type to optimise
# df_dp['DiseaseID'] = df_dp.DiseaseID.astype(str)
# df_dp['PathwayID'] = df_dp.PathwayID.astype(str)
# df_dp['InferenceGeneSymbol'] = df_dp.InferenceGeneSymbol.astype(str)

In [215]:
# # Must make some quick refinements to ensure resulting URLs work
# df_dp['PathwayID'] = df_dp['PathwayID'].str.replace('REACT:', '')
# df_dp['DiseaseID'] = df_dp['DiseaseID'].str.replace('MESH:', '')


# # Create Predicate Column
# def dp_predicate(r):
#     return 'associated_directly_with'
    
# df_dp['Predicate'] = df_dp.apply(dp_predicate, axis=1)
# df_dp['Predicate'] = df_dp.Predicate.astype(str)

In [216]:
# df_dp.head()

In [217]:
# subj_url = 'http://identifiers.org/mesh/' 
# subj_col = 'DiseaseID'
# obj_url = 'http://identifiers.org/reactome/' 
# obj_col = 'PathwayID'
# pred_col = 'Predicate'

# convert_df_nt(df_dp, 'output_dp.nt', subj_url, subj_col, obj_url, obj_col, pred_col)

In [218]:
# df_dp

## Phenotype Chemical
I'm going to comment this section out as this is the Y data and shouldn't be in KG ( I think )

In [219]:
# Read in CTD sample, skipping the intro rows
# df_pc = pd.read_csv('csvs/CTD_pheno_term_ixns.csv', skiprows=27  )
# df_pc = df_pc.drop(0)
# df_pc[10:20]

In [220]:
# Split the interactionActions into separate predicates RUN THIS ONLY ONCE
# s = df_pc['interactionactions'].str.split('|').apply(pd.Series, 1).stack()
# s.index = s.index.droplevel(-1)
# s.name = 'interactionactions'
# df_pc = df_pc.join(s.apply(lambda x: pd.Series(x.split('|'))))
# df_pc = df_pc.rename(columns={0: 'Predicate'})
# df_pc['Predicate'] = df_pc.Predicate.str.replace('^', '_')
# df_pc['Predicate'] = df_pc.Predicate.str.replace(' ', '_')

In [221]:
# subj_url = 'http://identifiers.org/ctd.chemical/'  
# subj_col = 'chemicalid'
# obj_url = 'http://identifiers.org/go/' 
# obj_col = 'phenotypeid'
# pred_col = 'Predicate'

# convert_df_nt(df_pc, 'output_pc.nt', subj_url, subj_col, obj_url, obj_col, pred_col)

## Merge NT files

In [10]:
# Log progress: append a "Begin Merging" marker line to log.txt
# import subprocess
subprocess.call('echo "Begin Merging" >> log.txt', shell=True)

0

In [11]:
# Concatenate every .nt file in the working directory into master.nt.
# NOTE(review): the *.nt glob also matches master.nt itself when it is left
# over from an earlier run - consider deleting it before merging.
subprocess.call('cat *.nt > master.nt', shell=True)

0

In [224]:
# Log progress: append a "Finished Merging" marker line to log.txt
subprocess.call('echo "Finished Merging" >> log.txt', shell=True)

0

Note that Jena doesn't accept a percent sign unless it's followed by two hexadecimal digits, so you can run the following to replace each % sign with the word "percent"

In [225]:
# subprocess.call("sed -i '/%/ s//percent/g' master.nt", shell=True)

In [226]:
# # You'll need to install Apache Jena for this
# subprocess.call('riot --output=RDFXML master.nt > master.rdf', shell=True)