"""
Author: Ian Coleman
Purpose: Explore CTD data
"""

In [16]:
import pandas as pd
import numpy as np
import scipy as sp

## Functions

In [19]:
def overview_dfs(df_list):
    """
    Print a quick textual profile of each DataFrame in df_list.

    Input:
        list: List of DFs
    Output:
        Nothing is returned. For every DF, prints the number of rows and
        columns, how many columns / rows contain at least one NaN, the
        dtype of each column, and the describe() summary.
    """
    rule = '_______________________'

    for frame in df_list:
        # Shape of the frame
        n_rows = len(frame)
        print('\nObservations: ', n_rows)
        n_cols = len(frame.columns)
        print('Features: ', n_cols)

        # Boolean mask of missing cells, reused for both NaN counts below
        nan_mask = frame.isnull()

        print('Columns with NaNs:')
        print(nan_mask.any().sum(), ' / ', n_cols)

        print('\nRows with NaNs:')
        print(nan_mask.any(axis=1).sum(), ' / ', n_rows)

        # Divider followed by the section heading, then the per-column dtypes
        print(rule, '\nData Types:\n', sep='\n')
        print(frame.dtypes)

        # Divider followed by the section heading, then summary statistics
        print(rule, '\nData Description:\n', sep='\n')
        print(frame.describe())

In [25]:
# Read in a CTD sample: skip the 27 intro rows so parsing starts at the
# field-name line, and cap the read at 300 rows to keep exploration fast.
df = (
    pd.read_csv('CTD_chem_gene_ixns.csv', skiprows=27, nrows=300)
    # Discard the row labelled 0 (presumably a non-data separator line
    # that follows the header in the raw file -- TODO confirm).
    .drop(0)
    # The header line starts with '# ', which ends up in the first column name.
    .rename(columns={'# ChemicalName': 'ChemicalName'})
)

In [26]:
# Profile the loaded sample: shape, NaN counts, dtypes and describe() statistics.
overview_dfs([df])


Observations:  299
Features:  11
Columns with NaNs:
4  /  11

Rows with NaNs:
225  /  299
_______________________

Data Types:

ChemicalName           object
ChemicalID             object
CasRN                  object
GeneSymbol             object
GeneID                float64
GeneForms              object
Organism               object
OrganismID            float64
Interaction            object
InteractionActions     object
PubMedIDs              object
dtype: object
_______________________

Data Description:

              GeneID    OrganismID
count     299.000000    283.000000
mean    11794.715719   9707.816254
std     50146.327888    196.899590
min        43.000000   9606.000000
25%       841.000000   9606.000000
50%      2993.000000   9606.000000
75%      6069.500000   9606.000000
max    440387.000000  10116.000000


In [27]:
# Preview the first rows of the sample to eyeball the column contents.
df.head()

Unnamed: 0,ChemicalName,ChemicalID,CasRN,GeneSymbol,GeneID,GeneForms,Organism,OrganismID,Interaction,InteractionActions,PubMedIDs
1,10074-G5,C534883,,MAX,4149.0,protein,,,10074-G5 affects the folding of and results in...,affects^binding|affects^folding|decreases^acti...,26474287
2,10074-G5,C534883,,MAX,4149.0,protein,,,10074-G5 inhibits the reaction [MYC protein bi...,affects^binding|decreases^reaction,26474287
3,10074-G5,C534883,,MYC,4609.0,protein,Homo sapiens,9606.0,10074-G5 analog results in decreased expressio...,decreases^expression,26036281
4,10074-G5,C534883,,MYC,4609.0,protein,Homo sapiens,9606.0,10074-G5 results in decreased activity of MYC ...,decreases^activity,25716159
5,10074-G5,C534883,,MYC,4609.0,protein,Homo sapiens,9606.0,10074-G5 results in decreased expression of MY...,decreases^expression,26036281


In [29]:
# interaction_types = pd.read_csv('CTD_chem_gene_ixn_types.csv')

In [45]:
def convert_df_nt(df, output_file):
    """
    Write one line per row of df to output_file, laid out as
    subject, predicate, object and a terminating full stop:

        <http://identifiers.org/ctd.chemical/{ChemicalID}> {InteractionActions} <http://identifiers.org/ctd.gene/{GeneID}> .

    Input:
        DF: some rows and columns of a dataframe; each row must provide
            ChemicalID, InteractionActions and GeneID
        STR: name for the output file, include filetype .nt
    Output:
        NT file (the function itself returns nothing)
    Raises:
        KeyError: if a row lacks one of the required columns
        ValueError: if a GeneID cannot be converted with int(), e.g. NaN

    All lines are built before the file is opened, so a bad row raises
    without creating or truncating output_file, and the file handle is
    always closed by the with-block.

    NOTE(review): InteractionActions is written verbatim as the predicate
    (e.g. 'affects^binding|decreases^reaction'). N-Triples expects the
    predicate to be an IRI in angle brackets, so strict parsers will likely
    reject these lines -- confirm the intended predicate vocabulary.
    """
    chemical_ns = 'http://identifiers.org/ctd.chemical/'
    gene_ns = 'http://identifiers.org/ctd.gene/'

    lines = []
    for _, row in df.iterrows():
        subj = '<' + chemical_ns + str(row['ChemicalID']) + '> '
        pred = str(row['InteractionActions']) + ' '
        # GeneID is read as float (e.g. 4149.0); int() drops the '.0' so the
        # identifier in the URL is the plain integer.
        obj = '<' + gene_ns + str(int(row['GeneID'])) + '> '
        lines.append(subj + pred + obj + '.' + '\n')

    # N-Triples documents are UTF-8, so do not rely on the platform default.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(lines)
    

In [46]:
# Try the converter on just the first rows of the sample; writes output.nt
# relative to the current working directory.
convert_df_nt(df.head(), 'output.nt')