In [85]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter


In [86]:
%matplotlib inline

In [87]:
datapath = os.path.expanduser('~') + '/meddata/'
dfVar_training = pd.read_csv(datapath + 'training_variants') #,index_col='ID')
dfVar_testing = pd.read_csv(datapath + 'test_variants')


In [88]:
dfVar_training.head()


Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4


In [89]:
dfVar_testing.head()

Unnamed: 0,ID,Gene,Variation
0,0,ACSL4,R570S
1,1,NAGLU,P521L
2,2,PAH,L333F
3,3,ING1,A148D
4,4,TMEM216,G77A


In [90]:
def prepare_data(path,dfVar):
    file = open(datapath + path , 'r', encoding = 'utf-8') 
    rawText = file.read()

    lines = rawText.split('\n')[1:]

    splitted = [line.split('||') for line in lines]

    dfText = pd.DataFrame(splitted, columns=['ID','Text'])
    dfText['ln'] = dfText['ID'].map(lambda x: len(x))
    dfText = dfText[dfText['ln'] > 0]
    dfText.drop('ln',axis=1,inplace=True)
    dfText['ID'] = dfText['ID'].map(lambda n:int(n))
    
    df = pd.merge(dfText, dfVar, on='ID')
    #df.set_index('ID',drop=True,inplace=True)
    
    df['hasAbstract'] = df['Text'].map(lambda txt: 
                                   'abstract' in txt.lower())
    df['hasIntroduction'] = df['Text'].map(lambda txt: 
                                   'introduction' in txt.lower())

    return df

In [91]:
def add_abstracts(df):
    abstracts = []
    for index in range(0, df.shape[0]):
        abstracts.append(df.ix[index,'Text'].partition('Introduction')[0])


    df['Abstract'] = pd.Series (abstracts, index=df.index)
    


In [92]:
def count_abstracts_conclusions():
    counter = 0
    index = 0
    articles = df['Text']

    for article in articles:
        article = article.lower()
        index +=1
        if 'introduction' not in article and 'abstract' not in article and 'conclusion' not in article and 'conclusions' not in article:
                counter +=1
                #print ('Found article without abstract or conclusion:',index)
    print ('Total number of articles wihtout any abstract or conclusion:',counter)

In [93]:
def get_genes_count(df):
    #gene -> where the mutation is located
    s_gene = df.ix[:,2]
    print (s_gene)
    counts_gene = s_gene.value_counts()
    return counts_gene, len(counts_gene)
    

In [94]:
def get_variations_count(df):
    #variations -> aminoacid change for mutation
    s_variations = df.ix[:,3]
    print (s_variations)
    counts_variations = s_variations.value_counts()
    
    return counts_variations, len(counts_variations)
  

In [95]:
df_train = prepare_data('training_text',dfVar_training)
add_abstracts(df_train)
df_train =df_train[['ID','Text','Gene','Variation','hasAbstract','hasIntroduction','Abstract','Class']]
df_train.head()


Unnamed: 0,ID,Text,Gene,Variation,hasAbstract,hasIntroduction,Abstract,Class
0,0,Cyclin-dependent kinases (CDKs) regulate a var...,FAM58A,Truncating Mutations,False,False,Cyclin-dependent kinases (CDKs) regulate a var...,1
1,1,Abstract Background Non-small cell lung canc...,CBL,W802*,True,True,Abstract Background Non-small cell lung canc...,2
2,2,Abstract Background Non-small cell lung canc...,CBL,Q249E,True,True,Abstract Background Non-small cell lung canc...,2
3,3,Recent evidence has demonstrated that acquired...,CBL,N454D,False,True,Recent evidence has demonstrated that acquired...,3
4,4,Oncogenic mutations in the monomeric Casitas B...,CBL,L399V,False,True,Oncogenic mutations in the monomeric Casitas B...,4


In [96]:
df_test = prepare_data('training_text',dfVar_testing)
df_train =df_train[['ID','Text','Gene','Variation','hasAbstract','hasIntroduction','Abstract']]

add_abstracts(df_test)
df_test.head()


Unnamed: 0,ID,Text,Gene,Variation,hasAbstract,hasIntroduction,Abstract
0,0,Cyclin-dependent kinases (CDKs) regulate a var...,ACSL4,R570S,False,False,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...,NAGLU,P521L,True,True,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...,PAH,L333F,True,True,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...,ING1,A148D,False,True,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...,TMEM216,G77A,False,True,Oncogenic mutations in the monomeric Casitas B...


In [97]:
df_train['noPunctuation'] = df_train['Text'].map(lambda str: ''.join(c for c in str 
                if c not in ['.',',',';','(',')']))

df_test['noPunctuation'] = df_train['Text'].map(lambda str: ''.join(c for c in str 
                if c not in ['.',',',';','(',')']))
df_train.head()

Unnamed: 0,ID,Text,Gene,Variation,hasAbstract,hasIntroduction,Abstract,noPunctuation
0,0,Cyclin-dependent kinases (CDKs) regulate a var...,FAM58A,Truncating Mutations,False,False,Cyclin-dependent kinases (CDKs) regulate a var...,Cyclin-dependent kinases CDKs regulate a varie...
1,1,Abstract Background Non-small cell lung canc...,CBL,W802*,True,True,Abstract Background Non-small cell lung canc...,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...,CBL,Q249E,True,True,Abstract Background Non-small cell lung canc...,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...,CBL,N454D,False,True,Recent evidence has demonstrated that acquired...,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...,CBL,L399V,False,True,Oncogenic mutations in the monomeric Casitas B...,Oncogenic mutations in the monomeric Casitas B...


In [98]:
tfidf = TfidfVectorizer(stop_words='english').fit(df_train['Text'].tolist())
df_train['tfidf'] = df_train['noPunctuation'].map(lambda s: 
                                    tfidf.transform(s.split(' ')))

In [99]:
feature_array = np.array(tfidf.get_feature_names())

In [100]:
def getnames(resp):
    dictio = {}
    for col in resp.nonzero()[1]:
        dictio[feature_array[col]] = resp[0, col]
    l = []
    for k, v in Counter(dictio).most_common(3):
        l.append(k)
    return l

In [101]:
df_train['bestTfIdf'] = df_train['tfidf'].map(getnames)
df_train.head()

Unnamed: 0,ID,Text,Gene,Variation,hasAbstract,hasIntroduction,Abstract,noPunctuation,tfidf,bestTfIdf
0,0,Cyclin-dependent kinases (CDKs) regulate a var...,FAM58A,Truncating Mutations,False,False,Cyclin-dependent kinases (CDKs) regulate a var...,Cyclin-dependent kinases CDKs regulate a varie...,"(0, 49831)\t0.437480633105\n (0, 46416)\t0....","[cyclin, dependent, ets2]"
1,1,Abstract Background Non-small cell lung canc...,CBL,W802*,True,True,Abstract Background Non-small cell lung canc...,Abstract Background Non-small cell lung canc...,"(1, 20965)\t1.0\n (2, 30250)\t1.0\n (4, 13...","[cytoskeleton, results, snp]"
2,2,Abstract Background Non-small cell lung canc...,CBL,Q249E,True,True,Abstract Background Non-small cell lung canc...,Abstract Background Non-small cell lung canc...,"(1, 20965)\t1.0\n (2, 30250)\t1.0\n (4, 13...","[cytoskeleton, results, snp]"
3,3,Recent evidence has demonstrated that acquired...,CBL,N454D,False,True,Recent evidence has demonstrated that acquired...,Recent evidence has demonstrated that acquired...,"(0, 122334)\t1.0\n (1, 58304)\t1.0\n (3, 4...","[recent, results, snp]"
4,4,Oncogenic mutations in the monomeric Casitas B...,CBL,L399V,False,True,Oncogenic mutations in the monomeric Casitas B...,Oncogenic mutations in the monomeric Casitas B...,"(0, 105693)\t1.0\n (1, 98728)\t1.0\n (4, 9...","[oncogenic, boltzmann, ratiometric]"


In [102]:
tfidf_test = TfidfVectorizer(stop_words='english').fit(df_test['Text'].tolist())
df_test['tfidf'] = df_test['noPunctuation'].map(lambda s: 
                                    tfidf.transform(s.split(' ')))
feature_array = np.array(tfidf_test.get_feature_names())
df_test['bestTfIdf'] = df_test['tfidf'].map(getnames)
df_test.head()

Unnamed: 0,ID,Text,Gene,Variation,hasAbstract,hasIntroduction,Abstract,noPunctuation,tfidf,bestTfIdf
0,0,Cyclin-dependent kinases (CDKs) regulate a var...,ACSL4,R570S,False,False,Cyclin-dependent kinases (CDKs) regulate a var...,Cyclin-dependent kinases CDKs regulate a varie...,"(0, 49831)\t0.437480633105\n (0, 46416)\t0....","[cyclin, dependent, ets2]"
1,1,Abstract Background Non-small cell lung canc...,NAGLU,P521L,True,True,Abstract Background Non-small cell lung canc...,Abstract Background Non-small cell lung canc...,"(1, 20965)\t1.0\n (2, 30250)\t1.0\n (4, 13...","[cytoskeleton, results, snp]"
2,2,Abstract Background Non-small cell lung canc...,PAH,L333F,True,True,Abstract Background Non-small cell lung canc...,Abstract Background Non-small cell lung canc...,"(1, 20965)\t1.0\n (2, 30250)\t1.0\n (4, 13...","[cytoskeleton, results, snp]"
3,3,Recent evidence has demonstrated that acquired...,ING1,A148D,False,True,Recent evidence has demonstrated that acquired...,Recent evidence has demonstrated that acquired...,"(0, 122334)\t1.0\n (1, 58304)\t1.0\n (3, 4...","[recent, results, snp]"
4,4,Oncogenic mutations in the monomeric Casitas B...,TMEM216,G77A,False,True,Oncogenic mutations in the monomeric Casitas B...,Oncogenic mutations in the monomeric Casitas B...,"(0, 105693)\t1.0\n (1, 98728)\t1.0\n (4, 9...","[oncogenic, boltzmann, ratiometric]"


In [103]:
# # x_raw= df_train.ix[:,6].apply(lambda x: clean_str(x).split(' ')).tolist()
# df_train['words'] = pd.Series (x_raw, index=df_train.index)

# pattern_line = "[a-zA-Z0-9]-[a-zA-Z0-9]*"
# pattern_caps = "[A-Z]+[A-Z0-9]+"
# matches = []
# for abstract in x_raw:
#     s = str(abstract).splitlines()
#     line = str(s).strip()
#     matches.append (re.findall(pattern_line, line))
# print (matches[0])