In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import os.path

from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

%matplotlib inline

In [21]:
datapath = os.path.expanduser('~') + '/meddata/'
dfVar = pd.read_csv(datapath + 'training_variants') #,index_col='ID')


In [22]:
dfVar.head()

Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4


In [23]:
file = open(datapath + 'training_text', 'r') 
rawText = file.read() 

In [24]:
rawText[:123]

'ID,Text\n0||Cyclin-dependent kinases (CDKs) regulate a variety of fundamental cellular processes. CDK10 stands out as one of'

In [25]:
lines = rawText.split('\n')[1:]

splitted = [line.split('||') for line in lines]

dfText = pd.DataFrame(splitted, columns=['ID','Text'])



In [26]:
dfText['ln'] = dfText['ID'].map(lambda x: len(x))
dfText = dfText[dfText['ln'] > 0]
dfText.drop('ln',axis=1,inplace=True)


In [27]:
dfText['ID'] = dfText['ID'].map(lambda n:int(n))

In [28]:
dfText.head()

Unnamed: 0,ID,Text
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...


In [29]:
df = pd.merge(dfText, dfVar, on='ID')
df.set_index('ID',drop=True,inplace=True)

In [30]:
df.head()

Unnamed: 0_level_0,Text,Gene,Variation,Class
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Cyclin-dependent kinases (CDKs) regulate a var...,FAM58A,Truncating Mutations,1
1,Abstract Background Non-small cell lung canc...,CBL,W802*,2
2,Abstract Background Non-small cell lung canc...,CBL,Q249E,2
3,Recent evidence has demonstrated that acquired...,CBL,N454D,3
4,Oncogenic mutations in the monomeric Casitas B...,CBL,L399V,4


In [31]:
print(df.ix[1,0][:100])

 Abstract Background  Non-small cell lung cancer (NSCLC) is a heterogeneous group of disorders with 


In [32]:
df['Text'].map(lambda txt: 
                    'abstract' in txt.lower()).sum()

763

In [33]:
def has_string(string,text):
    return string in text.lower()
    

In [34]:
df['Text'].map(lambda text: has_string('abstract',text)).sum() / len(df)

0.22975007527853056

In [35]:
df['Text'].map(lambda text: has_string('conclusion',text)).sum() / len(df)

0.58235471243601322

In [36]:
df['Text'].map(lambda text: has_string('introduction',text)).sum() / len(df)

0.79704908160192711

In [37]:
df['noPunctuation'] = df['Text'].map(lambda str: ''.join(c for c in str 
                if c not in ['.',',',';','(',')']))

In [38]:
df.head()

Unnamed: 0_level_0,Text,Gene,Variation,Class,noPunctuation
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Cyclin-dependent kinases (CDKs) regulate a var...,FAM58A,Truncating Mutations,1,Cyclin-dependent kinases CDKs regulate a varie...
1,Abstract Background Non-small cell lung canc...,CBL,W802*,2,Abstract Background Non-small cell lung canc...
2,Abstract Background Non-small cell lung canc...,CBL,Q249E,2,Abstract Background Non-small cell lung canc...
3,Recent evidence has demonstrated that acquired...,CBL,N454D,3,Recent evidence has demonstrated that acquired...
4,Oncogenic mutations in the monomeric Casitas B...,CBL,L399V,4,Oncogenic mutations in the monomeric Casitas B...


In [39]:
tfidf = TfidfVectorizer(stop_words='english').fit(df['Text'].tolist())

In [43]:
df['tfidf'] = df['noPunctuation'].map(lambda s: 
                                    tfidf.transform(s.split(' ')))

In [44]:
df.head()

Unnamed: 0_level_0,Text,Gene,Variation,Class,noPunctuation,tfidf
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Cyclin-dependent kinases (CDKs) regulate a var...,FAM58A,Truncating Mutations,1,Cyclin-dependent kinases CDKs regulate a varie...,"(0, 49831)\t0.437480633105\n (0, 46416)\t0...."
1,Abstract Background Non-small cell lung canc...,CBL,W802*,2,Abstract Background Non-small cell lung canc...,"(1, 20965)\t1.0\n (2, 30250)\t1.0\n (4, 13..."
2,Abstract Background Non-small cell lung canc...,CBL,Q249E,2,Abstract Background Non-small cell lung canc...,"(1, 20965)\t1.0\n (2, 30250)\t1.0\n (4, 13..."
3,Recent evidence has demonstrated that acquired...,CBL,N454D,3,Recent evidence has demonstrated that acquired...,"(0, 122334)\t1.0\n (1, 58304)\t1.0\n (3, 4..."
4,Oncogenic mutations in the monomeric Casitas B...,CBL,L399V,4,Oncogenic mutations in the monomeric Casitas B...,"(0, 105693)\t1.0\n (1, 98728)\t1.0\n (4, 9..."


In [47]:
feature_array = np.array(tfidf.get_feature_names())


In [60]:

def getnames(resp):
    dictio = {}
    for col in resp.nonzero()[1]:
        dictio[feature_array[col]] = resp[0, col]
    l = []
    for k, v in Counter(dictio).most_common(3):
        l.append(k)
    return l

In [63]:
df['bestTfIdf'] = df['tfidf'].map(getnames)

In [64]:
df.head()

Unnamed: 0_level_0,Text,Gene,Variation,Class,noPunctuation,tfidf,bestTfIdf
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Cyclin-dependent kinases (CDKs) regulate a var...,FAM58A,Truncating Mutations,1,Cyclin-dependent kinases CDKs regulate a varie...,"(0, 49831)\t0.437480633105\n (0, 46416)\t0....","[cyclin, dependent, kinases]"
1,Abstract Background Non-small cell lung canc...,CBL,W802*,2,Abstract Background Non-small cell lung canc...,"(1, 20965)\t1.0\n (2, 30250)\t1.0\n (4, 13...","[abstract, background, small]"
2,Abstract Background Non-small cell lung canc...,CBL,Q249E,2,Abstract Background Non-small cell lung canc...,"(1, 20965)\t1.0\n (2, 30250)\t1.0\n (4, 13...","[abstract, background, small]"
3,Recent evidence has demonstrated that acquired...,CBL,N454D,3,Recent evidence has demonstrated that acquired...,"(0, 122334)\t1.0\n (1, 58304)\t1.0\n (3, 4...","[recent, evidence, demonstrated]"
4,Oncogenic mutations in the monomeric Casitas B...,CBL,L399V,4,Oncogenic mutations in the monomeric Casitas B...,"(0, 105693)\t1.0\n (1, 98728)\t1.0\n (4, 9...","[oncogenic, mutations, monomeric]"
