In [67]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import re
from sklearn.feature_extraction.text import TfidfVectorizer


In [68]:
%matplotlib inline

In [69]:
datapath = os.path.expanduser('~') + '/meddata/'
dfVar = pd.read_csv(datapath + 'training_variants') #,index_col='ID')

In [70]:
dfVar.head()

Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4


In [71]:
file = open(datapath + 'training_text', 'r', encoding = 'utf-8') 
rawText = file.read()

In [72]:
lines = rawText.split('\n')[1:]

splitted = [line.split('||') for line in lines]

dfText = pd.DataFrame(splitted, columns=['ID','Text'])

dfText['ln'] = dfText['ID'].map(lambda x: len(x))
dfText = dfText[dfText['ln'] > 0]
dfText.drop('ln',axis=1,inplace=True)
dfText['ID'] = dfText['ID'].map(lambda n:int(n))


In [73]:
dfText.head()

Unnamed: 0,ID,Text
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...


In [74]:
df = pd.merge(dfText, dfVar, on='ID')
df.set_index('ID',drop=True,inplace=True)

In [75]:
df['hasAbstract'] = df['Text'].map(lambda txt: 
                                   'abstract' in txt.lower())


In [76]:
df.head()

Unnamed: 0_level_0,Text,Gene,Variation,Class,hasAbstract
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Cyclin-dependent kinases (CDKs) regulate a var...,FAM58A,Truncating Mutations,1,False
1,Abstract Background Non-small cell lung canc...,CBL,W802*,2,True
2,Abstract Background Non-small cell lung canc...,CBL,Q249E,2,True
3,Recent evidence has demonstrated that acquired...,CBL,N454D,3,False
4,Oncogenic mutations in the monomeric Casitas B...,CBL,L399V,4,False


In [77]:
df['hasIntroduction'] = df['Text'].map(lambda txt: 
                                   'introduction' in txt.lower())
    

In [78]:
abstracts = []
for index in range(0, df.shape[0]):
    abstracts.append(df.ix[index,'Text'].partition('Introduction')[0])


df['Abstracts'] = pd.Series (abstracts, index=df.index-1)

In [79]:
df.head()

Unnamed: 0_level_0,Text,Gene,Variation,Class,hasAbstract,hasIntroduction,Abstracts
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Cyclin-dependent kinases (CDKs) regulate a var...,FAM58A,Truncating Mutations,1,False,False,Abstract Background Non-small cell lung canc...
1,Abstract Background Non-small cell lung canc...,CBL,W802*,2,True,True,Abstract Background Non-small cell lung canc...
2,Abstract Background Non-small cell lung canc...,CBL,Q249E,2,True,True,Recent evidence has demonstrated that acquired...
3,Recent evidence has demonstrated that acquired...,CBL,N454D,3,False,True,Oncogenic mutations in the monomeric Casitas B...
4,Oncogenic mutations in the monomeric Casitas B...,CBL,L399V,4,False,True,Oncogenic mutations in the monomeric Casitas B...


In [80]:
def tf_idf(articles):
    tfidf_vectorizer = TfidfVectorizer(min_df = 1)
    tfidf_matrix = tfidf_vectorizer.fit_transform(articles);
    return tfidf_matrix

articles = df['Text']
tfidf_matrix = tf_idf(articles).todense()
print (tfidf_matrix )


[[ 0.          0.01641522  0.         ...,  0.          0.          0.        ]
 [ 0.          0.00691475  0.         ...,  0.          0.          0.        ]
 [ 0.          0.00691475  0.         ...,  0.          0.          0.        ]
 ..., 
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.00196218  0.         ...,  0.          0.          0.        ]
 [ 0.          0.00098264  0.         ...,  0.          0.          0.        ]]


In [81]:
print(df['Abstracts'])

ID
0        Abstract Background  Non-small cell lung canc...
1        Abstract Background  Non-small cell lung canc...
2       Recent evidence has demonstrated that acquired...
3       Oncogenic mutations in the monomeric Casitas B...
4       Oncogenic mutations in the monomeric Casitas B...
5       Oncogenic mutations in the monomeric Casitas B...
6       CBL is a negative regulator of activated recep...
7        Abstract Juvenile myelomonocytic leukemia (JM...
8        Abstract Juvenile myelomonocytic leukemia (JM...
9       Oncogenic mutations in the monomeric Casitas B...
10      Noonan syndrome is an autosomal dominant conge...
11      Noonan syndrome is an autosomal dominant conge...
12      Noonan syndrome is an autosomal dominant conge...
13      Oncogenic mutations in the monomeric Casitas B...
14      Noonan syndrome is an autosomal dominant conge...
15      To determine if residual cylindrical refractiv...
16      Acquired uniparental disomy (aUPD) is a common...
17      Onc