In [124]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import re
from sklearn.feature_extraction.text import TfidfVectorizer


In [125]:
%matplotlib inline

In [126]:
datapath = os.path.expanduser('~') + '/meddata/'
dfVar = pd.read_csv(datapath + 'training_variants') #,index_col='ID')

In [127]:
dfVar.head()

Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4


In [128]:
file = open(datapath + 'training_text', 'r', encoding = 'utf-8') 
rawText = file.read()

In [129]:
lines = rawText.split('\n')[1:]

splitted = [line.split('||') for line in lines]

dfText = pd.DataFrame(splitted, columns=['ID','Text'])

dfText['ln'] = dfText['ID'].map(lambda x: len(x))
dfText = dfText[dfText['ln'] > 0]
dfText.drop('ln',axis=1,inplace=True)
dfText['ID'] = dfText['ID'].map(lambda n:int(n))


In [130]:
dfText.head()

Unnamed: 0,ID,Text
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...


In [131]:
df = pd.merge(dfText, dfVar, on='ID')
#df.set_index('ID',drop=True,inplace=True)
 

In [132]:
df['hasAbstract'] = df['Text'].map(lambda txt: 
                                   'abstract' in txt.lower())

In [133]:
df.head()

Unnamed: 0,ID,Text,Gene,Variation,Class,hasAbstract
0,0,Cyclin-dependent kinases (CDKs) regulate a var...,FAM58A,Truncating Mutations,1,False
1,1,Abstract Background Non-small cell lung canc...,CBL,W802*,2,True
2,2,Abstract Background Non-small cell lung canc...,CBL,Q249E,2,True
3,3,Recent evidence has demonstrated that acquired...,CBL,N454D,3,False
4,4,Oncogenic mutations in the monomeric Casitas B...,CBL,L399V,4,False


In [134]:
df['hasIntroduction'] = df['Text'].map(lambda txt: 
                                   'introduction' in txt.lower())
    

In [135]:
abstracts = []
for index in range(0, df.shape[0]):
    abstracts.append(df.ix[index,'Text'].partition('Introduction')[0])


df['Abstracts'] = pd.Series (abstracts, index=df.index-1)

In [136]:
df.head()

Unnamed: 0,ID,Text,Gene,Variation,Class,hasAbstract,hasIntroduction,Abstracts
0,0,Cyclin-dependent kinases (CDKs) regulate a var...,FAM58A,Truncating Mutations,1,False,False,Abstract Background Non-small cell lung canc...
1,1,Abstract Background Non-small cell lung canc...,CBL,W802*,2,True,True,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...,CBL,Q249E,2,True,True,Recent evidence has demonstrated that acquired...
3,3,Recent evidence has demonstrated that acquired...,CBL,N454D,3,False,True,Oncogenic mutations in the monomeric Casitas B...
4,4,Oncogenic mutations in the monomeric Casitas B...,CBL,L399V,4,False,True,Oncogenic mutations in the monomeric Casitas B...


In [137]:
counter = 0
index = 0
articles = df['Text']

for article in articles:
    article = article.lower()
    index +=1
    if 'introduction' not in article and 'abstract' not in article and 'conclusion' not in article and 'conclusions' not in article:
            counter +=1
            print ('Found article without abstract or conclusion:',index)
print ('Total number of articles wihtout any abstract or conclusion:',counter)

Found article without abstract or conclusion: 1
Found article without abstract or conclusion: 16
Found article without abstract or conclusion: 18
Found article without abstract or conclusion: 20
Found article without abstract or conclusion: 38
Found article without abstract or conclusion: 39
Found article without abstract or conclusion: 43
Found article without abstract or conclusion: 45
Found article without abstract or conclusion: 50
Found article without abstract or conclusion: 62
Found article without abstract or conclusion: 72
Found article without abstract or conclusion: 76
Found article without abstract or conclusion: 90
Found article without abstract or conclusion: 92
Found article without abstract or conclusion: 93
Found article without abstract or conclusion: 94
Found article without abstract or conclusion: 99
Found article without abstract or conclusion: 141
Found article without abstract or conclusion: 146
Found article without abstract or conclusion: 165
Found article with

In [138]:
def tf_idf(articles):
    tfidf_vectorizer = TfidfVectorizer(min_df = 1)
    tfidf_matrix = tfidf_vectorizer.fit_transform(articles);
    return tfidf_matrix

tfidf_matrix = tf_idf(articles).todense()
print (tfidf_matrix )

[[ 0.          0.01641522  0.         ...,  0.          0.          0.        ]
 [ 0.          0.00691475  0.         ...,  0.          0.          0.        ]
 [ 0.          0.00691475  0.         ...,  0.          0.          0.        ]
 ..., 
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.00196218  0.         ...,  0.          0.          0.        ]
 [ 0.          0.00098264  0.         ...,  0.          0.          0.        ]]
