In [251]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import re
from sklearn.feature_extraction.text import TfidfVectorizer

In [252]:
%matplotlib inline

In [253]:
datapath = os.path.expanduser('~') + '/meddata/'
dfVar_training = pd.read_csv(datapath + 'training_variants') #,index_col='ID')
dfVar_testing = pd.read_csv(datapath + 'test_variants')


In [254]:
dfVar_training.head()


Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4


In [255]:
dfVar_testing.head()

Unnamed: 0,ID,Gene,Variation
0,0,ACSL4,R570S
1,1,NAGLU,P521L
2,2,PAH,L333F
3,3,ING1,A148D
4,4,TMEM216,G77A


In [256]:
def prepare_data(path,dfVar):
    file = open(datapath + path , 'r', encoding = 'utf-8') 
    rawText = file.read()

    lines = rawText.split('\n')[1:]

    splitted = [line.split('||') for line in lines]

    dfText = pd.DataFrame(splitted, columns=['ID','Text'])
    dfText['ln'] = dfText['ID'].map(lambda x: len(x))
    dfText = dfText[dfText['ln'] > 0]
    dfText.drop('ln',axis=1,inplace=True)
    dfText['ID'] = dfText['ID'].map(lambda n:int(n))
    
    df = pd.merge(dfText, dfVar, on='ID')
    #df.set_index('ID',drop=True,inplace=True)
    
    df['hasAbstract'] = df['Text'].map(lambda txt: 
                                   'abstract' in txt.lower())
    df['hasIntroduction'] = df['Text'].map(lambda txt: 
                                   'introduction' in txt.lower())

    return df

In [257]:
def add_abstracts(df):
    abstracts = []
    for index in range(0, df.shape[0]):
        abstracts.append(df.ix[index,'Text'].partition('Introduction')[0])


    df['Abstracts'] = pd.Series (abstracts, index=df.index)

In [258]:
def count_abstracts_conclusions():
    counter = 0
    index = 0
    articles = df['Text']

    for article in articles:
        article = article.lower()
        index +=1
        if 'introduction' not in article and 'abstract' not in article and 'conclusion' not in article and 'conclusions' not in article:
                counter +=1
                #print ('Found article without abstract or conclusion:',index)
    print ('Total number of articles wihtout any abstract or conclusion:',counter)

In [259]:
def get_genes_count(df):
    #gene -> where the mutation is located
    s_gene = df.ix[:,2]
    print (s_gene)
    counts_gene = s_gene.value_counts()
    print ("*******************Gene occurrence*******************")
    print (counts_gene)
    print ("*******************Types of genes********************")
    print (len(counts_gene))
    

In [260]:
def get_variations_count(df):
    #variations -> aminoacid change for mutation
    s_variations = df.ix[:,3]
    print (s_variations)
    counts_variations = s_variations.value_counts()
    print ("*******************Gene occurrence*******************")
    print (counts_variations)
    print ("*******************Types of variations********************")
    print (len(counts_variations))

In [261]:
df_train = prepare_data('training_text',dfVar_training)
add_abstracts(df_train)
df_train.head()


Unnamed: 0,ID,Text,Gene,Variation,Class,hasAbstract,hasIntroduction,Abstracts
0,0,Cyclin-dependent kinases (CDKs) regulate a var...,FAM58A,Truncating Mutations,1,False,False,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...,CBL,W802*,2,True,True,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...,CBL,Q249E,2,True,True,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...,CBL,N454D,3,False,True,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...,CBL,L399V,4,False,True,Oncogenic mutations in the monomeric Casitas B...


In [262]:
get_genes_count(df_train)
get_genes_count(df_train)

0       FAM58A
1          CBL
2          CBL
3          CBL
4          CBL
5          CBL
6          CBL
7          CBL
8          CBL
9          CBL
10         CBL
11         CBL
12         CBL
13         CBL
14         CBL
15         CBL
16         CBL
17         CBL
18         CBL
19         CBL
20         CBL
21         CBL
22         CBL
23         CBL
24         CBL
25         CBL
26       SHOC2
27        TERT
28        TERT
29        TERT
         ...  
3291       RET
3292       RET
3293       RET
3294       RET
3295       RET
3296       RET
3297     RUNX1
3298     RUNX1
3299     RUNX1
3300     RUNX1
3301     RUNX1
3302     RUNX1
3303     RUNX1
3304     RUNX1
3305     RUNX1
3306     RUNX1
3307     RUNX1
3308     RUNX1
3309     RUNX1
3310     RUNX1
3311     RUNX1
3312     RUNX1
3313     RUNX1
3314     RUNX1
3315     RUNX1
3316     RUNX1
3317     RUNX1
3318     RUNX1
3319     RUNX1
3320     RUNX1
Name: Gene, dtype: object
*******************Gene occurrence*******************
BRCA1

In [263]:
df_test = prepare_data('training_text',dfVar_testing)

add_abstracts(df_test)
df_test.head()


Unnamed: 0,ID,Text,Gene,Variation,hasAbstract,hasIntroduction,Abstracts
0,0,Cyclin-dependent kinases (CDKs) regulate a var...,ACSL4,R570S,False,False,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...,NAGLU,P521L,True,True,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...,PAH,L333F,True,True,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...,ING1,A148D,False,True,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...,TMEM216,G77A,False,True,Oncogenic mutations in the monomeric Casitas B...


In [264]:
get_genes_count(df_test)
get_genes_count(df_test)

0         ACSL4
1         NAGLU
2           PAH
3          ING1
4       TMEM216
5        CD40LG
6         KLF11
7          SGCB
8         CLCF1
9        SDHAF1
10       SPTLC2
11        SUMF1
12         TET2
13         G6PD
14         SNCB
15        EFNB1
16        PKHD1
17           F8
18         PGK1
19         MTOR
20         KRT2
21          KIT
22         PTEN
23       ABCA12
24       HSD3B7
25         TBX5
26        CSRP3
27         CASR
28        EFNB1
29        ACADS
         ...   
3291        GBA
3292      FBLN5
3293       ARSA
3294     ABCA12
3295       NAGS
3296     SCNN1A
3297      TULP1
3298       DSG4
3299       ARSA
3300      STK11
3301        HFE
3302      CDH23
3303      SUMF1
3304        F11
3305       HEXA
3306        SMO
3307     HGSNAT
3308        GAA
3309       MEFV
3310       ASS1
3311       IDUA
3312       WNT4
3313       TP53
3314         F8
3315     SPTBN2
3316      SUGCT
3317       WFS1
3318    ALDH3A2
3319     MAN2B1
3320       FBN2
Name: Gene, dtype: objec

In [265]:
def tf_idf(articles):
    tfidf_vectorizer = TfidfVectorizer(min_df = 1)
    tfidf_matrix = tfidf_vectorizer.fit_transform(articles);
    return tfidf_matrix

#tfidf_matrix = tf_idf(articles).todense()
#print (tfidf_matrix)