### Quick Notes

https://github.com/brandomr/document_cluster

http://www.nltk.org/book/

TODO: try stemming the text with NLTK before vectorizing.

In [2]:
import pandas as pd
import numpy as np

def load_text(path):
    """Read an ``ID||Text`` file into a two-column DataFrame.

    The first line of the file is skipped and the remaining lines are split
    on the literal two-character delimiter ``||``; the resulting columns are
    named ``ID`` and ``Text``.

    Parameters
    ----------
    path : str
        Location of the text file.

    Returns
    -------
    pandas.DataFrame
        Frame with columns ``ID`` and ``Text``.
    """
    return pd.read_csv(path,
                       # Raw string: "\|" is not a valid escape in a normal
                       # string literal and triggers a warning on newer Pythons.
                       sep=r"\|\|",
                       # A multi-character/regex separator needs the python engine.
                       engine='python',
                       header=None,
                       skiprows=1,
                       names=["ID", "Text"])


train_text = load_text('../data/training_text')
test_text = load_text('../data/test_text')

df_train = pd.read_csv('../data/training_variants')
df_test = pd.read_csv('../data/test_variants')

# Column assignment aligns on the row index, not on ID.
# NOTE(review): assumes the text and variants files list rows in the same order - confirm.
df_train['Text'] = train_text.Text
df_test['Text'] = test_text.Text

df_train.head()

Unnamed: 0,ID,Gene,Variation,Class,Text
0,0,FAM58A,Truncating Mutations,1,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,CBL,W802*,2,Abstract Background Non-small cell lung canc...
2,2,CBL,Q249E,2,Abstract Background Non-small cell lung canc...
3,3,CBL,N454D,3,Recent evidence has demonstrated that acquired...
4,4,CBL,L399V,4,Oncogenic mutations in the monomeric Casitas B...


In [5]:
# Number of training rows.
df_train.shape[0]

3321

In [7]:
# How many distinct genes appear in the training set.
len(df_train['Gene'].unique())

264

In [8]:
# How many distinct variations appear in the training set.
len(df_train['Variation'].unique())

2996

In [10]:
# Rows per gene, most frequent first.
df_train['Gene'].value_counts()

BRCA1       264
TP53        163
EGFR        141
PTEN        126
BRCA2       125
KIT          99
BRAF         93
ALK          69
ERBB2        69
PDGFRA       60
PIK3CA       56
CDKN2A       52
FGFR2        50
FLT3         49
TSC2         47
MTOR         45
KRAS         44
MAP2K1       43
VHL          41
RET          40
FGFR3        39
MLH1         35
MET          33
SMAD4        33
JAK2         33
NOTCH1       31
AKT1         28
PTPN11       26
ABL1         26
ROS1         26
           ... 
KMT2B         1
HLA-B         1
HIST1H1C      1
GLI1          1
AXL           1
TCF3          1
FLT1          1
RARA          1
RNF43         1
GNA11         1
DUSP4         1
FANCC         1
CEBPA         1
SDHC          1
FGF19         1
CDKN2C        1
ERRFI1        1
WHSC1L1       1
INPP4B        1
SHOC2         1
IKBKE         1
NCOR1         1
MEN1          1
BARD1         1
EPCAM         1
SDHB          1
FUBP1         1
SRSF2         1
LATS1         1
FOXO1         1
Name: Gene, dtype: int64

In [11]:
# Rows per variation, most frequent first.
df_train['Variation'].value_counts()

Truncating Mutations         93
Deletion                     74
Amplification                71
Fusions                      34
Overexpression                6
G12V                          4
Q61H                          3
Q61L                          3
T58I                          3
Q61R                          3
E17K                          3
P130S                         2
Promoter Hypermethylation     2
G12A                          2
A146T                         2
ETV6-NTRK3 Fusion             2
G67R                          2
F384L                         2
G13C                          2
G35R                          2
R170W                         2
C618R                         2
Q61K                          2
Y64A                          2
M1R                           2
K117N                         2
Q22K                          2
T286A                         2
G12D                          2
G13D                          2
                             ..
D513Y   

In [12]:
# Rows per target class, most frequent first.
df_train['Class'].value_counts()

7    953
4    686
1    568
2    452
6    275
5    242
3     89
9     37
8     19
Name: Class, dtype: int64

In [13]:
# Count of missing values in each column.
df_train.isnull().sum(axis=0)

ID           0
Gene         0
Variation    0
Class        0
Text         0
dtype: int64

In [3]:
import re, string
from nltk.corpus import stopwords

def process_text(txt, stop_words=None):
    """Normalise a document before vectorisation.

    The text is lower-cased, stripped of leading/trailing whitespace, split
    on single spaces, filtered against a stop-word collection, re-joined with
    single spaces, and finally has every character in ``string.punctuation``
    deleted.

    Parameters
    ----------
    txt : str
        Raw document text.
    stop_words : iterable of str, optional
        Words to drop. When omitted, ``stopwords.words('english')`` is used.

    Returns
    -------
    str
        The cleaned text.

    Notes
    -----
    Stop words are matched *before* punctuation is removed, so a token such
    as ``"the,"`` is kept. Punctuation is deleted rather than replaced by a
    space, so ``"a-b"`` becomes ``"ab"``.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once per call. Previously the stop-word list was
    # re-loaded and scanned linearly for every single token.
    stop_words = frozenset(stop_words)

    txt = txt.lower().strip()
    # Split on a literal space (not arbitrary whitespace) so that runs of
    # spaces and other whitespace survive exactly as they did before.
    kept = [word for word in txt.split(' ') if word not in stop_words]
    txt = " ".join(kept)
    # Remove punctuation
    return txt.translate(str.maketrans('', '', string.punctuation))

# Clean every training document; this overwrites the 'Text' column it reads,
# so re-running the cell processes already-cleaned text again.
df_train['Text'] = df_train['Text'].map(process_text)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF matrix over unigrams and bigrams of the cleaned documents.
text = df_train.Text.values
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
text_vectorized = vectorizer.fit_transform(text)
# Printed as (rows, columns) of the resulting matrix.
print(text_vectorized.shape)

(3321, 3277863)


In [9]:
# `sklearn.cross_validation` was removed from scikit-learn; the same helper
# lives in `sklearn.model_selection`.
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Baseline: 5-fold cross-validated random forest on the TF-IDF features.
y = df_train.Class.values
scores_rfc = cross_val_score(RandomForestClassifier(random_state=0),  # fixed seed so reruns give the same scores
                             text_vectorized,
                             y,
                             cv=5,
                             # The old 'log_loss' scorer name was removed; 'neg_log_loss'
                             # reports negated log loss, so values closer to 0 are better.
                             scoring='neg_log_loss')
scores_rfc

array([-7.19773128, -8.37886377, -8.86677507, -5.44876663, -6.9950215 ])