In [1]:
# System packages
import os
import sys
import warnings
# Data related
import numpy as np 
import pandas as pd 

# sklearn tools 
from sklearn.metrics import  accuracy_score
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, DictVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer

# sklearn models
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
import xgboost as xgb

In [2]:
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 50)

In [3]:
# Import custom functions
from utils_functions import *

Using TensorFlow backend.


## 1. Load full data

In [4]:
df = pd.read_csv('../data/processed/train_variants_text.csv')


### Use a sample data

In [111]:
sample =df.groupby('Class').apply(lambda x: x.sample(frac=0.2))


In [112]:
sample =sample.drop([sample.columns[1], 
                     sample.columns[2], 
                     sample.columns[5]], 
                    axis='columns')
sample.dropna(subset=['Text'])
sample.head(1)
sample.to_csv('../data/processed/train_variants_text_sample.csv')

IndexError: index 5 is out of bounds for axis 0 with size 5

In [113]:
sample = pd.read_csv('../data/processed/train_variants_text_sample.csv')

In [114]:
sample.head()

Unnamed: 0,Class,Gene,Variation,Text
0,1,NF2,L46R,Neurofibromatosis type 2 (NF2) is a multiple n...
1,1,FGFR2,E219K,Introduction Melanoma is the most lethal of a...
2,1,BRCA1,F1704S,Abstract The BRCA1 gene from individuals at ...
3,1,TP53,R337H,The tumor suppressor protein p53 is a transcri...
4,1,TSC2,E75G,Tuberous sclerosis complex (TSC) is an autosom...


### Split sample data into train and validation data set

In [263]:
X_tr, X_val, y_tr, y_val = split_data(sample,
                                      'Text',
                                      'Class',
                                      0.1,
                                      0,
                                      stratify='Class')

In [7]:
# Take a look at the first y_tr and X_tr
#print(y_tr[0], "-is the predicted Class for text -", X_tr[0],)

## 2. Feature extraction

### 2.1 Bag of words
Here we will use 
 * CountVectorizer: Counts the number of times a word appears in the text
 * TfidfVectorizer: Weighs the words according to the importance of the word in the context of whole collection
 

### 2.2 Word2Vec

In [39]:
# Use document df
w2vec = get_word2vec(
    MySentences(
        sample['Text'].values, 
    ),
    'w2vmodel'
)

Found w2vmodel


## 3. Training

## 3.1 Bag of words + Model

### 3.1.1 Naive Bayes classifier for multinomial models
Suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.

In [9]:
clf = Pipeline([('vect', CountVectorizer(preprocessor=clean_text_stemmed, stop_words =stop_words)),
                         ('tfidf', TfidfTransformer()),
                         ('clf',  MultinomialNB())])                           

In [10]:
clf.fit(X_tr, y_tr)  
predicted = clf.predict(X_val)
acc=np.mean(predicted == y_val)
print(acc)
#print(classification_report(y_val, predicted))

0.44776119402985076


### 3.1.2 SGD 

This estimator implements regularized linear models with stochastic gradient descent (SGD) learning: the gradient of the loss is estimated each sample at a time and the model is updated along the way with a decreasing strength schedule (aka learning rate). SGD allows minibatch (online/out-of-core) learning, see the partial_fit method. For best results using the default learning rate schedule, the data should have zero mean and unit variance.

In [16]:
clf = Pipeline([('vect', CountVectorizer(preprocessor=clean_text_stemmed, stop_words =stop_words)),
                ('tfidf', TfidfTransformer()),
                ('clf',  SGDClassifier(n_jobs=))])                           

In [17]:
clf.fit(X_tr, y_tr)  
predicted = clf.predict(X_val)
acc=np.mean(predicted == y_val)
print(acc)

0.4925373134328358


### 3.1.3 xgboost

In [261]:
# Without text cleaning
clf = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf',  xgb.XGBClassifier(objective="multi:softprob", random_state=42))])                           

In [264]:
clf.fit(X_tr, y_tr)  
predicted = clf.predict(X_val)
acc=np.mean(predicted == y_val)
print(acc)

0.5223880597014925


In [28]:
# With text cleaning
clf = Pipeline([('vect', CountVectorizer(preprocessor=clean_text_stemmed, stop_words =stop_words)),
                ('tfidf', TfidfTransformer()),
                ('clf',  xgb.XGBClassifier(objective="multi:softprob", random_state=42))])                           

In [29]:
clf.fit(X_tr, y_tr)  
predicted = clf.predict(X_val)
acc=np.mean(predicted == y_val)
print(acc)

0.5671641791044776


### 3.1.4 Gene + Variation + xgboost

In [180]:
sample['Class'] = sample['Class'].astype(int)
sample['Gene'] = sample['Gene'].astype(str)
sample['Variation'] = sample['Variation'].astype(str)

y= sample['Class']
X= sample.drop('Class',axis =1)
X_tr, X_val, y_tr, y_val = train_test_split(X,
                                            y,
                                            test_size=0.2,
                                            stratify=y,
                                            random_state=42)

In [177]:
processing_pipeline= make_pipeline(
    # combine features
    make_union(
        make_pipeline(
            PandasSelector(["Gene","Variation"]),
            PandasToDict(),
            DictVectorizer(sparse=False)
            # select categorical data
        )
    
    )
)


In [178]:
clf =xgb.XGBClassifier(objective="multi:softprob", random_state=42)

In [179]:
model = make_pipeline(processing_pipeline, clf)

In [181]:
model.fit(X_tr, y_tr)

Pipeline(memory=None,
     steps=[('pipeline', Pipeline(memory=None,
     steps=[('featureunion', FeatureUnion(n_jobs=None,
       transformer_list=[('pipeline', Pipeline(memory=None,
     steps=[('pandasselector', PandasSelector(columns=['Gene', 'Variation'])), ('pandastodict', PandasToDict()), ('dictvectorizer', DictVectori...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])

In [182]:
predicted = model.predict(X_val)

acc=np.mean(predicted == y_val)
print(acc)

0.42105263157894735


### 3.1.5 TFIDF(Text) + xgboost

In [251]:
processing_pipeline= make_pipeline(
    # combine features
    make_union(
        
        make_pipeline(
            ItemSelector(key='Text'),
            Converter(),
            TfidfVectorizer()
                    )
)
)

In [252]:
model = make_pipeline(processing_pipeline, clf)
model.fit(X_tr, y_tr)

Pipeline(memory=None,
     steps=[('pipeline', Pipeline(memory=None,
     steps=[('featureunion', FeatureUnion(n_jobs=None,
       transformer_list=[('pipeline', Pipeline(memory=None,
     steps=[('itemselector', ItemSelector(key='Text')), ('converter', Converter()), ('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])

In [253]:
predicted = model.predict(X_val)

acc=np.mean(predicted == y_val)
print(acc)

0.631578947368421


### 3.1.6 OneHot(Gene) + TFIDF(Text)  + xgboost

In [235]:
processing_pipeline= make_pipeline(
    # combine features
    make_union(
        
        make_pipeline(
            ItemSelector(key='Text'),
            Converter(),
            TfidfVectorizer()
                    ),    
        make_pipeline(
            ItemSelector(key="Gene"),
            Converter(),
            OneHotEncoder()
    )
)
)

In [236]:
model = make_pipeline(processing_pipeline, clf)
model.fit(X_tr, y_tr)

Pipeline(memory=None,
     steps=[('pipeline', Pipeline(memory=None,
     steps=[('featureunion', FeatureUnion(n_jobs=None,
       transformer_list=[('pipeline-1', Pipeline(memory=None,
     steps=[('itemselector', ItemSelector(key='Text')), ('converter', Converter()), ('tfidfvectorizer', TfidfVectorizer(analyzer='word', bina...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])

In [237]:
predicted = model.predict(X_val)

acc=np.mean(predicted == y_val)
print(acc)

0.6240601503759399


### 3.1.6 OneHot(Variation) + TFIDF(Text) + xgboost

In [247]:
processing_pipeline= make_pipeline(
    # combine features
    make_union(    
        make_pipeline(
            ItemSelector(key='Text'),
            Converter(),
            TfidfVectorizer()
                    ),
        make_pipeline(
            ItemSelector(key="Variation"),
            Converter(),
            OneHotEncoder()
    )
)
)

In [248]:
model = make_pipeline(processing_pipeline, clf)
model.fit(X_tr, y_tr)

Pipeline(memory=None,
     steps=[('pipeline', Pipeline(memory=None,
     steps=[('featureunion', FeatureUnion(n_jobs=None,
       transformer_list=[('pipeline-1', Pipeline(memory=None,
     steps=[('itemselector', ItemSelector(key='Text')), ('converter', Converter()), ('tfidfvectorizer', TfidfVectorizer(analyzer='word', bina...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])

In [249]:
predicted = model.predict(X_val)

acc=np.mean(predicted == y_val)
print(acc)

0.631578947368421


In [254]:
def model_predict(processing_pipeline,clf,X_tr,y_tr,X_val,y_val):
    model = make_pipeline(processing_pipeline, clf)
    model.fit(X_tr, y_tr)
    predicted = model.predict(X_val)
    acc=np.mean(predicted == y_val)
    print(acc)

In [None]:
model_predict(processing_pipeline,clf,X_tr,y_tr,X_val,y_val)

### 3.1.7 OneHot(Variation) + OneHot(Gene) + TFIDF(Text)  + xgboost


In [241]:
processing_pipeline2= make_pipeline(
    # combine features
    make_union(
        
        make_pipeline(
            ItemSelector(key='Text'),
            Converter(),
            TfidfVectorizer()
                    ),    
        make_pipeline(
            ItemSelector(key="Gene"),
            Converter(),
            OneHotEncoder()
        ),
        make_pipeline(
            ItemSelector(key="Variation"),
            Converter(),
            OneHotEncoder()
    )
)
)

In [242]:
model2 = make_pipeline(processing_pipeline2, clf)
model2.fit(X_tr, y_tr)

Pipeline(memory=None,
     steps=[('pipeline', Pipeline(memory=None,
     steps=[('featureunion', FeatureUnion(n_jobs=None,
       transformer_list=[('pipeline-1', Pipeline(memory=None,
     steps=[('itemselector', ItemSelector(key='Text')), ('converter', Converter()), ('tfidfvectorizer', TfidfVectorizer(analyzer='word', bina...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])

In [243]:
predicted2 = model2.predict(X_val)

acc2=np.mean(predicted2 == y_val)
print(acc2)

0.6240601503759399


In [244]:
### Extra

In [207]:
gene_selector = ItemSelector(key='Gene')
df_gene = gene_selector.fit_transform(sample)
df_gene.shape

(664, 1)

## 3.2 Word2vec + Model

### 3.2.1 Document-trained w2vec + xgboost

In [40]:
clf = Pipeline([('vect', MeanEmbeddingVectorizer(w2vec)),
                ('clf',  xgb.XGBClassifier(objective="multi:softprob", random_state=42))])                           

In [41]:
clf.fit(X_tr, y_tr)  
predicted = clf.predict(X_val)
acc=np.mean(predicted == y_val)
print(acc)

0.4626865671641791
