In [34]:
# System packages
import os
import sys
import warnings
# Data related
import numpy as np 
import pandas as pd 

# sklearn tools 
from sklearn.metrics import  accuracy_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

# sklearn models
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
import xgboost as xgb

In [35]:
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 50)

In [36]:
# Import custom functions
from utils_functions import *

## 1. Load data

In [5]:
df = pd.read_csv('../data/processed/train_variants_text.csv')


### Use a sample data

In [6]:
sample =df.groupby('Class').apply(lambda x: x.sample(frac=0.2))
sample.head(1)
sample.to_csv('../data/processed/train_variants_text_sample.csv')

Unnamed: 0_level_0,Unnamed: 1_level_0,ID,Gene,Variation,Class,Text
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,2455,2455,BRCA1,H1746N,1,Abstract The BRCA1 gene from individuals at ...


In [37]:
sample = pd.read_csv('../data/processed/train_variants_text_sample.csv')

### Split sample data into train and validation data set

In [38]:
X_tr, X_val, y_tr, y_val = split_data(sample,
                                      'Text',
                                      'Class',
                                      0.1,
                                      0,
                                      stratify='Class')

In [7]:
# Take a look at the first y_tr and X_tr
#print(y_tr[0], "-is the predicted Class for text -", X_tr[0],)

## 2. Feature extraction

### 2.1 Bag of words
Here we will use 
 * CountVectorizer: Counts the number of times a word appears in the text
 * TfidfVectorizer: Weighs the words according to the importance of the word in the context of whole collection
 

### 2.2 Word2Vec

In [39]:
# Use document df
w2vec = get_word2vec(
    MySentences(
        sample['Text'].values, 
    ),
    'w2vmodel'
)

Found w2vmodel


## 3. Training

## 3.1 Bag of words + Model

### 3.1.1 Naive Bayes classifier for multinomial models
Suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.

In [9]:
clf = Pipeline([('vect', CountVectorizer(preprocessor=clean_text_stemmed, stop_words =stop_words)),
                         ('tfidf', TfidfTransformer()),
                         ('clf',  MultinomialNB())])                           

In [10]:
clf.fit(X_tr, y_tr)  
predicted = clf.predict(X_val)
acc=np.mean(predicted == y_val)
print(acc)
#print(classification_report(y_val, predicted))

0.44776119402985076


### 3.1.2 SGD 

This estimator implements regularized linear models with stochastic gradient descent (SGD) learning: the gradient of the loss is estimated each sample at a time and the model is updated along the way with a decreasing strength schedule (aka learning rate). SGD allows minibatch (online/out-of-core) learning, see the partial_fit method. For best results using the default learning rate schedule, the data should have zero mean and unit variance.

In [16]:
clf = Pipeline([('vect', CountVectorizer(preprocessor=clean_text_stemmed, stop_words =stop_words)),
                ('tfidf', TfidfTransformer()),
                ('clf',  SGDClassifier(n_jobs=))])                           

In [17]:
clf.fit(X_tr, y_tr)  
predicted = clf.predict(X_val)
acc=np.mean(predicted == y_val)
print(acc)

0.4925373134328358


### 3.1.3 xgboost

In [28]:
clf = Pipeline([('vect', CountVectorizer(preprocessor=clean_text_stemmed, stop_words =stop_words)),
                ('tfidf', TfidfTransformer()),
                ('clf',  xgb.XGBClassifier(objective="multi:softprob", random_state=42))])                           

In [29]:
clf.fit(X_tr, y_tr)  
predicted = clf.predict(X_val)
acc=np.mean(predicted == y_val)
print(acc)

0.5671641791044776


## 3.2 Word2vec + Model

### 3.2.1 Document-trained w2vec + xgboost

In [40]:
clf = Pipeline([('vect', MeanEmbeddingVectorizer(w2vec)),
                ('clf',  xgb.XGBClassifier(objective="multi:softprob", random_state=42))])                           

In [41]:
clf.fit(X_tr, y_tr)  
predicted = clf.predict(X_val)
acc=np.mean(predicted == y_val)
print(acc)

0.4626865671641791
