In [3]:
# System packages
import os
import sys
import warnings
# Data related
import numpy as np 
import pandas as pd 
import pprint as pp
# sklearn tools 
from sklearn.metrics import  accuracy_score
from sklearn.model_selection import train_test_split, cross_val_predict,cross_val_score,  StratifiedKFold, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder


# sklearn models
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
import xgboost as xgb

# Model 
import eli5
from eli5.explain import explain_weights
from eli5.formatters import explain_weights_df

In [4]:
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 50)


In [5]:
# Add utils_functions.py as a dataset
# Import module 
from shutil import copyfile

# Copy our file into the working directory (make sure it has .py suffix)
copyfile(src = "../input/utils-functions/utils_functions.py", dst = '/kaggle/working/utils_functions.py')
from utils_functions import *

## 1. Load full data

### Use a sample data

In [None]:
'''
sample =df.groupby('Class').apply(lambda x: x.sample(frac=0.2))
sample =sample.drop([sample.columns[1], 
                     sample.columns[2], 
                     sample.columns[5]], 
                    axis='columns')
sample.dropna(subset=['Text'])
sample.head(1)
sample.to_csv('../data/processed/train_variants_text_sample.csv')
'''

In [6]:
sample = pd.read_csv('../input/sample/train_variants_text_sample.csv')

In [7]:
sample.head()

Unnamed: 0,Class,Gene,Variation,Text
0,1,NF2,L46R,Neurofibromatosis type 2 (NF2) is a multiple n...
1,1,FGFR2,E219K,Introduction Melanoma is the most lethal of a...
2,1,BRCA1,F1704S,Abstract The BRCA1 gene from individuals at ...
3,1,TP53,R337H,The tumor suppressor protein p53 is a transcri...
4,1,TSC2,E75G,Tuberous sclerosis complex (TSC) is an autosom...


### Split sample data into train and validation data set

In [None]:
X_tr, X_val, y_tr, y_val = split_data(sample,
                                      'Text',
                                      'Class',
                                      0.1,
                                      0,
                                      stratify='Class')

In [None]:
# Take a look at the first y_tr and X_tr
#print(y_tr[0], "-is the predicted Class for text -", X_tr[0],)

## 2. Feature extraction

### 2.1 Bag of words
Here we will use 
 * CountVectorizer: Counts the number of times a word appears in the text
 * TfidfVectorizer: Weighs the words according to the importance of the word in the context of whole collection
 

### 2.2 Word2Vec

In [None]:
# Use document df
w2vec = get_word2vec(
    MySentences(
        sample['Text'].values, 
    ),
    'w2vmodel'
)

## 2.3  Doc2Vec

In [8]:
import gensim
from gensim.models.doc2vec import TaggedDocument

In [None]:
df= sample.

In [9]:
from bs4 import BeautifulSoup
def cleanText(text):
    text = BeautifulSoup(text, "lxml").text
    text = re.sub(r'\|\|\|', r' ', text) 
    text = re.sub(r'http\S+', r'<URL>', text)
    text = text.lower()
    text = text.replace('x', '')
    return text

In [13]:
sample['Text'] = sample['Text'].apply(cleanText)

In [29]:
from gensim.models import doc2vec

def label_sentences(corpus, label_type):
    """
    Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
    We do this by using the TaggedDocument method. The format will be "TRAIN_i" or "TEST_i" where "i" is
    a dummy index of the complaint narrative.
    """
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(doc2vec.TaggedDocument(v.split(), [label]))
    return labeled

In [16]:
X_tr, X_val, y_tr, y_val = split_data(sample,
                                      'Text',
                                      'Class',
                                      0.1,
                                      0,
                                      stratify='Class')

In [27]:
X_train = label_sentences(X_tr, 'Train')
X_validation = label_sentences(X_val, 'Test')
all_data = X_train + X_validation 
print(len(all_data))
print(all_data[:1])

664
[TaggedDocument(words=['the', 'crucial', 'role', 'of', 'recurrent', 'gene', 'fusions', 'in', 'the', 'development', 'of', 'solid', 'tumors', 'has', 'been', 'recently', 'appreciated', 'after', 'several', 'milestone', 'discoveries1,', '2.', 'in', 'particular,', 'the', 'discovery', 'of', 'an', 'eml4-alk', 'fusion', 'in', '∼4%', 'of', 'lung', 'cancer', 'has', 'led', 'to', 'development', 'of', 'an', 'effective', 'drug', 'with', 'stunning', 'clinical', 'impacts3.', 'recently,', 'net-generation', 'sequencing', '(ngs)', 'has', 'greatly', 'enhanced', 'gene', 'fusion', 'discovery', 'in', 'solid', 'tumors,', 'which', 'has', 'led', 'to', 'the', 'identification', 'of', 'a', 'vti1a-tcf7l2', 'fusion', 'in', '3%', 'of', 'colon', 'cancers4,', 'a', 'bcor-ccnb3', 'fusion', 'in', '4%', 'of', 'bone', 'sarcomas5,', 'and', 'a', 'fgfr-tacc', 'fusion', 'in', '3%', 'of', 'glioblastomas6.', 'although', 'low', 'in', 'percentage,', 'these', 'neoplastic', 'gene', 'fusions', 'will', 'likely', 'advance', 'the', 'g

#### Some issues in the text :
 * 1) fig, figure, s1 
 * 2) number, year, reference index 

### A Doc2Vec model-Distributed Bag of Words (DBOW)

In [34]:
from gensim.models import Doc2Vec
from tqdm import tqdm
from sklearn import utils
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=1, alpha=0.065, min_alpha=0.065)
model_dbow.build_vocab([x for x in tqdm(all_data)])

100%|██████████| 664/664 [00:00<00:00, 660896.50it/s]


In [35]:
%%time
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 664/664 [00:00<00:00, 350537.18it/s]
100%|██████████| 664/664 [00:00<00:00, 908948.39it/s]
100%|██████████| 664/664 [00:00<00:00, 948252.59it/s]
100%|██████████| 664/664 [00:00<00:00, 1036092.95it/s]
100%|██████████| 664/664 [00:00<00:00, 742869.53it/s]
100%|██████████| 664/664 [00:00<00:00, 695038.15it/s]
100%|██████████| 664/664 [00:00<00:00, 1079464.29it/s]
100%|██████████| 664/664 [00:00<00:00, 598799.80it/s]
100%|██████████| 664/664 [00:00<00:00, 730975.82it/s]
100%|██████████| 664/664 [00:00<00:00, 838860.80it/s]
100%|██████████| 664/664 [00:00<00:00, 1025411.58it/s]
100%|██████████| 664/664 [00:00<00:00, 907171.94it/s]
100%|██████████| 664/664 [00:00<00:00, 927103.15it/s]
100%|██████████| 664/664 [00:00<00:00, 1035707.64it/s]
100%|██████████| 664/664 [00:00<00:00, 926486.31it/s]
100%|██████████| 664/664 [00:00<00:00, 635848.83it/s]
100%|██████████| 664/664 [00:00<00:00, 954099.98it/s]
100%|██████████| 664/664 [00:00<00:00, 656998.79it/s]
100%|██████████| 664/664

CPU times: user 8min 43s, sys: 4.05 s, total: 8min 47s
Wall time: 3min 2s


In [36]:
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """
    Get vectors from trained doc2vec model
    :param doc2vec_model: Trained Doc2Vec model
    :param corpus_size: Size of the data
    :param vectors_size: Size of the embedding vectors
    :param vectors_type: Training or Testing vectors
    :return: list of vectors
    """
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(0, corpus_size):
        prefix = vectors_type + '_' + str(i)
        vectors[i] = model.docvecs[prefix]
    return vectors

In [67]:
train_vectors_dbow = get_vectors(model_dbow, len(X_train), 300, 'Train')
val_vectors_dbow = get_vectors(model_dbow, len(X_validation), 300, 'Test')

In [68]:
clf =xgb.XGBClassifier(objective="multi:softprob", random_state=42)

In [69]:
clf.fit(train_vectors_dbow, y_tr)  


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=42,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [70]:
predicted = clf.predict_proba(val_vectors_dbow)
predicted[0]

array([0.21115369, 0.17018116, 0.00805084, 0.24195004, 0.15148887,
       0.08475184, 0.12203635, 0.00750288, 0.00288434], dtype=float32)

In [71]:
clf.score(val_vectors_dbow, y_val)

0.5074626865671642

### Logistic regression 

In [72]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(multi_class='multinomial', solver = 'lbfgs')
logreg.fit(train_vectors_dbow, y_tr)  
logreg.score(val_vectors_dbow, y_val)

0.5373134328358209

### Distributed Memory

In [73]:
model_dm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=1, workers=5, alpha=0.065, min_alpha=0.065)
model_dm.build_vocab([x for x in tqdm(all_data)])

100%|██████████| 664/664 [00:00<00:00, 871952.99it/s]


In [74]:
%%time
for epoch in range(30):
    model_dm.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1)
    model_dm.alpha -= 0.002
    model_dm.min_alpha = model_dm.alpha

100%|██████████| 664/664 [00:00<00:00, 546832.49it/s]
100%|██████████| 664/664 [00:00<00:00, 564225.66it/s]
100%|██████████| 664/664 [00:00<00:00, 747254.59it/s]
100%|██████████| 664/664 [00:00<00:00, 690557.37it/s]
100%|██████████| 664/664 [00:00<00:00, 800062.58it/s]
100%|██████████| 664/664 [00:00<00:00, 579849.65it/s]
100%|██████████| 664/664 [00:00<00:00, 674665.18it/s]
100%|██████████| 664/664 [00:00<00:00, 802136.48it/s]
100%|██████████| 664/664 [00:00<00:00, 573875.51it/s]
100%|██████████| 664/664 [00:00<00:00, 794584.27it/s]
100%|██████████| 664/664 [00:00<00:00, 803293.30it/s]
100%|██████████| 664/664 [00:00<00:00, 870862.37it/s]
100%|██████████| 664/664 [00:00<00:00, 636430.04it/s]
100%|██████████| 664/664 [00:00<00:00, 530884.07it/s]
100%|██████████| 664/664 [00:00<00:00, 1020900.97it/s]
100%|██████████| 664/664 [00:00<00:00, 870590.14it/s]
100%|██████████| 664/664 [00:00<00:00, 665953.58it/s]
100%|██████████| 664/664 [00:00<00:00, 959028.19it/s]
100%|██████████| 664/664 [0

CPU times: user 13min 59s, sys: 5.58 s, total: 14min 4s
Wall time: 3min 41s


In [75]:
train_vectors_dm = get_vectors(model_dm, len(X_train), 300, 'Train')
test_vectors_dm = get_vectors(model_dm, len(X_validation), 300, 'Test')

In [76]:
logreg.fit(train_vectors_dm, y_tr)
logreg.score(val_vectors_dbow, y_val)

0.23880597014925373

### Blend 

In [84]:
def get_concat_vectors(model1,model2, corpus_size, vectors_size, vectors_type):
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(0, corpus_size):
        prefix = vectors_type + '_' + str(i)
        vectors[i] = np.append(model1.docvecs[prefix],model2.docvecs[prefix])
    return vectors

In [86]:
train_vecs_dbow_dm = get_concat_vectors(model_dbow,model_dm, len(X_train), 600, 'Train')
val_vectors_dbow= get_concat_vectors(model_dbow,model_dm, len(X_validation), 600, 'Test')

In [89]:
%%time
logreg = LogisticRegression()
logreg.fit(train_vecs_dbow_dm, y_tr)


CPU times: user 1.91 s, sys: 0 ns, total: 1.91 s
Wall time: 1.91 s


In [90]:
logreg.score(val_vectors_dbow, y_val)

0.4925373134328358

In [None]:
# model_dm.save('d2v_model_dm.doc2vec')
# model_dm = Doc2Vec.load('d2v_model_dm.doc2vec')
# model_dm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

> ## 3. Training

## 3.1 Bag of words + Model

### 3.1.1 Naive Bayes classifier for multinomial models
Suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.

In [None]:
clf = Pipeline([('vect', CountVectorizer(preprocessor=clean_text_stemmed, stop_words =stop_words)),
                         ('tfidf', TfidfTransformer()),
                         ('clf',  MultinomialNB())])                           

In [None]:
clf.fit(X_tr, y_tr)  
predicted = clf.predict(X_val)
acc=np.mean(predicted == y_val)
print(acc)
#print(classification_report(y_val, predicted))

### 3.1.2 SGD 

This estimator implements regularized linear models with stochastic gradient descent (SGD) learning: the gradient of the loss is estimated each sample at a time and the model is updated along the way with a decreasing strength schedule (aka learning rate). SGD allows minibatch (online/out-of-core) learning, see the partial_fit method. For best results using the default learning rate schedule, the data should have zero mean and unit variance.

In [None]:
clf = Pipeline([('vect', CountVectorizer(preprocessor=clean_text_stemmed, stop_words =stop_words)),
                ('tfidf', TfidfTransformer()),
                ('clf',  SGDClassifier(n_jobs=))])                           

In [None]:
clf.fit(X_tr, y_tr)  
predicted = clf.predict(X_val)
acc=np.mean(predicted == y_val)
print(acc)

### 3.1.3 TfidfTransformer + xgboost

In [None]:
# Without text cleaning
clf = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf',  xgb.XGBClassifier(objective="multi:softprob", random_state=42))])                           

In [None]:
clf.fit(X_tr, y_tr)  
predicted = clf.predict(X_val)
acc=np.mean(predicted == y_val)
print(acc)

In [None]:
# With text cleaning
clf = Pipeline([('vect', CountVectorizer(preprocessor=clean_text_stemmed, stop_words =stop_words)),
                ('tfidf', TfidfTransformer()),
                ('clf',  xgb.XGBClassifier(objective="multi:softprob", random_state=42))])                           

In [None]:
clf.fit(X_tr, y_tr)  
predicted = clf.predict(X_val)
acc=np.mean(predicted == y_val)
print(acc)

### 3.1.4 DictVectorizer(Gene) +  DictVectorizer(Variation) + xgboost

In [None]:
sample['Class'] = sample['Class'].astype(int)
sample['Gene'] = sample['Gene'].astype(str)
sample['Variation'] = sample['Variation'].astype(str)

y= sample['Class']
X= sample.drop('Class',axis =1)
X_tr, X_val, y_tr, y_val = train_test_split(X,
                                            y,
                                            test_size=0.2,
                                            stratify=y,
                                            random_state=42)

In [None]:
processing_pipeline= make_pipeline(
    # combine features
    make_union(
        make_pipeline(
            PandasSelector(["Gene","Variation"]),
            PandasToDict(),
            DictVectorizer(sparse=False)
            # select categorical data
        )
    
    )
)


In [None]:
clf =xgb.XGBClassifier(objective="multi:softprob", random_state=42)

In [None]:
model = make_pipeline(processing_pipeline, clf)

In [None]:
model.fit(X_tr, y_tr)

In [None]:
predicted = model.predict(X_val)

acc=np.mean(predicted == y_val)
print(acc)

### 3.1.5 TFIDF(Text) + xgboost

In [None]:
processing_pipeline= make_pipeline(
    # combine features
    make_union(
        
        make_pipeline(
            ItemSelector(key='Text'),
            Converter(),
            TfidfVectorizer()
                    )
)
)

In [None]:
clf =xgb.XGBClassifier(objective="multi:softprob", random_state=42)
model = make_pipeline(processing_pipeline, clf)
model.fit(X_tr, y_tr)

In [None]:
predicted = model.predict(X_val)

acc=np.mean(predicted == y_val)
print(acc)

### 3.1.6 OneHot(Gene) + TFIDF(Text)  + xgboost

In [None]:
processing_pipeline= make_pipeline(
    # combine features
    make_union(
        
        make_pipeline(
            ItemSelector(key='Text'),
            Converter(),
            TfidfVectorizer()
                    ),    
        make_pipeline(
            ItemSelector(key="Gene"),
            Converter(),
            OneHotEncoder()
    )
)
)

In [None]:
model = make_pipeline(processing_pipeline, clf)
model.fit(X_tr, y_tr)

In [None]:
predicted = model.predict(X_val)

acc=np.mean(predicted == y_val)
print(acc)

### 3.1.7 OneHot(Variation) + TFIDF(Text) + xgboost

In [None]:
processing_pipeline= make_pipeline(
    # combine features
    make_union(    
        make_pipeline(
            ItemSelector(key='Text'),
            Converter(),
            TfidfVectorizer()
                    ),
        make_pipeline(
            ItemSelector(key="Variation"),
            Converter(),
            OneHotEncoder()
    )
)
)

In [None]:
model = make_pipeline(processing_pipeline, clf)
model.fit(X_tr, y_tr)

In [None]:
predicted = model.predict(X_val)

acc=np.mean(predicted == y_val)
print(acc)

In [None]:
def model_predict(processing_pipeline,clf,X_tr,y_tr,X_val,y_val):
    model = make_pipeline(processing_pipeline, clf)
    model.fit(X_tr, y_tr)
    predicted = model.predict(X_val)
    acc=np.mean(predicted == y_val)
    print(acc)

In [None]:
model_predict(processing_pipeline,clf,X_tr,y_tr,X_val,y_val)

### 3.1.8 OneHot(Variation) + OneHot(Gene) + TFIDF(Text)  + xgboost


In [None]:
processing_pipeline2= make_pipeline(
    # combine features
    make_union(
        
        make_pipeline(
            ItemSelector(key='Text'),
            Converter(),
            TfidfVectorizer()
                    ),    
        make_pipeline(
            ItemSelector(key="Gene"),
            Converter(),
            OneHotEncoder()
        ),
        make_pipeline(
            ItemSelector(key="Variation"),
            Converter(),
            OneHotEncoder()
    )
)
)

In [None]:
model2 = make_pipeline(processing_pipeline2, clf)
model2.fit(X_tr, y_tr)

In [None]:
predicted2 = model2.predict(X_val)

acc2=np.mean(predicted2 == y_val)
print(acc2)

### 3.1.9 TFIDF(Gene) + TFIDF(Text)  + xgboost

In [None]:
processing_pipeline= make_pipeline(
    # combine features
    make_union(  
        make_pipeline(
            ItemSelector(key="Gene"),
            Converter(),
            TfidfVectorizer()
    )
)
)

In [None]:
clf =xgb.XGBClassifier(objective="multi:softprob", random_state=42)
model = make_pipeline(processing_pipeline, clf)
model.fit(X_tr, y_tr)

In [None]:
predicted = model.predict(X_val)

acc=np.mean(predicted == y_val)
print(acc)

### 3.1.10 TFIDF(Variation) +  TFIDF(Gene) +xgboost

In [None]:
processing_pipeline= make_pipeline(
    # combine features
    make_union(  
        make_pipeline(
            ItemSelector(key="Variation"),
            Converter(),
            TfidfVectorizer()
        ),
        make_pipeline(
            ItemSelector(key="Gene"),
            Converter(),
            TfidfVectorizer()
    )
   
    )
)


In [None]:
clf =xgb.XGBClassifier(objective="multi:softprob", random_state=42)
model = make_pipeline(processing_pipeline, clf)
model.fit(X_tr, y_tr)


In [None]:
# prediction
predicted = model.predict(X_val)
acc=np.mean(predicted == y_val)
print(acc)

In [None]:
print(sample.Gene.nunique())
print(sample.Variation.nunique())

### 3.1.11 TFIDF(Gene) + TFIDF(Text)  + xgboost

In [None]:
processing_pipeline= make_pipeline(
    # combine features
    make_union(
        
        make_pipeline(
            ItemSelector(key='Text'),
            Converter(),
            TfidfVectorizer()
                    ),    
        make_pipeline(
            ItemSelector(key="Gene"),
            Converter(),
            TfidfVectorizer()
    )
)
)

In [None]:
clf =xgb.XGBClassifier(objective="multi:softprob", random_state=42)
model = make_pipeline(processing_pipeline, clf)
model.fit(X_tr, y_tr)

In [None]:
predicted = model.predict(X_val)

acc=np.mean(predicted == y_val)
print(acc)

### 3.1.11 make_pipeline + make_unionn & TFIDF(Variation) + TFIDF(Gene) + TFIDF(Text) + xgboost

In [None]:
processing_pipeline= make_pipeline(
    # combine features
    make_union(  
        make_pipeline(
            ItemSelector(key="Variation"),
            Converter(),
            TfidfVectorizer()
        ),
        make_pipeline(
            ItemSelector(key="Gene"),
            Converter(),
            TfidfVectorizer()
    ),
        make_pipeline(
            ItemSelector(key='Text'),
            Converter(),
            TfidfVectorizer()
        )
   
    )
)


In [None]:
clf =xgb.XGBClassifier(objective="multi:softprob", random_state=42)
model = make_pipeline(processing_pipeline, clf)
model.fit(X_tr, y_tr)


In [None]:
# prediction
predicted = model.predict(X_val)
acc=np.mean(predicted == y_val)
print(acc)

### 3.1.12 Pipeline + featureunion + explain _features & TFIDF(Variation) + TFIDF(Gene) + TFIDF(Text) + xgboost

In [None]:
Variation = Pipeline([
    ('selector',ItemSelector(key="Variation")),
    ('converter',Converter()),
    ('tfidf', TfidfVectorizer())
])

Gene = Pipeline([
    ('selector',ItemSelector(key="Gene")),
    ('converter',Converter()),
    ('tfidf', TfidfVectorizer())
])

Text = Pipeline([
    ('selector',ItemSelector(key="Text")),
    ('converter',Converter()),
    ('tfidf', TfidfVectorizer())
    
])

ppl = Pipeline([
    ('features',FeatureUnion([
        ('Variation',Variation),
        ('Gene',Gene),
        ('Text',Text)
    ])),
    ('clf',xgb.XGBClassifier(objective="multi:softprob", random_state=42))
])

In [None]:
# What are the parameters?
pp.pprint(sorted(ppl.get_params().keys()))

In [None]:
# Build a model using default parameters
model = ppl.fit(X_tr,y_tr)
# Accuracy score 
model.score(X_val,y_val)

In [None]:
# Calculate predict probability for each class
y_pred = model.predict_proba(X_val)[0]
y_pred 

# plot the probability 
plt.plot(y_pred)


In [None]:
model.named_steps['features'].transformer_list

In [None]:
f1=model.named_steps['features'].transformer_list[0][1].named_steps['tfidf'].get_feature_names()
f2=model.named_steps['features'].transformer_list[1][1].named_steps['tfidf'].get_feature_names()
f3=model.named_steps['features'].transformer_list[2][1].named_steps['tfidf'].get_feature_names()

In [None]:
list_features=f1
# Add all the elements of f2 and f3 to f1
list_features.extend(f2)
list_features.extend(f3)

In [None]:
list_features

In [None]:
explain_weights(model.named_steps['clf'], 
                 vec=None, top=10, 
                 target_names=ppl.classes_, 
                 feature_names=list_features)


In [None]:
explain_weights(model.named_steps['clf'], 
                 target_names=ppl.classes_, 
                 feature_names=list_features)

In [None]:
model.named_steps

In [None]:
explain_weights()


### Grid search

In [None]:
param_grid = {
    'clf__learning_rate': [0.1,0.01],
    'features__Text__tfidf__lowercase': [True,False]
}

In [None]:
grid = GridSearchCV(ppl, param_grid, scoring='neg_mean_absolute_error')
grid.fit(X_tr, y_tr)
print(grid.best_params_)

## Summary, the appropriate way
1. Drop NA, define dtype 
2. Feature extraction and Union
3. Define model Pipeline
4. model = Pipeline.fit(X,y)
5. model.score(X_val,y_val)
6. explain weights


### 1. Fill NA, define dtype 

In [None]:
sample['Class'] = sample['Class'].astype(int)
sample['Gene'] = sample['Gene'].astype(str)
sample['Variation'] = sample['Variation'].astype(str)

y= sample['Class']
X= sample.drop('Class',axis =1)
X_tr, X_val, y_tr, y_val = train_test_split(X,
                                            y,
                                            test_size=0.2,
                                            stratify=y,
                                            random_state=42)

In [None]:
# 2. Feature extraction and Union
def build_preprocessor(df,field):
    field_idx = list(df.columns).index(field)
    return lambda x: default_preprocessor(x[field_idx])
    
default_preprocessor = CountVectorizer().build_preprocessor()


### 2. Feature extraction and Union

In [None]:
vectorizer = FeatureUnion([
    ('Variation',TfidfVectorizer(preprocessor=build_preprocessor(X,'Variation'))),
        ('Gene',TfidfVectorizer(preprocessor=build_preprocessor(X,'Gene'))),
        ('Text',TfidfVectorizer(preprocessor=build_preprocessor(X,'Text'))),
])

In [None]:
X_v = vectorizer.fit_transform(X.values)
X_tr, X_val, y_tr, y_val = train_test_split(X_v,
                                            y,
                                            test_size=0.2,
                                            stratify=y,
                                            random_state=42)

In [None]:
model = xgb.XGBClassifier(objective="multi:softprob", random_state=42)
model.fit(X_tr, y_tr)

In [None]:
# Accuracy score 
model.score(X_val,y_val)

In [None]:
eli5.show_weights(model, vec=vectorizer)

In [None]:
eli5.show_weights(model, vec=vectorizer, top=10, feature_filter=lambda x: x != '<BIAS>')

In [None]:
df_name = explain_weights_df(model, vec=vectorizer, top=10, feature_filter=lambda x: x != '<BIAS>')
                
df_name.to_csv('../data/features/20190609sample_union_3groups_tfidf_feature_weights.csv')

In [None]:
# Analyzing individual predictions. Let's check some predictions from the validation set. You see a summary of various vectorizer's contribution at the top, and then below you can see features highlighed in text.
eli5.show_prediction(model, doc=X.values[1], vec=vectorizer)


In [None]:
eli5.show_weights(model)

#### 2.2 Two vectors  Gene +Text

In [None]:
vectorizer = FeatureUnion([
        ('Gene',TfidfVectorizer(preprocessor=build_preprocessor(X,'Gene'))),
        ('Text',TfidfVectorizer(preprocessor=build_preprocessor(X,'Text'))),
])

In [None]:
X_v = vectorizer.fit_transform(X.values)
X_tr, X_val, y_tr, y_val = train_test_split(X_v,
                                            y,
                                            test_size=0.2,
                                            stratify=y,
                                            random_state=42)

In [None]:
model = xgb.XGBClassifier(objective="multi:softprob", random_state=42)
model.fit(X_tr, y_tr)

In [None]:
# Accuracy score 
model.score(X_val,y_val)

#### 2.3 Two vectors  Variation +Text

In [None]:
vectorizer = FeatureUnion([
    ('Variation',TfidfVectorizer(preprocessor=build_preprocessor(X,'Variation'))),
        ('Text',TfidfVectorizer(preprocessor=build_preprocessor(X,'Text'))),
])

In [None]:
X_v = vectorizer.fit_transform(X.values)
X_tr, X_val, y_tr, y_val = train_test_split(X_v,
                                            y,
                                            test_size=0.2,
                                            stratify=y,
                                            random_state=42)

In [None]:
model = xgb.XGBClassifier(objective="multi:softprob", random_state=42)
model.fit(X_tr, y_tr)

In [None]:
# Accuracy score 
model.score(X_val,y_val)

#### 2.4 One vector  Text

In [None]:
vectorizer = FeatureUnion([
        ('Text',TfidfVectorizer(preprocessor=build_preprocessor(X,'Text')))
])

In [None]:
X_v = vectorizer.fit_transform(X.values)
X_tr, X_val, y_tr, y_val = train_test_split(X_v,
                                            y,
                                            test_size=0.2,
                                            stratify=y,
                                            random_state=42)

In [None]:
model = xgb.XGBClassifier(objective="multi:softprob", random_state=42)
model.fit(X_tr, y_tr)

In [None]:
# Accuracy score 
model.score(X_val,y_val)

#### 2.5 Gene and Variation Vector

In [None]:
vectorizer = FeatureUnion([
    ('Variation',TfidfVectorizer(preprocessor=build_preprocessor(X,'Variation'))),
        ('Gene',TfidfVectorizer(preprocessor=build_preprocessor(X,'Gene')))

])

In [None]:
X_v = vectorizer.fit_transform(X.values)
X_tr, X_val, y_tr, y_val = train_test_split(X_v,
                                            y,
                                            test_size=0.2,
                                            stratify=y,
                                            random_state=42)

In [None]:
model = xgb.XGBClassifier(objective="multi:softprob", random_state=42)
model.fit(X_tr, y_tr)

In [None]:
# Accuracy score 
model.score(X_val,y_val)

### Cross validation 

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Performing K fold Cross Validation to get an estimate of model performance on unknown data.
print('XGBoost Mean score: {}'.format(cross_val_score(estimator=model, 
                                                                  X=X_tr, y= y_tr,
                                                                  cv=3, n_jobs = -1).mean()))

### Grid search

In [None]:
# What are the parameters?
pp.pprint(sorted(model.get_params().keys()))

In [None]:
param_grid = {
    'learning_rate': [0.1,0.01],
    'max_depth': [3,6]grid = GridSearchCV(model, param_grid, scoring='neg_mean_absolute_error')
}
grid = GridSearchCV(model, param_grid, scoring='neg_mean_absolute_error')
grid.fit(X_tr, y_tr)
print(grid.best_params_)

## 3.2 Word2vec + Model

### 3.2.1 Document-trained w2vec + xgboost

In [None]:
clf = Pipeline([('vect', MeanEmbeddingVectorizer(w2vec)),
                ('clf',  xgb.XGBClassifier(objective="multi:softprob", random_state=42))])                           

In [None]:
clf.fit(X_tr, y_tr)  
predicted = clf.predict(X_val)
acc=np.mean(predicted == y_val)
print(acc)

### Lessons learned

 * Text pre-processing helps 
 * W2vec: domain corpus helps 
 * A small document: w2vec may not outperform tfidf
 * One-hot encodings is not good for tree based methods 
 * Knowing what featurews are useful/useless for prediction is helpful 
