In [1]:
# System packages
import os
import sys
import warnings
# Data related
import numpy as np 
import pandas as pd 
import pprint as pp

# sklearn 
from sklearn.metrics import  accuracy_score
from sklearn.model_selection import train_test_split, cross_val_predict,cross_val_score,  StratifiedKFold, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.metrics import accuracy_score
import scikitplot.plotters as skplt

# nlp
from gensim.models import KeyedVectors

# models
import xgboost as xgb
import eli5
from eli5.explain import explain_weights
from eli5.formatters import explain_weights_df

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings from the libraries imported above —
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Make the helper module `utils_functions.py` importable: it is attached to the
# notebook as a dataset under ../input, so it has to be copied into the working
# directory before it can be imported.
from shutil import copyfile

# Copy our file into the working directory (make sure it has .py suffix)
copyfile(src = "../input/utils-functions/utils_functions.py", dst = '/kaggle/working/utils_functions.py')
# NOTE(review): the wildcard import hides which helpers this notebook relies on; names used below
# without a visible definition (e.g. `split_data`, `clean_text_stemmed`, `stop_words`) presumably
# come from here — confirm and consider importing them explicitly.
from utils_functions import *

## 1. Load data 

In [None]:
# Read the processed training table and drop the rows that have no article text.
df = pd.read_csv('../input/processed/train_variants_text.csv').dropna(subset=['Text'])

### Process data

In [None]:
# Cast the label to integer and the two categorical fields to plain strings in one pass.
df = df.astype({'Class': int, 'Gene': str, 'Variation': str})

# Feature frame (everything except the label and the row identifier) and target vector.
X = df.drop(columns=['Class', 'ID'])
y = df['Class']


## 3.1 Bag-of-words 1 Group+xgboost

### Split into Train and Validation data

In [None]:
# Hold out part of the data for validation, using the article text as the only
# feature and `Class` as the target.
# NOTE(review): `split_data` is not defined in the visible cells (presumably from
# `utils_functions`); the positional 0.1 and 0 look like the validation fraction
# and the random seed — confirm against the helper's signature.
X_tr, X_val, y_tr, y_val = split_data(df,
                                      'Text',
                                      'Class',
                                      0.1,
                                      0,
                                      stratify='Class')

In [None]:
# Single-text-field model: token counts -> TF-IDF re-weighting -> XGBoost classifier.
# NOTE(review): `clean_text_stemmed` and `stop_words` are not defined in the visible
# cells; presumably they are provided by `utils_functions` — confirm.
clf = Pipeline(steps=[
    ('vect', CountVectorizer(preprocessor=clean_text_stemmed, stop_words=stop_words)),
    ('tfidf', TfidfTransformer()),
    ('clf', xgb.XGBClassifier(objective="multi:softprob", random_state=42)),
])

In [None]:
# Train the pipeline on the training split and label the validation split.
predicted = clf.fit(X_tr, y_tr).predict(X_val)

# Validation accuracy: share of predictions equal to the true label.
acc = (predicted == y_val).mean()
print(acc)

In [None]:
# Text column of the test set; the trailing expression displays its shape.
# NOTE(review): `test` is not defined in any visible cell, so on a fresh kernel this
# raises NameError unless it is loaded elsewhere (or exported by `utils_functions`) —
# confirm and load the test table explicitly before this cell.
X_te = test.Text.values
X_te.shape

In [None]:
# Predicted class probabilities for the test texts from the fitted `clf` pipeline
# (despite the name, `y_te` holds model outputs, not true labels); the trailing
# expression displays the shape of the result.
y_te = clf.predict_proba(X_te)
y_te.shape

## 3.2 Bag-of-words *3 groups +xgboost

In [None]:
# Select one dataframe column for vectorization
def build_preprocessor(df, field, preprocessor=None):
    """Build a vectorizer preprocessor that works on one column of a row.

    The returned callable receives a whole row (any sequence indexable by
    position and laid out in the column order of ``df``), picks the entry that
    belongs to ``field`` and runs a text preprocessor on it. This lets several
    text vectorizers share the same 2-D input while each one reads a different
    column.

    Parameters
    ----------
    df : object exposing ``columns``
        Only used to look up the position of ``field``.
    field : str
        Name of the column the returned callable should extract.
    preprocessor : callable, optional
        Function applied to the extracted value. When omitted, the module-level
        ``default_preprocessor`` is used; it is looked up when the returned
        callable runs, so it may be defined after this function.

    Returns
    -------
    callable
        ``row -> preprocessed value of row[<position of field>]``.

    Raises
    ------
    ValueError
        If ``field`` is not one of ``df.columns``.
    """
    field_idx = list(df.columns).index(field)

    def _select_and_preprocess(row):
        # Resolve the fallback lazily so the global can be (re)defined later.
        base = default_preprocessor if preprocessor is None else preprocessor
        return base(row[field_idx])

    return _select_and_preprocess
# Preprocessor of a CountVectorizer with default settings; `build_preprocessor`
# applies it to the selected column value.
default_preprocessor = CountVectorizer().build_preprocessor()    

In [None]:
# One TF-IDF vectorizer per column, concatenated side by side. Each vectorizer gets
# a preprocessor that pulls its own column out of the row it is handed.
vectorizer = FeatureUnion([
    (column, TfidfVectorizer(preprocessor=build_preprocessor(X, column)))
    for column in ('Variation', 'Gene', 'Text')
])

In [None]:
# Split the raw rows FIRST and fit the TF-IDF union on the training rows only.
# Fitting the vectorizer on the whole frame before splitting lets the validation
# texts shape the vocabulary and IDF weights (data leakage), which makes the
# validation score optimistic. Split parameters are unchanged.
X_tr_raw, X_val_raw, y_tr, y_val = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=42)
X_tr = vectorizer.fit_transform(X_tr_raw.values)
X_val = vectorizer.transform(X_val_raw.values)

# Matrix for the full frame in the train-fitted feature space, kept under its
# original name in case later cells use it.
X_v = vectorizer.transform(X.values)

In [None]:
# XGBoost classifier (soft-probability multi-class objective, fixed seed) trained
# on the vectorised training split.
# NOTE(review): recent xgboost releases expect class labels encoded as
# 0..n_classes-1 — confirm the values in `Class` suit the installed version.
model = xgb.XGBClassifier(objective="multi:softprob", random_state=42)
model.fit(X_tr, y_tr)

In [None]:
# Accuracy score of the fitted model on the validation split
# (shown as the cell output).
model.score(X_val,y_val)

In [None]:
# Feature-weight table for the fitted model (top=10, bias term filtered out),
# with feature names resolved through the fitted vectorizer.
df_name = explain_weights_df(model, vec=vectorizer, top=10, feature_filter=lambda x: x != '<BIAS>')

# The export folder is not created anywhere else in the notebook; make sure it
# exists so the write below does not fail on a fresh checkout or kernel.
os.makedirs('../data/features', exist_ok=True)
df_name.to_csv('../data/features/20190609full_union_3groups_tfidf_feature_weights.csv')
df_name.head(10)

In [None]:
# Analyzing an individual prediction: the output shows a summary of each
# vectorizer's contribution at the top, with the contributing features
# highlighted in the text below.
# NOTE(review): the explained document is the row at position 1 of the full
# frame `X`, which is not necessarily part of the validation split — confirm
# that this is the intended example.
eli5.show_prediction(model, doc=X.values[1], vec=vectorizer)