In [10]:
# Load the "autoreload" extension
%load_ext autoreload

# always reload modules marked with "%aimport"
%autoreload 1

import os
import sys
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split

# add the 'src' directory as one where we can import modules
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)

# import my method from the source code
%aimport data.read_data
%aimport models.train_model
%aimport features.build_features
%aimport visualization.visualize
from data.read_data import read_data, get_stopwords
from models.train_model import split_train, score_function, get_fasttext, model_ridge, model_xgb, Ensembler
from features.build_features import get_vec, to_categorical, replace_na, to_tfidf, stack_sparse, to_sparse_int
from visualization.visualize import plot_roc, plot_scatter

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Load the data set (read_data is called with test=False) and keep the
# 'Target' column as the label vector.
train = read_data(test=False)
y = train['Target']
# Stop-word collection; it is passed to to_tfidf and get_vec in later cells.
stopwords = get_stopwords()
# Preview the first rows of the loaded frame.
train.head()

Unnamed: 0,ID,review_content,review_title,review_stars,product,Target
0,0,En appelant un acheteur pour demander si l'écr...,La Police s'inscrit en acheteur privé sur Pric...,5,2fbb619e3606f9b7c213e858a109cda771aa2c47ce50d5...,0
1,1,"Alors, là, on a affaire au plus grand Navet ja...",Chef D'Oeuvre Absolu en vue...,5,7b56d9d378d9e999d293f301ac43d044cd7b4786d09afb...,1
2,2,Effet garanti sur la terrase. Ils donnent immé...,Effet garanti sur la terrase. Ils donnent immé...,3,7b37bf5dcb2fafd9229897910318a7dfa11a04ca36893c...,0
3,3,tres bon rapport qualite prix tre pratique en ...,bon produit,4,77d2dbd504b933ab3aaf7cb0cd81c22f7c3549012f4f88...,1
4,4,Ordinateur de bureau trés bien pour quelqu'un ...,Apple Power MAC G4,3,f574512e7d2dd1dd73c7f8f804bf16f14c932c5651a01b...,1


In [3]:
# Feature engineering
# The helpers come from features.build_features; what each one returns is not
# visible in this notebook, only that stack_sparse accepts their outputs.

# Fill missing values in the two free-text columns before vectorising them.
text_columns = ['review_content', 'review_title']
train = replace_na(train, text_columns)

# One feature block per source column.
X_dummies = to_categorical(train, 'review_stars')
X_content = to_tfidf(train, 'review_content', stopwords)
X_title = to_tfidf(train, 'review_title', stopwords)
X_length = to_sparse_int(train, 'review_content')

# Combine the blocks, in this order, into a single feature matrix.
feature_blocks = [X_dummies, X_content, X_title, X_length]
sparse_merge = stack_sparse(feature_blocks)

In [4]:
# Load the fastText model; the log printed below this cell shows weights being
# read from ../data/external/wiki.fr.bin, which took about 30 s in that run.
model_fasttext = get_fasttext()

# Vectorise the raw review texts with the loaded model and the stop-word list.
review_texts = train['review_content'].values
xtrain = get_vec(review_texts, model_fasttext, stopwords)

[10:03:17] INFO loading projection weights from ../data/external/wiki.fr.bin
[10:03:17] DEBUG {'kw': {}, 'mode': 'rb', 'uri': '../data/external/wiki.fr.bin'}
[10:03:17] DEBUG encoding_wrapper: {'errors': 'strict', 'encoding': None, 'mode': 'rb', 'fileobj': <_io.BufferedReader name='../data/external/wiki.fr.bin'>}
[10:03:46] INFO loaded (1152449, 300) matrix from ../data/external/wiki.fr.bin


from sklearn.preprocessing import OneHotEncoder

# One-hot encode the target as a dense array.
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output` and 1.4 removed the old name, so try the new keyword first
# and fall back to the old one on releases that do not know it yet.
try:
    lbl_enc = OneHotEncoder(sparse_output=False)
except TypeError:
    lbl_enc = OneHotEncoder(sparse=False)

# reshape(-1, 1) turns the label vector into the single-column 2-D input that
# the encoder expects.
y_enc = lbl_enc.fit_transform(y.values.reshape(-1,1))

# NOTE(review): y_enc is not referenced by any other cell visible here —
# confirm it is still needed.
y_enc.shape

In [5]:
# Hold out 33% of the rows. The stacked sparse features, the get_vec vectors
# and the labels are passed together so all three are split on the same rows;
# random_state is fixed to make the partition repeatable.
(
    X_train_tfv, X_test_tfv,
    X_train_ft, X_test_ft,
    y_train, y_test,
) = train_test_split(
    sparse_merge,
    xtrain,
    y,
    test_size=0.33,
    random_state=7,
)

In [6]:
import xgboost as xgb

In [25]:
# Data used at each ensembling level, keyed by level index: level 0 gets the
# split of the stacked sparse features, level 1 the split of the get_vec vectors.
train_data_dict = {0: [X_train_tfv], 1: [X_train_ft]}
test_data_dict = {0: [X_test_tfv], 1: [X_test_ft]}

# Both levels use an XGBoost classifier with identical hyper-parameters, so the
# settings are written once and a separate estimator is created for each level.
xgb_params = dict(max_depth=7, n_estimators=200, colsample_bytree=0.8,
                  subsample=0.8, nthread=10, learning_rate=0.1)
model_dict = {level: [xgb.XGBClassifier(**xgb_params)] for level in (0, 1)}

# NOTE(review): the validation scores logged below are around 0.70; if the
# Ensembler's metric is one where higher is better, lower_is_better=True looks
# inconsistent — confirm against Ensembler in models.train_model.
ens = Ensembler(
    model_dict=model_dict,
    num_folds=3,
    task_type='classification',
    lower_is_better=True,
    save_path='',
)

# Fit on the training split, then predict the held-out split; the row counts
# are passed explicitly because fit/predict take them as arguments.
ens.fit(train_data_dict, y_train, lentrain=X_train_ft.shape[0])
preds = ens.predict(test_data_dict, lentest=X_test_ft.shape[0])

[10:21:46] INFO Found 2 classes
[10:21:46] INFO Training Level 0 Fold # 1. Model # 0
[10:22:06] INFO Predicting Level 0. Fold # 1. Model # 0
[10:22:06] INFO Level 0. Fold # 1. Model # 0. Validation Score = 0.704130
[10:22:06] INFO Training Level 0 Fold # 2. Model # 0
[10:22:27] INFO Predicting Level 0. Fold # 2. Model # 0
[10:22:28] INFO Level 0. Fold # 2. Model # 0. Validation Score = 0.697470
[10:22:28] INFO Training Level 0 Fold # 3. Model # 0
[10:22:50] INFO Predicting Level 0. Fold # 3. Model # 0
[10:22:50] INFO Level 0. Fold # 3. Model # 0. Validation Score = 0.705220
[10:22:50] INFO Level 0. Model # 0. Mean Score = 0.702273. Std Dev = 0.003425
[10:22:50] INFO Saving predictions for level # 0
[10:22:50] INFO Training Level 1 Fold # 1. Model # 0
[10:22:52] INFO Predicting Level 1. Fold # 1. Model # 0
[10:22:53] INFO Level 1. Fold # 1. Model # 0. Validation Score = 0.689030
[10:22:53] INFO Training Level 1 Fold # 2. Model # 0
[10:22:55] INFO Predicting Level 1. Fold # 2. Model # 0


In [28]:
import numpy as np

In [37]:
# Element-wise average of column 1 of the level-0 and level-1 predictions.
# NOTE(review): column 1 is presumably the positive-class probability —
# confirm against what Ensembler.predict returns.
level_columns = [preds[level][:, 1] for level in (0, 1)]
preds1 = np.mean(level_columns, axis=0)

In [38]:
# Display the averaged predictions; NumPy abbreviates the repr of long arrays.
preds1

array([ 0.09356066,  0.27001834,  0.41289055, ...,  0.68325749,
        0.70117071,  0.62061238])

In [39]:
# Score the averaged predictions against the held-out labels.
# NOTE(review): score_function is imported from models.train_model; the metric
# it computes is not visible here (ROC AUC?) — confirm.
score_function(y_test, preds1)

0.70782701307871121