In [56]:
# Load the "autoreload" extension
%load_ext autoreload

# always reload modules marked with "%aimport"
%autoreload 1

import os
import sys
import numpy as np
import pandas as pd
import gc

# Make the project's 'src' directory (next to the parent of the current working
# directory) importable. The path is normalized, and it is appended only when
# missing so that re-running this cell does not pile up duplicate entries.
src_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'src'))
if src_dir not in sys.path:
    sys.path.append(src_dir)

# import my method from the source code
%aimport data.read_data
%aimport models.train_model
%aimport models.predict_model
%aimport features.build_features
%aimport visualization.visualize
from data.read_data import read_data, get_stopwords
from models.train_model import split_train, score_function, model_ridge, model_xgb, model_ensembler
from models.predict_model import predict_test
from features.build_features import feature_extraction
from visualization.visualize import plot_roc, plot_scatter

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [34]:
# Load the train and test frames through the project's reader.
# NOTE(review): the exact meaning of test=True is defined in data.read_data.
train, test = read_data(test=True)

# Remember where the training rows end so the stacked feature matrices can be
# split back into their train and test parts later on.
nrow_train = train.shape[0]
y = train['Target']

# Stack train on top of test so both go through a single feature-extraction pass.
merge: pd.DataFrame = pd.concat([train, test])

# Selecting a single column yields a Series, not a DataFrame.
submission: pd.Series = test['ID']

# The separate frames are no longer needed once they are stacked; drop the
# references and trigger a collection to free memory.
del train, test
gc.collect()

stopwords = get_stopwords()

In [5]:
# Build two feature sets for the stacked train+test frame in one pass.
# NOTE(review): judging by the names, the first result is a sparse matrix and
# the second an embedding-based feature matrix -- confirm in
# features.build_features.
merge_sparse, merge_ft = feature_extraction(merge, stopwords)

[23:43:23] INFO loading projection weights from ../data/external/wiki.fr.bin
[23:43:23] DEBUG {'kw': {}, 'mode': 'rb', 'uri': '../data/external/wiki.fr.bin'}
[23:43:23] DEBUG encoding_wrapper: {'errors': 'strict', 'encoding': None, 'mode': 'rb', 'fileobj': <_io.BufferedReader name='../data/external/wiki.fr.bin'>}
[23:43:45] INFO loaded (1152449, 300) matrix from ../data/external/wiki.fr.bin


In [6]:
# Undo the train/test stacking: the first `nrow_train` rows belong to the
# training set, everything after them to the test set.
X_sparse, X_test_sparse = merge_sparse[:nrow_train], merge_sparse[nrow_train:]
X_ft, X_test_ft = merge_ft[:nrow_train], merge_ft[nrow_train:]

In [7]:
# Build the ensemble from both training feature sets and the labels; the
# returned object is used further down to predict on the test features.
ens = model_ensembler(X_sparse, X_ft, y)

[23:45:00] INFO Found 2 classes
[23:45:00] INFO Training Level 0 Fold # 1. Model # 0
[23:47:19] INFO Predicting Level 0. Fold # 1. Model # 0
[23:47:20] INFO Level 0. Fold # 1. Model # 0. Validation Score = 0.718949
[23:47:20] INFO Training Level 0 Fold # 2. Model # 0
[23:49:37] INFO Predicting Level 0. Fold # 2. Model # 0
[23:49:38] INFO Level 0. Fold # 2. Model # 0. Validation Score = 0.716949
[23:49:38] INFO Training Level 0 Fold # 3. Model # 0
[23:51:50] INFO Predicting Level 0. Fold # 3. Model # 0
[23:51:51] INFO Level 0. Fold # 3. Model # 0. Validation Score = 0.714180
[23:51:51] INFO Level 0. Model # 0. Mean Score = 0.716693. Std Dev = 0.001955
[23:51:51] INFO Training Level 0 Fold # 1. Model # 1
[23:52:31] INFO Predicting Level 0. Fold # 1. Model # 1
[23:52:34] INFO Level 0. Fold # 1. Model # 1. Validation Score = 0.714212
[23:52:34] INFO Training Level 0 Fold # 2. Model # 1
[23:53:14] INFO Predicting Level 0. Fold # 2. Model # 1
[23:53:19] INFO Level 0. Fold # 2. Model # 1. Val

In [8]:
# Test-time inputs for the ensembler, keyed 0 and 1: key 0 receives the sparse
# test features twice, key 1 the `X_test_ft` features.
test_data_dict = {
    0: [X_test_sparse, X_test_sparse],
    1: [X_test_ft],
}
preds = ens.predict(test_data_dict, lentest=X_test_ft.shape[0])

# Final value per row: element-wise mean of column 1 of preds[0] and preds[1].
preds1 = np.mean([preds[0][:, 1], preds[1][:, 1]], axis=0)

[nltk_data] Downloading package punkt to /home/cris/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[23:54:21] INFO Training Fulldata Level 0. Model # 0
[23:57:12] INFO Predicting Test Level 0. Model # 0
[23:57:13] INFO Training Fulldata Level 0. Model # 1
[23:58:12] INFO Predicting Test Level 0. Model # 1
[23:58:16] INFO Training Fulldata Level 1. Model # 0
[23:58:21] INFO Predicting Test Level 1. Model # 0


In [18]:
# Display the averaged predictions (numpy abbreviates long arrays).
preds1

array([ 0.85922089,  0.00883833,  0.23765411, ...,  0.15293477,
        0.22380195,  0.69110957])

In [None]:
# Hand the test IDs and the averaged predictions to the project's helper.
# NOTE(review): presumably this writes the submission file -- confirm in
# models.predict_model. This cell carries no execution count (In [None]).
predict_test(submission,preds1)

In [58]:
# Read a previously saved, ';'-separated submission file back in.
df = pd.read_csv('../models/submission_18-01-07.csv', sep=';')

In [61]:
# Binarize 'Target': values strictly greater than 0.5 become 1, everything
# else (including NaN) becomes 0. The vectorized comparison replaces a
# per-row Python lambda; the explicit int64 cast keeps an integer column.
df['Target'] = (df['Target'] > 0.5).astype('int64')

In [62]:
# Save the frame as a ';'-separated CSV, leaving out the index column.
df.to_csv('../models/submission_1.csv', sep=';', index=False)

In [64]:
# Load the ';'-separated file from data/raw whose name marks it as the
# challenge's training output file; its 'Target' column is used as the
# reference for scoring below.
real = pd.read_csv('../data/raw/challenge_fichier_de_sortie_dentrainement_prediction_de_linteret_des_avis_utilisateurs.csv', sep=';')

In [77]:
# Take 20000 rows by position (iloc is 0-based and end-exclusive), i.e.
# positions 59999 through 79998.
# NOTE(review): if the intent is "rows 60000 onward" the slice would be
# 60000:80000 -- this looks like a possible off-by-one; confirm against how
# read_data builds its test split.
df_tmp = real.iloc[59999:79999]

In [78]:
# Sanity check on the number of rows and columns in the slice.
df_tmp.shape

(20000, 2)

In [79]:
# Score the predictions against the sliced reference labels.
# NOTE(review): `submission` is created earlier as test['ID'] (a single
# column), so submission['Target'] only works if the object was changed
# elsewhere -- verify with Restart & Run All; the cell execution counts are
# out of order.
# NOTE(review): the result (~0.49) is far below the ~0.72 validation scores
# logged during training, which suggests the labels and predictions may not
# be aligned row for row.
score_function(df_tmp['Target'], submission['Target'])

0.49455470902111398