In [5]:
import pandas as pd

import matplotlib.pyplot as plt
import numpy as np
import sklearn


from sklearn.preprocessing import normalize
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.cross_validation import StratifiedKFold

import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including any deprecation notices raised by the libraries
# imported in this cell. Consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

import utils_veolia


In [4]:
# Load the project data through the local utils_veolia helper, which returns three objects.
# NOTE(review): presumably raw features, raw targets and the submission features (inferred from the names) — confirm in utils_veolia.
input_raw, output_raw, input_submission = utils_veolia.load_data()
# Build the feature table with year=2014 and the extra features switched on.
input_preprocessed = utils_veolia.preprocess(input_raw, year=2014, more_features=True)
# Split targets and features into train and test parts.
# NOTE(review): the helper name suggests a K-fold based split; fold count and seeding are not visible here.
input_train, output_train, input_test, output_test = utils_veolia.split_train_test_Kfold(output_raw, input_preprocessed)

## SMOTE example

In [6]:
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import  SMOTE 

In [8]:
# Synthetic two-class problem used to demonstrate SMOTE: 1000 samples,
# 20 features, a 10% / 90% class split and no label flipping.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=3,
    n_redundant=1,
    n_classes=2,
    n_clusters_per_class=1,
    weights=[0.1, 0.9],
    class_sep=2,
    flip_y=0,
    random_state=10
)

In [13]:
# Oversample the synthetic data with SMOTE (seeded for repeatability).
# NOTE(review): newer imbalanced-learn releases name this method fit_resample — confirm against the installed version before upgrading.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_sample(X, y)
# Print the per-class sample counts after resampling.
print('Resampled dataset shape {}'.format(Counter(y_res)))

Resampled dataset shape Counter({0: 900, 1: 900})


## SMOTE applied

In [26]:
# Oversample the training split using the '2014' target column.
# Only the training data is passed to SMOTE; the test split is not resampled.
sm = SMOTE(random_state=42)
X_balanced, y_balanced = sm.fit_sample(input_train, output_train['2014'])

## Test on classifier

In [47]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

In [62]:
# Candidate ensemble classifiers, each limited to 10 estimators.
# All three are stochastic, so each one gets a fixed seed (SMOTE in this
# notebook is already seeded the same way); refitting on the same data then
# yields the same model instead of a different one on every run.
RFC = RandomForestClassifier(n_estimators=10, max_depth=20, random_state=42)
GBC = GradientBoostingClassifier(n_estimators=10, max_depth=20, random_state=42)
ABC = AdaBoostClassifier(n_estimators=10, random_state=42)

In [63]:
# Choose which candidate model (RFC, GBC or ABC) the evaluation cells below will fit.
classifier = ABC

In [64]:
# Fit the selected classifier on SMOTE-balanced training data for one target
# year and evaluate it on the untouched test split.
YEAR = 2014
# False -> hard class labels plus a classification report;
# True  -> keep only the second column of predict_proba (scores for label 1).
predict_proba = False

# Basic preprocessing
input_preprocessed = utils_veolia.preprocess(
    input_raw, year=YEAR, more_features=True
)
(input_train, output_train,
 input_test, output_test) = utils_veolia.split_train_test_Kfold(
    output_raw, input_preprocessed
)

# SMOTE oversampling (training split only)
sm = SMOTE(random_state=42)
X_balanced, y_balanced = sm.fit_sample(
    input_train, output_train[str(YEAR)]
)

X_test = input_test
Y_test_2014 = utils_veolia.preprocess_output(output_test, year=YEAR)

# rdm_1 is the same object as `classifier`, not a copy.
rdm_1 = classifier
rdm_1.fit(X_balanced, y_balanced)

if not predict_proba:
    y_pred_2014 = rdm_1.predict(X_test)
    # The report needs hard labels, so it is only printed in this branch.
    print(classification_report(Y_test_2014, y_pred_2014))
else:
    y_pred_2014 = rdm_1.predict_proba(X_test)[:, 1]

             precision    recall  f1-score   support

          0       1.00      0.83      0.91      9689
          1       0.01      0.67      0.02        24

avg / total       1.00      0.83      0.91      9713



In [65]:
# Same evaluation as the 2014 cell, for the 2015 target.
# Relies on `classifier` and `predict_proba` defined in earlier cells.
YEAR = 2015
# Basic preprocessing
input_preprocessed = utils_veolia.preprocess(input_raw, year=YEAR, more_features=True)
input_train, output_train, input_test, output_test = utils_veolia.split_train_test_Kfold(output_raw, input_preprocessed)

# SMOTE oversampling (training split only)
sm = SMOTE(random_state=42)
X_balanced, y_balanced = sm.fit_sample(input_train, output_train[str(YEAR)])

X_test = input_test
Y_test_2015 = utils_veolia.preprocess_output(output_test, year = YEAR)

# rdm_1 is the same estimator object as `classifier`; fitting it here
# replaces the model fitted in the 2014 cell.
rdm_1 = classifier
rdm_1.fit(X_balanced, y_balanced)

if predict_proba:
    y_pred_2015 = rdm_1.predict_proba(X_test)
    # Bug fix: keep the second column of THIS cell's probability matrix.
    # The previous code sliced y_pred_2014 instead, which is already 1-D at
    # this point (so [:,1] raises IndexError) and belongs to the wrong year.
    y_pred_2015 = y_pred_2015[:,1]
else:
    y_pred_2015 = rdm_1.predict(X_test)
    # The report needs hard labels, so it is only printed in this branch.
    print(classification_report(Y_test_2015,y_pred_2015))

             precision    recall  f1-score   support

          0       1.00      0.86      0.93      9696
          1       0.01      0.47      0.01        17

avg / total       1.00      0.86      0.92      9713



In [66]:
# Stack the per-year results and transpose, so that 2014 and 2015 sit along the last axis.
# NOTE(review): assumes each input is 1-D, giving shape (n_samples, 2) — confirm the type returned by preprocess_output.
pred = np.array([y_pred_2014,y_pred_2015]).T
true = np.array([Y_test_2014,Y_test_2015]).T

# Score both years at once with the project's metric; the value is displayed as the cell output.
utils_veolia.score_function(true, pred)

0.71630707191781262

In [67]:
# Display the training features produced by the most recently run split (before SMOTE).
input_train

Unnamed: 0_level_0,Feature3,Length,Age,YearsOldLastFailure,P,T,IAB,O,U,C,...,DrM,MP,MT,MIAB,MO,MU,MC,MD,MDr,MM
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.015677,0.000557,0.014094,-0.007229,0,1,0,0,1,0,...,0,0,1,0,0,1,0,0,0,1
3,-0.003116,0.000545,0.003382,-0.007229,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,-0.003653,0.000256,0.004228,-0.007229,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
6,-0.001506,0.000532,0.010147,-0.007229,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7,-0.003116,0.006874,0.012684,-0.007229,0,1,1,0,0,0,...,0,0,1,1,0,0,0,0,0,1
8,-0.003116,0.000767,0.015785,-0.007229,0,1,1,0,0,0,...,0,0,1,1,0,0,0,0,0,1
9,-0.003116,0.000562,0.015785,-0.007229,0,1,1,0,0,0,...,0,0,1,1,0,0,0,0,0,1
10,-0.003116,0.000175,0.015785,-0.007229,0,1,1,0,0,0,...,0,0,1,1,0,0,0,0,0,1
11,0.004938,0.000865,0.004510,-0.007229,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
15,0.042524,0.001255,0.000564,-0.007229,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,0,1


In [68]:
# Display the SMOTE-resampled training features (shown as a plain NumPy array in the output).
X_balanced

array([[  1.56768266e-02,   5.56990176e-04,   1.40936103e-02, ...,
          0.00000000e+00,   0.00000000e+00,   1.00000000e+00],
       [ -3.11640805e-03,   5.44929176e-04,   3.38246647e-03, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [ -3.65335761e-03,   2.56303122e-04,   4.22808309e-03, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       ..., 
       [ -1.79006882e-03,   5.10854386e-03,   4.53015710e-03, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  2.25308757e-03,   1.50279987e-03,   6.48306074e-03, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [ -2.64965335e-03,   1.74227628e-02,   8.54278261e-03, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00]])

In [69]:
# Display the resampled labels returned by SMOTE alongside X_balanced.
y_balanced

array([0, 0, 0, ..., 1, 1, 1])

In [70]:
# Display the training targets; the frame has one column per year (2014 and 2015).
output_train

Unnamed: 0_level_0,2014,2015
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,0,0
3,0,0
4,0,0
6,0,0
7,0,0
8,0,0
9,0,0
10,0,0
11,0,0
15,0,0
