In [8]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, confusion_matrix

# import matplotlib.pyplot as plt
# import plotly.express as px
# import plotly.graph_objects as go
# import plotly.io as pio
# setting Jedha color palette as default
# pio.templates["jedha"] = go.layout.Template(
#     layout_colorway=["#4B9AC7", "#4BE8E0", "#9DD4F3", "#97FBF6", "#2A7FAF", "#23B1AB", "#0E3449", "#015955"]
# )
# pio.templates.default = "jedha"
# pio.renderers.default = "svg" # to be replaced by "iframe" if working on JULIE
# from IPython.display import display

In [9]:
my_filename = "v6"
enabled = False

In [10]:
data = pd.read_csv('conversion_data_train.csv')

# dropping outliers

print(data[data.age > 100])
data = data.drop(data[data.age > 100].index)
print(data[data.age > 100])

        country  age  new_user source  total_pages_visited  converted
11331        UK  111         0    Ads                   10          1
233196  Germany  123         0    Seo                   15          1
Empty DataFrame
Columns: [country, age, new_user, source, total_pages_visited, converted]
Index: []


In [11]:
Y = data['converted']
X = data.drop('converted', axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=1337, stratify=Y)

In [12]:
num_feat = ['age', 'total_pages_visited']
cat_feat = ['country', 'new_user', 'source']

# Create pipeline for numeric features
numeric_transformer = Pipeline(steps=[
#    ('imputer', KNNImputer()), 
    ('scaler', StandardScaler())
])

# Create pipeline for categorical features
categorical_transformer = Pipeline(
    steps=[
#    ('imputer', SimpleImputer(strategy='most_frequent')), # missing values will be replaced by most frequent value
    ('encoder', OneHotEncoder(drop='first')) # first column will be dropped to avoid creating correlations between features
    ])

# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_feat),
        ('cat', categorical_transformer, cat_feat)
    ])

# Preprocessings on train set

X_train = preprocessor.fit_transform(X_train)

# Preprocessings on test set

X_test = preprocessor.transform(X_test) # Don't fit again !! 


In [13]:
# Train model

# classifier = RandomForestClassifier()
# classifier.fit(X_train, Y_train)

# Perform grid search
print("Grid search...")
logistic_regression = LogisticRegression(max_iter = 1000)
adaboost_logreg = AdaBoostClassifier(logistic_regression)

# Grid of values to be tested
params = {
    'estimator__C': [0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 50.0],
    'n_estimators': [5, 10, 20, 40, 60, 80, 100]
}
print(params)
gridsearch = GridSearchCV(adaboost_logreg, param_grid = params, cv = 3, verbose = 1) # cv : the number of folds to be used for CV
gridsearch.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best validation accuracy : ", gridsearch.best_score_)
print()
print("Accuracy on training set : ", gridsearch.score(X_train, Y_train))
print("Accuracy on test set : ", gridsearch.score(X_test, Y_test))

classifier = gridsearch.best_estimator_

Grid search...
{'estimator__C': [0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 50.0], 'n_estimators': [5, 10, 20, 40, 60, 80, 100]}
Fitting 3 folds for each of 56 candidates, totalling 168 fits
...Done.
Best hyperparameters :  {'estimator__C': 5.0, 'n_estimators': 60}
Best validation accuracy :  0.9861588292513522

Accuracy on training set :  0.9861471185381853
Accuracy on test set :  0.986682128048352


In [20]:
# Predictions

Y_train_pred = classifier.predict(X_train)
Y_test_pred = classifier.predict(X_test)

In [21]:
# Assessment

print("f1-score on train set : ", f1_score(Y_train, Y_train_pred))
print("f1-score on test set : ", f1_score(Y_test, Y_test_pred))

# Confusion Matrices

print("Confusion matrix on train set : ")
print(confusion_matrix(Y_train, Y_train_pred))
print()
print("Confusion matrix on test set : ")
print(confusion_matrix(Y_test, Y_test_pred))
print()

f1-score on train set :  0.7616552465403735
f1-score on test set :  0.7720986169573061
Confusion matrix on train set : 
[[246903    957]
 [  2591   5669]]

Confusion matrix on test set : 
[[27437   103]
 [  276   642]]



## Log Reg de base

f1-score on train set :  0.7626599147121536
f1-score on test set :  0.7788944723618091
Confusion matrix on train set : 
[[246837    985]
 [  2577   5723]]

Confusion matrix on test set : 
[[27486    92]
 [  260   620]]

## Log Reg stratifié

f1-score on train set :  0.7645759421648034
f1-score on test set :  0.7568223165554882
Confusion matrix on train set : 
[[246894    966]
 [  2551   5711]]

Confusion matrix on test set : 
[[27433   107]
 [  294   624]]

 ## Dec Tree de base

 f1-score on train set :  0.7949955803358943
f1-score on test set :  0.7459066100667071
Confusion matrix on train set : 
[[247259    601]
 [  2414   5846]]

Confusion matrix on test set : 
[[27424   116]
 [  303   615]]

## Random Forest de base

f1-score on train set :  0.7993611499301257
f1-score on test set :  0.759433962264151
Confusion matrix on train set : 
[[247099    761]
 [  2254   6006]]

Confusion matrix on test set : 
[[27406   134]
 [  274   644]]

-------
-------

## Model performance on official test data
## Production of file to be scored

In [22]:
enabled = True

In [23]:
# train model on whole data

if enabled:
    print("Fitting model on whole data")
    X_total = np.append(X_train,X_test,axis=0)
    Y_total = np.append(Y_train,Y_test)

    classifier.fit(X_total,Y_total)

Fitting model on whole data


In [24]:
# Preparing test data

if enabled:
    print("Preparing test data for prediction")
    data_without_labels = pd.read_csv('conversion_data_test.csv')

    # Warning : check consistency of features_list (must be the same than the features 
    # used by your best classifier)
    features_list = num_feat + cat_feat
    X_without_labels = data_without_labels[features_list]

    X_without_labels = preprocessor.transform(X_without_labels)

Preparing test data for prediction


In [25]:
# Make predictions and dump to file
# WARNING : MAKE SURE THE FILE IS A CSV WITH ONE COLUMN NAMED 'converted' AND NO INDEX !
# WARNING : FILE NAME MUST HAVE FORMAT 'conversion_data_test_predictions_[name].csv'
# where [name] is the name of your team/model separated by a '-'
# For example : [name] = AURELIE-model1

if enabled:
    data = {
        'converted': classifier.predict(X_without_labels)
    }

    Y_predictions = pd.DataFrame(columns=['converted'],data=data)
    filename = 'conversion_data_test_predictions_guillaume-' + my_filename + ".csv"
    print("Predicting test data and writing to file:", filename)
    Y_predictions.to_csv(filename, index=False)

Predicting test data and writing to file: conversion_data_test_predictions_guillaume-v6.csv
