In [27]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix

# import matplotlib.pyplot as plt
# import plotly.express as px
# import plotly.graph_objects as go
# import plotly.io as pio
# setting Jedha color palette as default
# pio.templates["jedha"] = go.layout.Template(
#     layout_colorway=["#4B9AC7", "#4BE8E0", "#9DD4F3", "#97FBF6", "#2A7FAF", "#23B1AB", "#0E3449", "#015955"]
# )
# pio.templates.default = "jedha"
# pio.renderers.default = "svg" # to be replaced by "iframe" if working on JULIE
# from IPython.display import display

In [28]:
my_filename = "v5"
enabled = False

In [29]:
data = pd.read_csv('conversion_data_train.csv')

# dropping outliers

print(data[data.age > 100])
data = data.drop(data[data.age > 100].index)
print(data[data.age > 100])

        country  age  new_user source  total_pages_visited  converted
11331        UK  111         0    Ads                   10          1
233196  Germany  123         0    Seo                   15          1
Empty DataFrame
Columns: [country, age, new_user, source, total_pages_visited, converted]
Index: []


In [30]:
Y = data['converted']
X = data.drop('converted', axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=1337, stratify=Y)

In [31]:
num_feat = ['age', 'total_pages_visited']
cat_feat = ['country', 'new_user', 'source']

# Create pipeline for numeric features
numeric_transformer = Pipeline(steps=[
#    ('imputer', KNNImputer()), 
    ('scaler', StandardScaler())
])

# Create pipeline for categorical features
categorical_transformer = Pipeline(
    steps=[
#    ('imputer', SimpleImputer(strategy='most_frequent')), # missing values will be replaced by most frequent value
#    ('encoder', OneHotEncoder(drop='first')) # first column will be dropped to avoid creating correlations between features
    ('encoder', OneHotEncoder())
    ])

# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_feat),
        ('cat', categorical_transformer, cat_feat)
    ])

# Preprocessings on train set

X_train = preprocessor.fit_transform(X_train)

# Preprocessings on test set

X_test = preprocessor.transform(X_test) # Don't fit again !! 


In [32]:
# Train model

model = LogisticRegression()

# Best: 0.762456 using {'C': 1000.0, 'penalty': 'l2', 'solver': 'newton-cg'}
# Best: 0.762456 using {'C': 1000.0, 'penalty': 'l1', 'solver': 'saga'}
# Best: 0.762456 using {'C': 10000.0, 'penalty': 'l1', 'solver': 'saga'}

# solvers = ['newton-cg', 'lbfgs', 'liblinear', 'saga']
# penalty = ['l2', 'elasticnet', 'none']
# c_val = [100., 10., 1., 0.1, 0.01]
solvers = ['saga', 'newton-cg']
penalty = ['elasticnet', 'l1', 'l2', None]
c_val = [10_000., 5_000., 2_000., 1_000., 500., 200., 100., 50., 10., 0.1, 0.01]
params = dict(solver=solvers, penalty=penalty, C=c_val)

grid_search = GridSearchCV(estimator=model, param_grid=params, n_jobs=-1, cv=3, scoring='f1', verbose=2)
grid_result = grid_search.fit(X_train, Y_train)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

classifier = grid_result.best_estimator_

Fitting 3 folds for each of 88 candidates, totalling 264 fits


99 fits failed out of a total of 264.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
33 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Roumegaire\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Roumegaire\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1291, in fit
    fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, prefer=prefer)(
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Roumegaire\anaconda3\Lib\site-packages\sklearn\utils\parallel.py", line 63, in __call__
    return super().__call

Best: 0.762456 using {'C': 10000.0, 'penalty': 'l1', 'solver': 'saga'}


In [33]:
# Predictions

Y_train_pred = classifier.predict(X_train)
Y_test_pred = classifier.predict(X_test)

In [34]:
# Assessment

print("f1-score on train set : ", f1_score(Y_train, Y_train_pred))
print("f1-score on test set : ", f1_score(Y_test, Y_test_pred))

# Confusion Matrices

print("Confusion matrix on train set : ")
print(confusion_matrix(Y_train, Y_train_pred))
print()
print("Confusion matrix on test set : ")
print(confusion_matrix(Y_test, Y_test_pred))
print()

f1-score on train set :  0.7623842126459928
f1-score on test set :  0.7743865948533812
Confusion matrix on train set : 
[[246901    959]
 [  2581   5679]]

Confusion matrix on test set : 
[[27434   106]
 [  271   647]]



In [35]:
classifier.coef_

array([[-0.61368843,  2.5252222 , -3.57328993,  0.26281682,  0.11336857,
        -0.2541927 , -0.86540835, -2.58580426, -1.06709957, -1.27911221,
        -1.10495807]])

## Log Reg de base

f1-score on train set :  0.7626599147121536
f1-score on test set :  0.7788944723618091
Confusion matrix on train set : 
[[246837    985]
 [  2577   5723]]

Confusion matrix on test set : 
[[27486    92]
 [  260   620]]

## Log Reg stratifié

f1-score on train set :  0.7645759421648034
f1-score on test set :  0.7568223165554882
Confusion matrix on train set : 
[[246894    966]
 [  2551   5711]]

Confusion matrix on test set : 
[[27433   107]
 [  294   624]]

 ## Dec Tree de base

 f1-score on train set :  0.7949955803358943
f1-score on test set :  0.7459066100667071
Confusion matrix on train set : 
[[247259    601]
 [  2414   5846]]

Confusion matrix on test set : 
[[27424   116]
 [  303   615]]

## Random Forest de base

f1-score on train set :  0.7993611499301257
f1-score on test set :  0.759433962264151
Confusion matrix on train set : 
[[247099    761]
 [  2254   6006]]

Confusion matrix on test set : 
[[27406   134]
 [  274   644]]

-------
-------

## Model performance on official test data
## Production of file to be scored

In [36]:
enabled = False

In [37]:
# train model on whole data

if enabled:
    print("Fitting model on whole data")
    X_total = np.append(X_train,X_test,axis=0)
    Y_total = np.append(Y_train,Y_test)

    classifier.fit(X_total,Y_total)

In [38]:
# Preparing test data

if enabled:
    print("Preparing test data for prediction")
    data_without_labels = pd.read_csv('conversion_data_test.csv')

    # Warning : check consistency of features_list (must be the same than the features 
    # used by your best classifier)
    features_list = num_feat + cat_feat
    X_without_labels = data_without_labels[features_list]

    X_without_labels = preprocessor.transform(X_without_labels)

In [39]:
# Make predictions and dump to file
# WARNING : MAKE SURE THE FILE IS A CSV WITH ONE COLUMN NAMED 'converted' AND NO INDEX !
# WARNING : FILE NAME MUST HAVE FORMAT 'conversion_data_test_predictions_[name].csv'
# where [name] is the name of your team/model separated by a '-'
# For example : [name] = AURELIE-model1

if enabled:
    data = {
        'converted': classifier.predict(X_without_labels)
    }

    Y_predictions = pd.DataFrame(columns=['converted'],data=data)
    filename = 'conversion_data_test_predictions_guillaume-' + my_filename + ".csv"
    print("Predicting test data and writing to file:", filename)
    Y_predictions.to_csv(filename, index=False)