In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
# import ensemble methods
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier
from xgboost import XGBClassifier

#from xgboost import XGBClassifier
# import base estimators
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay, RocCurveDisplay
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, RocCurveDisplay
import seaborn as sns

In [2]:
# Load the labelled training data; the file is resolved relative to the
# notebook's working directory.
df = pd.read_csv('conversion_data_train.csv')

In [3]:
# Remove rows whose age is an outlier (80 or above), using a boolean mask.
# Note: a missing age compares as False against the threshold, so rows with a
# null age are dropped by this mask as well.
print('Dropping outliers in age...')
to_keep = df['age'].lt(80)
df = df[to_keep]
print('Done. Number of lines remaining : ', len(df))
print()

Dropping outliers in age...
Done. Number of lines remaining :  284578



In [4]:
# Separate the target column from the explanatory variables.
y = df["converted"]
X = df.drop(columns="converted")

# Hold out 10% of the rows as a test set. The split is stratified on the target
# and seeded so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
    stratify=y,
)

In [5]:
# Automatically detect names of numeric/categorical columns.
# A column is treated as numeric when its dtype name contains 'float' or 'int';
# every other column is treated as categorical.
numeric_features = []
categorical_features = []
# Series.iteritems() was removed in pandas 2.0; Series.items() yields the same
# (column name, dtype) pairs and exists in old and new pandas alike.
for column_name, column_dtype in X.dtypes.items():
    dtype_name = str(column_dtype)
    if ('float' in dtype_name) or ('int' in dtype_name):
        numeric_features.append(column_name)
    else:
        categorical_features.append(column_name)

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

Found numeric features  ['age', 'new_user', 'total_pages_visited']
Found categorical features  ['country', 'source']


In [6]:
# Numeric columns are standardised.
numeric_transformer = StandardScaler()

# Categorical columns are one-hot encoded; the first level of each feature is
# dropped so the resulting dummy columns are not perfectly correlated.
categorical_transformer = OneHotEncoder(drop='first')

# Single preprocessor object applying each transformer to its own column group.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [7]:
# Train set: fit the preprocessor and transform in one step.
# NOTE: X_train / X_test are rebound to the transformed arrays below, so this
# cell should only be run once after the train/test split cell.
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print('...Done.')
# The transformed data is an array rather than a DataFrame, hence slicing
# instead of .head().
print(X_train[:5])
print()

# Test set: transform only. The preprocessor must NOT be fitted again here:
# the test set validates decisions made on the training set, so only
# transformations whose parameters were learned on the training set may be
# applied. Refitting would leak test-set information and bias every result.
print("Performing preprocessings on test set...")
print(X_test.head())
X_test = preprocessor.transform(X_test)
print('...Done.')
print(X_test[:5, :])
print()

Performing preprocessings on train set...
       country  age  new_user source  total_pages_visited
208614      US   18         0    Seo                    4
31938    China   49         0    Seo                    5
192257   China   46         1    Seo                    3
206730      US   18         0    Ads                    1
7792        US   22         1    Ads                    6
...Done.
[[-1.51987065 -1.47678829 -0.26099836  0.          0.          1.
   0.          1.        ]
 [ 2.2307148  -1.47678829  0.03829244  0.          0.          0.
   0.          1.        ]
 [ 1.86775492  0.67714513 -0.56028917  0.          0.          0.
   0.          1.        ]
 [-1.51987065 -1.47678829 -1.15887077  0.          0.          1.
   0.          0.        ]
 [-1.03592414  0.67714513  0.33758325  0.          0.          1.
   0.          0.        ]]

Performing preprocessings on test set...
       country  age  new_user  source  total_pages_visited
269237   China   24         1     

In [8]:
# Perform grid search over a bagging ensemble of logistic regressions.
print("Grid search...")
logistic_regression = LogisticRegression()  # default settings; only C is tuned below
# Seed the ensemble so the bootstrap draws (and therefore the scores) are
# reproducible from one run to the next.
model = BaggingClassifier(logistic_regression, random_state=0)

# BaggingClassifier's wrapped-estimator parameter was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2, and the old name was
# removed in 1.4. Pick whichever name the installed version exposes so the
# grid keys stay valid on both old and new releases.
if 'estimator' in model.get_params(deep=False):
    nested_prefix = 'estimator'
else:
    nested_prefix = 'base_estimator'

# Grid of values to be tested
params = {
    nested_prefix + '__C': [0.01, 0.1],  # "<prefix>__" because C belongs to the wrapped LogisticRegression
    'n_estimators': [5, 10],  # n_estimators is a hyperparameter of the ensemble method
}
print(params)
# cv: number of folds used for cross-validation. Scoring is left at the
# estimator's default (accuracy for a classifier).
# NOTE(review): later cells report the f1-score; consider scoring='f1' if that
# is the metric the model should be selected on.
gridsearchR = GridSearchCV(model, param_grid=params, cv=5)
gridsearchR.fit(X_train, y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearchR.best_params_)
print("Best validation accuracy : ", gridsearchR.best_score_)
print()
print("Accuracy on training set : ", gridsearchR.score(X_train, y_train))
print("Accuracy on test set : ", gridsearchR.score(X_test, y_test))


Grid search...
{'base_estimator__C': [0.01, 0.1], 'n_estimators': [5, 10]}
...Done.
Best hyperparameters :  {'base_estimator__C': 0.1, 'n_estimators': 5}
Best validation accuracy :  0.9861510229579885

Accuracy on training set :  0.9861275964391691
Accuracy on test set :  0.9864361515215405


In [15]:
# Build a fresh, unfitted copy of the best configuration found by the grid
# search, instead of hand-copying the winning values from its printed output
# (which silently goes stale when the grid or the data changes).
best_params = gridsearchR.best_params_
# The grid key for C is '<prefix>__C', where the prefix is 'base_estimator' or
# 'estimator' depending on the scikit-learn version; match on the common suffix.
best_C = next(value for key, value in best_params.items() if key.endswith('estimator__C'))

estimator = LogisticRegression(C=best_C)
# Seeded so the final model is reproducible.
Mod = BaggingClassifier(estimator, n_estimators=best_params['n_estimators'], random_state=0)

In [10]:
# Predict with the refitted best estimator of the grid search, first on the
# training set...
print("Predictions on training set...")
Y_train_pred = gridsearchR.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

# ...then on the held-out test set.
print("Predictions on test set...")
Y_test_pred = gridsearchR.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

# f1-score of the predictions against the true labels, for both sets.
print("f1-score on training set : ", f1_score(y_true=y_train, y_pred=Y_train_pred))
print("f1-score on test set : ", f1_score(y_true=y_test, y_pred=Y_test_pred))

Predictions on training set...
...Done.
[0 0 0 ... 0 0 0]

Predictions on test set...
...Done.
[0 0 0 ... 0 0 0]

f1-score on training set :  0.7608534697449014
f1-score on test set :  0.7674698795180722


In [16]:
# Stack the (already preprocessed) train and test sets back together so the
# final classifier is trained on every labelled row.
# NOTE: this rebinds X, which previously held the raw feature DataFrame.
X = np.concatenate((X_train, X_test), axis=0)
Y = np.concatenate((y_train, y_test))

Mod.fit(X, Y)

In [23]:
# Predictions of the final model on the data it was trained on (all labelled
# rows), so the score below is an in-sample score.
print("Predictions on training set...")
Y_pred = Mod.predict(X)
print("...Done.")
# Show the predictions just computed (the original printed Y_train_pred, which
# belongs to the grid-search model from an earlier cell).
print(Y_pred)
print()


print("f1-score on training set : ", f1_score(Y, Y_pred))


Predictions on training set...
...Done.
[0 0 0 ... 0 0 0]

f1-score on training set :  0.7618759088705769


In [20]:
# Load the unlabelled data on which predictions must be produced.
data_without_labels = pd.read_csv('conversion_data_test.csv')
print('Prediction set (without labels) :', data_without_labels.shape)

# Warning: features_list must contain exactly the features used by the
# classifier that was trained above.
features_list = ['total_pages_visited', 'age', 'new_user', 'country', 'source']
# Kept as a DataFrame (not converted to a numpy array): the preprocessor was
# configured with column names.
X_without_labels = data_without_labels[features_list]

Prediction set (without labels) : (31620, 5)


In [21]:
# Apply the preprocessor fitted on the training set (transform only, no refit).
# NOTE: X_without_labels is rebound to the transformed array, so this cell
# should only be run once after the cell that loads the unlabelled data.
X_without_labels = preprocessor.transform(X_without_labels)
print("...Done")

...Done


In [22]:
# Predict the unlabelled rows with the final model and export the result as a
# single-column CSV without the index.
data = {'converted': Mod.predict(X_without_labels)}

Y_predictions = pd.DataFrame(data=data, columns=['converted'])
Y_predictions.to_csv('conversion_data_test_predictions_Feriel.csv', index=False)
