In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, confusion_matrix,roc_auc_score,roc_curve,f1_score
from sklearn.model_selection import cross_val_score, cross_val_predict,KFold,train_test_split,GridSearchCV

#modeles

from sklearn import linear_model as lm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


In [2]:
# Import dataset
print("Loading dataset...")
dataset = pd.read_csv("conversion_data_train.csv")
print("...Done.")
print()

Loading dataset...
...Done.



In [None]:
dataset.head(5)

In [None]:
# Basic stats
dataset.describe(include='all')

In [None]:
dataset.converted.value_counts()

In [None]:
dataset.columns

In [None]:
dataset.source.value_counts()

In [39]:
dataset['new_age'] = 'Nan'

In [40]:
dataset['new_age'] = dataset.age.apply(lambda x: 20 if x < 20
                                    else 25 if x<25
                                    else 30 if x<30
                                    else 35 if x<35
                                    else 40 if x<40
                                    else 50 if x<50
                                    else 65)

In [None]:
dataset.total_pages_visited.value_counts()

In [41]:
dataset.new_age.value_counts()

30    63032
35    59697
25    52216
40    44203
50    37056
20    22939
65     5437
Name: new_age, dtype: int64

In [None]:
dataset.age.value_counts(sort = False)

In [None]:
sns.distplot(dataset.new_age)

In [None]:
sns.distplot?

In [None]:
sns.distplot(dataset.total_pages_visited)

In [None]:
sns.relplot(y ='converted', x = 'total_pages_visited', data = dataset)

##Preprocessing

In [6]:
features_list = ['country', 'new_age', 'new_user','source', 'total_pages_visited']
target_variable = 'converted'

In [42]:
X = dataset.loc[:, features_list]
Y = dataset.loc[:, target_variable]

print('Variables explicatives : ', X.columns)
print()

Variables explicatives :  Index(['country', 'new_age', 'new_user', 'source', 'total_pages_visited'], dtype='object')



In [43]:
idx = 0
numeric_features = []
numeric_indices = []
categorical_features = []
categorical_indices = []
for i,t in X.dtypes.iteritems():
  if ('float' in str(t)) or ('int' in str(t)) :
    numeric_features.append(i)
    numeric_indices.append(idx)
  else :
    categorical_features.append(i)
    categorical_indices.append(idx)

  idx = idx + 1

print('Found numeric features ', numeric_features,' at positions ', numeric_indices)
print('Found categorical features ', categorical_features,' at positions ', categorical_indices)

Found numeric features  ['new_age', 'new_user', 'total_pages_visited']  at positions  [1, 2, 4]
Found categorical features  ['country', 'source']  at positions  [0, 3]


In [44]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size=0.15, 
                                                    random_state=0,
                                                    stratify=Y)

In [45]:
# Encoding categorical features and standardizing numerical features
# Normalization
print("Avant")
print(X_train.head())
print(X_test.head())
numeric_transformer = StandardScaler()
# OHE / dummyfication
categorical_transformer = OneHotEncoder(drop='first')

featureencoder = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_indices),    
        ('num', numeric_transformer, numeric_indices)
        ]
    )

X_train = featureencoder.fit_transform(X_train)
X_test = featureencoder.transform(X_test)
print("Après")
print(X_train[0:5,:])
print(X_test[0:5,:])

Avant
       country  new_age  new_user source  total_pages_visited
44620       UK       30         1    Seo                    9
46076    China       30         1    Seo                    2
209022      US       35         0    Seo                    3
65284       US       40         1    Seo                    2
24604       US       20         1    Seo                    1
       country  new_age  new_user  source  total_pages_visited
229800      US       25         0  Direct                   10
148977   China       20         0     Seo                    2
201871      US       50         0     Ads                    7
101289      US       25         1     Seo                    8
164682      US       25         1     Ads                    2
Après
[[ 0.          1.          0.          0.          1.         -0.43394981
   0.67658432  1.23430796]
 [ 0.          0.          0.          0.          1.         -0.43394981
   0.67658432 -0.86031225]
 [ 0.          0.          1.       

In [46]:
## instanciation des modèles de classification:

#Lineaires

my_logisticReg = lm.LogisticRegression(penalty="none",solver="lbfgs")
my_logisticRegCV = lm.LogisticRegressionCV()
my_ridgeClass = lm.RidgeClassifier()
my_ridgeClassCV = lm.RidgeClassifierCV()

my_decisionTree = DecisionTreeClassifier()
my_randomForest = RandomForestClassifier()
#mybest_randomForest = RandomForestClassifier(n_estimators=400,max_depth=110,min_samples_leaf=8,min_samples_split=10)
mybest_logisticReg = lm.LogisticRegression(penalty="l1",C=0.23357214690901212)


In [47]:
my_logisticReg.fit(X_train,Y_train)
my_logisticRegCV.fit(X_train,Y_train)
my_ridgeClass.fit(X_train,Y_train)
my_ridgeClassCV.fit(X_train,Y_train)

my_decisionTree.fit(X_train,Y_train)
my_randomForest.fit(X_train,Y_train)

#mybest_randomForest.fit(X_train,Y_train)
mybest_logisticReg.fit(X_train,Y_train)




LogisticRegression(C=0.23357214690901212, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
### Grid Search sur Logistic Regression


print("Train model...")

classifier2 = lm.LogisticRegression()

params = {
    'penalty' : ['l1', 'l2'],
    'C' : np.logspace(-4, 4, 20)
}

best_classifier_Log = GridSearchCV(classifier2, params,n_jobs=-1,verbose=2)
%time best_classifier_Log.fit(X_train, Y_train)
print("...Done.")

In [None]:
### Grid Search sur RandomForest


print("Train model...")

classifier = RandomForestClassifier()

params = {
    'n_estimators' : [50,100,200,400],
    'max_depth' : [40,70,110],
    'min_samples_leaf': [2,4,8],
    'min_samples_split': [8,10,12],
}

best_classifier_rndmF = GridSearchCV(classifier, params,n_jobs=-1,verbose=2)
%time best_classifier_rndmF.fit(X_train, Y_train)
print("...Done.")

In [13]:
list_model= ['my_logisticReg',
             'my_logisticRegCV',
            'my_ridgeClass',
            'my_ridgeClassCV',
            'my_decisionTree',
            'my_randomForest',
            'mybest_logisticReg']

In [15]:
results = pd.DataFrame(columns=['model','f1_score_test','f1_score_train','accuracy_score_test','accuracy_score_train','run_nb'])

In [36]:
run_number = 0

In [48]:
#results = pd.DataFrame(columns=['model','f1_score_test','f1_score_train','accuracy_score_test','accuracy_score_train'])
run_number = run_number +1
for model_i in list_model:
    print(model_i)
    Y_train_predict_i = eval(model_i).predict(X_train)
    Y_test_predict_i = eval(model_i).predict(X_test)
    f1_score_i_train = f1_score(Y_train,Y_train_predict_i)
    f1_score_i_test = f1_score(Y_test,Y_test_predict_i)
    score_i_train = accuracy_score(Y_train,Y_train_predict_i)
    score_i_test = accuracy_score(Y_test,Y_test_predict_i)
    results =results.append({'model':model_i,
                             'f1_score_test':"{:0.4}".format(f1_score_i_test),
                             'f1_score_train':"{:0.4}".format(f1_score_i_train),
                             'accuracy_score_test':"{:.3%}".format(score_i_test),
                            'accuracy_score_train':"{:.3%}".format(score_i_train),
                            'run_nb':run_number},
                            ignore_index=True)

my_logisticReg
my_logisticRegCV
my_ridgeClass
my_ridgeClassCV
my_decisionTree
my_randomForest
mybest_logisticReg


In [49]:
results.sort_values(by='f1_score_test',ascending=False)

Unnamed: 0,model,f1_score_test,f1_score_train,accuracy_score_test,accuracy_score_train,run_nb
0,my_logisticReg,0.7643,0.7652,98.620%,98.630%,new_age2
1,my_logisticRegCV,0.7643,0.7652,98.620%,98.630%,new_age2
14,my_logisticReg,0.7622,0.764,98.616%,98.623%,new_age3
22,my_logisticRegCV,0.7622,0.7639,98.616%,98.622%,1
7,my_logisticReg,0.7622,0.764,98.616%,98.623%,new_age2
8,my_logisticRegCV,0.7622,0.7639,98.616%,98.622%,new_age2
21,my_logisticReg,0.7622,0.764,98.616%,98.623%,1
15,my_logisticRegCV,0.7622,0.7639,98.616%,98.622%,new_age3
27,mybest_logisticReg,0.7616,0.7641,98.611%,98.623%,1
6,mybest_logisticReg,0.7616,0.7623,98.611%,98.621%,new_age2


In [None]:
## affichage des confusion matrix:
for model_j in list_model:
    Y_test_pred_i = eval(model_j).predict(X_test)
    #sns.heatmap(confusion_matrix(Y_test, Y_test_pred_i),annot=True, ax=ax)
    print("Confusion Matrix de ",model_j)
    print(confusion_matrix(Y_test,Y_test_pred_i))
    

In [None]:
best_classifier_rndmF.score(X_test,Y_test)

In [None]:
best_classifier_rndmF.best_params_

In [None]:
best_classifier_Log.best_params_

In [None]:
best_classifier.best_score_

###  train on all data and make prediction without labels

In [19]:
X = np.append(X_train,X_test,axis=0)
Y = np.append(Y_train,Y_test)

In [20]:
my_logisticReg.fit(X,Y)
my_logisticRegCV.fit(X,Y)
my_ridgeClass.fit(X,Y)
my_ridgeClassCV.fit(X,Y)

my_decisionTree.fit(X,Y)
#my_randomForest.fit(X,Y)

mybest_logisticReg.fit(X,Y)



LogisticRegression(C=0.23357214690901212, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
best_classifier_rndmF.fit(X,Y)

In [21]:
# Read data without labels
data_without_labels = pd.read_csv('conversion_data_test.csv')
print('Prediction set (without labels) :', data_without_labels.shape)
data_without_labels['new_age'] = 'NaN'
data_without_labels['new_age'] = data_without_labels.age.apply(lambda x: 20 if x < 20
                                    else 25 if x<25
                                    else 30 if x<30
                                    else 35 if x<35
                                    else 40 if x<40
                                    else 50 if x<50
                                    else 65)

X_without_labels = data_without_labels.loc[:,features_list]

print(X_without_labels.head(5))



# Warning : check consistency of features_list (must be the same than the features 
# used by your best classifier)

# Convert pandas DataFrames to numpy arrays before using scikit-learn
print("Convert pandas DataFrames to numpy arrays...")
X_without_labels = featureencoder.transform(X_without_labels)
print("...Done")

print(X_without_labels[0:5,:])

Prediction set (without labels) : (31620, 5)
  country  new_age  new_user  source  total_pages_visited
0      UK       30         0     Seo                   16
1      UK       25         1  Direct                    5
2   China       35         1     Seo                    1
3      US       35         1     Ads                    6
4   China       30         0     Seo                    3
Convert pandas DataFrames to numpy arrays...
...Done
[[ 0.          1.          0.          0.          1.         -0.45676765
  -1.47801238  3.32892817]
 [ 0.          1.          0.          1.          0.         -0.87964634
   0.67658432  0.03738213]
 [ 0.          0.          0.          0.          1.         -0.03388896
   0.67658432 -1.1595437 ]
 [ 0.          0.          1.          0.          0.         -0.03388896
   0.67658432  0.33661359]
 [ 0.          0.          0.          0.          1.         -0.45676765
  -1.47801238 -0.56108079]]


In [22]:
## extraction des résultats

data = {
    'converted': my_logisticReg.predict(X_without_labels)
}

Y_predictions = pd.DataFrame(columns=['converted'],data=data)
Y_predictions.to_csv('conversion_data_test_predictions_Xavier-mylogisticReg-3.csv', index=False)
