# Prediction of "lead_general"

## 0 Dependencies

In [1]:
import pandas as pd

# String Encoding
from sklearn.preprocessing import LabelEncoder

# Creation of training and testing sets, and cross-validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

# Creation of balanced data sets
from imblearn.under_sampling import  RandomUnderSampler
#from collections import Counter
#from matplotlib import pyplot
from imblearn.over_sampling import SMOTE
#from imblearn.over_sampling import SMOTEN
from imblearn.over_sampling import RandomOverSampler

# PyCaret
from pycaret.classification import *

# ML algorithms
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Analysis of ML models
from sklearn.metrics import confusion_matrix

import json

from imblearn.pipeline import Pipeline
import numpy

# https://imbalanced-learn.org/stable/references/generated/imblearn.under_sampling.NearMiss.html
from imblearn.under_sampling import NearMiss

## 1 Data import

In [2]:
# Load the prepared cookie data. Every column is read as pandas' string dtype;
# conversion to numbers / label codes happens in the encoding section below.
cookie_data = pd.read_csv(
    './TU/Cookie_prepared.csv',
    sep=';',
    dtype='string',
)

# Print column names, non-null counts and dtypes as a quick sanity check.
cookie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252177 entries, 0 to 252176
Data columns (total 40 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   userid_1             252177 non-null  string
 1   visitsmonth1         252177 non-null  string
 2   visitsmonth3         252177 non-null  string
 3   zipcode              252177 non-null  string
 4   carsearch            252177 non-null  string
 5   conf_start_iv4       252177 non-null  string
 6   conf_start_gulfvar   252177 non-null  string
 7   conf_start_upper     252177 non-null  string
 8   conf_start_polnew    252177 non-null  string
 9   conf_start_pol       252177 non-null  string
 10  conf_start_crossing  252177 non-null  string
 11  conf_start_golf      252177 non-null  string
 12  conf_start_croc      252177 non-null  string
 13  conf_start_tour      252177 non-null  string
 14  conf_start_tiga      252177 non-null  string
 15  conf_start_pass      252177 non-nu

## 2 Preliminary Checks

As we imported the prepared data, there should be no NA values present anymore

In [3]:
# Number of missing values per column (Series indexed by column name).
NA_overview = cookie_data.isna().sum()
cookie_columns = cookie_data.columns

# Collect [column name, NA count] for every column with at least one NA value.
# Iterating over (label, value) pairs avoids integer-position lookups on a
# label-indexed Series, which recent pandas versions deprecate.
list_of_NA_columns = [
    [column, na_count]
    for column, na_count in NA_overview.items()
    if na_count > 0
]

# An empty list means the prepared data contains no NA values.
list_of_NA_columns

[]

## 3 Encode Non-numerical Values

The prepared data was not encoded, because we would then lose the dictionaries that were used to turn strings into integers. We need to encode it now and save the dictionaries for later use.

In [4]:
# Keep an untouched copy so every column is converted from its original string values.
cookie_data_backup = cookie_data.copy()
# Maps column name -> {original label: integer code} for each label-encoded column.
dictionaries = {}
le = LabelEncoder()

for col in cookie_data_backup:
    # col is a column name
    try:
        cookie_data[col] = pd.to_numeric(cookie_data_backup[col])
    except (ValueError, TypeError):
        # The column holds strings that cannot be parsed as numbers -> label-encode it.
        # Only conversion failures are caught, so interrupts and unrelated errors still surface.
        cookie_data[col] = le.fit_transform(cookie_data_backup[col])
        # A label's code is its position in le.classes_. Plain Python ints keep the
        # mapping JSON-serialisable, which NumPy integers are not.
        dictionaries[col] = {label: code for code, label in enumerate(le.classes_)}

cookie_data

Unnamed: 0,userid_1,visitsmonth1,visitsmonth3,zipcode,carsearch,conf_start_iv4,conf_start_gulfvar,conf_start_upper,conf_start_polnew,conf_start_pol,conf_start_crossing,conf_start_golf,conf_start_croc,conf_start_tour,conf_start_tiga,conf_start_pass,conf_start_art,conf_start_shar,conf_start_touar,conf_start_tig,conf_end_gulfvar,conf_end_upper,conf_end_polnew,conf_end_pol,conf_end_crossing,conf_end_iv3,conf_end_golf,conf_end_croc,conf_end_tour,conf_end_tiga,conf_end_pass,conf_end_art,conf_end_shar,conf_end_touar,conf_end_tig,days_since_visit,sessionsnbr,lead_general,lead_model1,lead_model2
0,1,0,0,4000,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,139,1,0,0,0
1,2,0,0,7673,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,203,1,0,0,0
2,3,0,0,5270,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,152,1,0,0,0
3,4,0,1,5452,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,43,1,0,0,0
4,5,0,0,6949,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,124,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
252172,252250,0,0,5998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,223,2,1,9,0
252173,252251,2,2,1844,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,5,2,1,13,0
252174,252252,0,0,1767,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,136,4,1,0,0
252175,252253,0,0,2738,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,155,4,1,26,0


## 4 Create Learn and Test Sets

In [5]:
# Human-readable names for the classifier keys used in the training cell;
# they are only used for progress output.
algorith_names = {
    'SGDC': 'Stochastic Gradient Descent Classifier',
    'SVC': 'Support Vector Classifier',
    'RFC': 'Random Forest Classifier',
    'NNC': 'Neural Network Classifier',
    'GBC': 'Gradient Boosting Classifier'
}

Because the training set is highly unbalanced, we need to take certain steps to not end up with a biased model.

In [6]:
# Both targets are predicted from the same feature columns (positions 1..36 of the
# encoded frame). Going by the frame displayed above, that range runs from
# visitsmonth1 to sessionsnbr and leaves out userid_1 and the lead_* columns.
# Each target gets an 80/20 train/test split with a fixed seed.

# Target: lead_model1
X1 = cookie_data.iloc[:, 1:37]
y1 = cookie_data['lead_model1']
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1, train_size=0.8, random_state=0
)

# Target: lead_model2
X2 = cookie_data.iloc[:, 1:37]
y2 = cookie_data['lead_model2']
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, train_size=0.8, random_state=0
)

## 5 Train Models

From our research we got five suggestions on which classifiers to use. These are:
- Stochastic Gradient Descent Classifier
- Support Vector Classifier
- Random Forest Classifier
- Neural Network Classifier
- Gradient Boosting Classifier

We will try all five for each of our three target variables.

In [None]:
# Train and evaluate every active (algorithm, balancing technique, target variable) combination.
# To run only some models with some datasets simply comment out the corresponding lines in "datasets" and "algorithms"

# Each entry: [target name, full X, full y, X_train, X_test, y_train, y_test].
datasets = [
    ["lead_model1", X1, y1, X1_train, X1_test, y1_train, y1_test],
    ["lead_model2", X2, y2, X2_train, X2_test, y2_train, y2_test]
]

# Resampling step placed in front of the model inside the pipeline.
balancing_technique = {
    #'SMOTE': [('over', SMOTE())],
    'ROS': [('over', RandomOverSampler(random_state=0))],
    'RUS': [('under', RandomUnderSampler(random_state=0))],
    #'NM': [('under', NearMiss(n_jobs=-1))]
}

# Final estimator step of the pipeline, keyed like "algorith_names" and "results".
algorithms = {
    #'SGDC': [('model', SGDClassifier(max_iter=1500, n_jobs=-1, warm_start=False, class_weight='balanced'))],
    #'SVC': [('model', SVC(cache_size=1000, class_weight='balanced'))],
    # random_state is fixed like every other stochastic component in this notebook,
    # so that a re-run reproduces the reported numbers.
    'RFC': [('model', RandomForestClassifier(n_jobs=-1, warm_start=False, class_weight='balanced', random_state=0))],
    #'NNC': [('model', MLPClassifier(solver='adam', warm_start=False))],
    #'GBC': [('model', GradientBoostingClassifier(warm_start=False))] # max_depthint
}

# One list of result dicts per algorithm key; filled by the loop below.
results = {
    'SGDC': [],
    'SVC': [],
    'RFC': [],
    'NNC': [],
    'GBC': []
}

# False: fit once on the training split and evaluate on train and test split.
# True:  report the mean accuracy of a stratified 10-fold cross-validation on the training split.
useStratifiedKFold = False
for algo in algorithms:
    print(f"Running {algorith_names.get(algo)}:")
    
    for bal_tec in balancing_technique:
        print(f"\tBalancing technique {bal_tec}:")
        
        for target_variable, X, y, X_train, X_test, y_train, y_test in datasets:
            print(f"\t\tTarget variable {target_variable}:", end='')

            # imblearn's Pipeline applies the sampler only while fitting, so scoring and
            # prediction below run on the unmodified data.
            steps = balancing_technique.get(bal_tec) + algorithms.get(algo)
            pipeline = Pipeline(steps=steps)

            if useStratifiedKFold:
                cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=0)
                accuracy = numpy.mean(cross_val_score(pipeline, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1))
                print(accuracy)

                info = {
                    'target_variable': target_variable,
                    'balancing_technique': bal_tec,
                    'accuracy': accuracy,
                }   

            else:
                # Finish the "Target variable ...:" line that was printed without a newline.
                print('')
                pipeline.fit(X_train, y_train)

                # Comparing train and test accuracy shows over-fitting, see
                # https://docs.microsoft.com/en-us/azure/machine-learning/concept-manage-ml-pitfalls
                train_accuracy  = pipeline.score(X_train, y_train)
                print("\t\t\tTrain accuracy", train_accuracy)

                test_accuracy = pipeline.score(X_test, y_test)
                print("\t\t\tTest accuracy", test_accuracy)

                # Keep the confusion matrix as well: on unbalanced classes accuracy alone
                # says little about how the rare classes are predicted.
                # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
                cm = confusion_matrix(y_test, pipeline.predict(X_test))

                info = {
                    'target_variable': target_variable,
                    'balancing_technique': bal_tec,
                    'train_accuracy': train_accuracy,
                    'test_accuracy': test_accuracy,
                    'confusion_matrix': cm,
                }
            results[algo].append(info)
    print("-----------------------------------------------")

Running Random Forrest Classifier:
	Balancing technique ROS:
		Target variable lead_model1:
			Train accuracy 0.9897541897779827
			Test accuracy 0.979478943611706
		Target variable lead_model2:
			Train accuracy 0.9997323300667688
			Test accuracy 0.9990482988341661
	Balancing technique RUS:
		Target variable lead_model1:
			Train accuracy 0.2046733187601925
			Test accuracy 0.2049329843762392
		Target variable lead_model2:
			Train accuracy 0.6192048220242786
			Test accuracy 0.6200531366484258
-----------------------------------------------
Running Gradient Boosting Classifier:
	Balancing technique ROS:
		Target variable lead_model1:


In [None]:
# Print every recorded run, grouped by algorithm key.
for algo_key, runs in results.items():
    print(algo_key)
    for run in runs:
        for field, value in run.items():
            if field != 'confusion_matrix':
                print(f"\t{field}: {value}")
            else:
                # Wrap the matrix in a DataFrame so it prints as an aligned table.
                matrix_table = pd.DataFrame(data=value, index=None, columns=None)
                print(f"\t{field}:\n{matrix_table}")
        print('\t--------------------------------')
        
