# Prediction of "lead_general"

## Dependencies

In [None]:
import pandas as pd

# String Encoding
from sklearn.preprocessing import LabelEncoder

# Creation of training and testing sets
from sklearn.model_selection import train_test_split

# Creation of balanced data sets
from imblearn.under_sampling import  RandomUnderSampler
#from collections import Counter
#from matplotlib import pyplot
from imblearn.over_sampling import SMOTE
#from imblearn.over_sampling import SMOTEN
from imblearn.over_sampling import RandomOverSampler

# PyCaret
from pycaret.classification import *

# ML algorithms
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Analysis of ML models
from sklearn.metrics import confusion_matrix

import json


import numpy

# https://imbalanced-learn.org/stable/references/generated/imblearn.under_sampling.NearMiss.html
from imblearn.under_sampling import NearMiss
from imblearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

## Data import

In [None]:
# Load the prepared cookie data set (semicolon-separated CSV). Every column is
# read with the pandas "string" dtype, so no column is numeric at this point.
cookie_data = pd.read_csv('./TU/Cookie_prepared.csv', sep=';', dtype='string')
# Print a summary of the loaded frame (columns, dtypes, non-null counts)
cookie_data.info()

## Preliminary Checks

As we imported the prepared data, there should be no NA values present anymore

In [None]:
# Number of missing values per column; the prepared data should contain none.
NA_overview = cookie_data.isna().sum()
cookie_columns = cookie_data.columns

# Keep only the columns with at least one NA value, as [column name, NA count].
# Iterating over (label, value) pairs avoids positional `Series[int]` lookups
# on a label-indexed Series, which are deprecated in recent pandas versions.
list_of_NA_columns = [
    [column, na_count]
    for column, na_count in NA_overview.items()
    if na_count > 0
]

# Expected to be an empty list for the prepared data set
list_of_NA_columns

## Encode Non-numerical Values

The prepared data was not encoded, because we would then lose the dictionaries that were used to turn strings into integers. We need to do the encoding now and save the dictionaries for later use.

In [None]:
# Keep an untouched copy of the string data; `cookie_data` itself is
# overwritten column by column below.
cookie_data_backup = cookie_data.copy()
# Maps column name -> {original string value: encoded integer} for every
# column that had to be label-encoded.
dictionaries = {}
le = LabelEncoder()

for col in cookie_data_backup:
    # col refers to the column name
    try:
        cookie_data[col] = pd.to_numeric(cookie_data_backup[col])
    except (ValueError, TypeError):
        # The column holds strings that cannot be parsed as numbers -> encode!
        # Only conversion failures are caught here, so unrelated problems
        # (e.g. a KeyboardInterrupt) are no longer silently swallowed.
        cookie_data[col] = le.fit_transform(cookie_data_backup[col])
        # `le` is refitted for every column, so its mapping is stored right away
        dictionaries[col] = dict(zip(le.classes_, le.transform(le.classes_)))

# Display the converted/encoded frame
cookie_data

## Getting An Overview

From our research we got five suggestions on which classifiers to use. These are:
- Stochastic Gradient Descent Classifier
- Support Vector Classifier
- Random Forest Classifier
- Neural Network Classifier
- Gradient Boosting Classifier

We will try to get an overview of the best-performing algorithms and compare these with our list.

In [None]:
# Remove the two other lead columns so only 'lead_general' remains as a lead
# column in the PyCaret input (presumably they are alternative targets that
# must not be used as features - confirm).
pycaret_data_lead_general = cookie_data.drop(columns=['lead_model1', 'lead_model2'])
# Initialise the PyCaret classification experiment with 'lead_general' as the target.
# NOTE(review): the `silent` argument appears to exist only in PyCaret 2.x - confirm
# against the installed version.
exp_name_lead_general = setup(data=pycaret_data_lead_general, target='lead_general', session_id=5040, use_gpu=True, silent=True)

In [None]:
# Let PyCaret compare its available classifiers on the experiment set up above
# and keep the result of that comparison.
best_model_lead_model = compare_models()

## Creating Random Forest Classifier and Gradient Boosting Classifier Model

In [None]:
# Create the random forest classifier ('rf') within the PyCaret experiment
RFC_model = create_model('rf')

In [None]:
# Show the confusion matrix of the untuned random forest model
plot_model(RFC_model, plot = 'confusion_matrix')

In [None]:
# Create the gradient boosting classifier ('gbc') within the PyCaret experiment
GBC_model = create_model('gbc')

In [None]:
# Show the confusion matrix of the untuned gradient boosting model
plot_model(GBC_model, plot = 'confusion_matrix')

## Tuning the Random Forest Model Towards F1 and AUC

In [None]:
# Tune the random forest model with F1 as the optimisation metric.
# return_tuner=True makes tune_model return the tuner object as well, which is
# printed further down to document the configuration.
F1_tuned_RFC_model, F1_RFC_tuner = tune_model(RFC_model, optimize='F1', return_tuner=True)

In [None]:
# Show the confusion matrix of the F1-tuned random forest model
plot_model(F1_tuned_RFC_model, plot = 'confusion_matrix')

In [None]:
# Tune the model with AUC as the optimisation metric; return_tuner=True also
# returns the tuner object so its configuration can be printed later.
# NOTE(review): the result names say "GBC", but the random forest model is the
# one being tuned here (which matches the section heading) - confirm which
# model was intended.
AUC_tuned_GBC_model, AUC_GBC_tuner = tune_model(RFC_model, optimize='AUC', return_tuner=True)

In [None]:
# Show the confusion matrix of the AUC-tuned model
plot_model(AUC_tuned_GBC_model, plot = 'confusion_matrix')

# Tune Models Towards Recall

In [None]:
# Tune the random forest model with Recall as the optimisation metric;
# the tuner object is returned alongside the tuned model.
recall_tuned_RFC_model, recall_RFC_tuner = tune_model(RFC_model, optimize='Recall', return_tuner=True)

In [None]:
# Show the confusion matrix of the recall-tuned random forest model
plot_model(recall_tuned_RFC_model, plot = 'confusion_matrix')

In [None]:
# Tune the gradient boosting model with Recall as the optimisation metric.
# Previously RFC_model was passed here, which only repeated the random forest
# recall tuning under a GBC name.
recall_tuned_GBC_model, recall_GBC_tuner = tune_model(GBC_model, optimize='Recall', return_tuner=True)

In [None]:
# Show the confusion matrix of the model stored as recall_tuned_GBC_model
plot_model(recall_tuned_GBC_model, plot = 'confusion_matrix')

We can see that the Random Forest Classifier and the Gradient Boosting Classifier are the best algorithms, reaching 99.57% accuracy once they are tuned. However, we still have a false negative rate of around 30% regardless of the tuning. Reducing this value is left for future work.

To be able to rebuild the models created by PyCaret, we output their configurations.

In [None]:
# Print the tuner object returned by the F1 tuning run
print(F1_RFC_tuner)

In [None]:
# Print the F1-tuned model to record its configuration
print(F1_tuned_RFC_model)

In [None]:
# Print the tuner object returned by the AUC tuning run
print(AUC_GBC_tuner)

In [None]:
# Print the AUC-tuned model to record its configuration
print(AUC_tuned_GBC_model)