In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.insert(0,"Loading/")
sys.path.insert(0,"Preprocessing/")
sys.path.insert(0,"Modeling/")
sys.path.insert(0,"Evaluation/")
sys.path.insert(0,"Interpretability/")
sys.path.insert(0,"Monitoring/")
sys.path.insert(0,"Utils/")


import loading
import preprocessing
import modeling
import evaluation
import interpretability
import monitoring 
import utils as u

import json 
import pandas as pd
import numpy as np

#Package allowing to reload a package (or a script), without having to reload the whole notebook
#import importlib #no use if the magic commands in the first cell are working

In [3]:
# Parameters needed for the run. Launch this cell only if you do not launch the main script.

path_conf = "../params/conf/conf.json"

# path_conf ='../conf/conf.json'
# Read the configuration through a context manager so the file handle is
# closed as soon as the JSON has been parsed (a bare open() leaks the handle).
with open(path_conf, 'r') as conf_file:
    conf = json.load(conf_file)

path_log = conf['path_log'] # "../log/my_log_file.txt"
log_level = conf['log_level'] # "DEBUG"

# Be careful to launch the logger only once, otherwise each line will be duplicated
logger = u.my_get_logger(path_log, log_level, my_name="main_logger")

In [4]:
# Reload the conf file (useful after editing it); the context manager closes
# the file handle once the JSON has been parsed.
with open(path_conf, 'r') as conf_file:
    conf = json.load(conf_file)

## Loading and Preprocessing

In [5]:
# Read the raw dataset designated by the conf file.
# Note: `df` is rebound in the Evaluation section to the preprocessed data reloaded
# through loading.load_preprocessed_csv_from_name(conf).
df = loading.read_csv_from_name(conf)

In [6]:
# Inspect the raw column names. The saved output shows an 'Unnamed: 0' column and an
# ' Income ' label padded with spaces — presumably cleaned up during preprocessing; TODO confirm.
df.columns

Index(['Unnamed: 0', 'ID', 'Year_Birth', 'Education', 'Marital_Status',
       ' Income ', 'Kidhome', 'Teenhome', 'Dt_Customer', 'Recency', 'MntWines',
       'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Response', 'Complain', 'Country'],
      dtype='object')

In [7]:
# Preprocess the selected dataset. The call also returns X_columns and y_column,
# which the next cell passes to preprocessing.basic_split.
df_preprocessed, X_columns, y_column = preprocessing.main_preprocessing_from_name(df,conf)

# Write the preprocessed dataset (presumably the file that
# loading.load_preprocessed_csv_from_name(conf) reads back in the Evaluation section — TODO confirm)
loading.write_preprocessed_csv_from_name(df_preprocessed,conf)

'OK'

In [8]:
# Basic split of the preprocessed data into train and test sets.
# NOTE(review): 0.25 is presumably the test-set fraction — confirm against preprocessing.basic_split.
X_train, X_test, y_train, y_test = preprocessing.basic_split( df_preprocessed , 0.25 , X_columns, y_column)

## Modeling

In [9]:
# Fit the model selected in the conf file on the training split.
# The saved output reports a 3-fold search over 4 candidates; best_params is returned alongside clf.
clf, best_params = modeling.main_modeling_from_name(X_train,y_train,conf)

# Save the model (the Evaluation section reloads it with u.load_model)
u.save_model(clf, conf)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed:    3.7s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    3.8s finished


'OK'

## Evaluation

In [10]:
# This step is independent from the previous ones: everything it needs is reloaded.

# Reload the saved model
clf = u.load_model(conf)
# Reload the preprocessed dataset (this rebinds `df`, which held the raw data until now)
df = loading.load_preprocessed_csv_from_name(conf)

# Rebuild the split: the target comes from the conf, every other column is a feature
y_column = u.get_y_column_from_conf(conf)
X_columns = [column for column in df.columns if column != y_column]
# NOTE(review): the metrics are only meaningful if this call reproduces the split used
# for training — confirm that preprocessing.basic_split is deterministic.
X_train, X_test, y_train, y_test = preprocessing.basic_split(df, 0.25, X_columns, y_column)

../Outputs/Models/marketing_random_forest.sav


In [11]:
# Compute the evaluation metrics on the test split and display the resulting dict
# (the saved output contains f1_score, accuracy, recall, precision and a confusion matrix).
dict_metrics = evaluation.main_evaluation(clf, X_test, y_test, conf)
dict_metrics

{'f1_score': 0.39285714285714285,
 'accuracy': 0.875,
 'recall': 0.2972972972972973,
 'precision': 0.5789473684210527,
 'confusion_matrix': {'tn': 227, 'fp': 8, 'fn': 26, 'tp': 11}}

In [12]:
# Parameters for the second run (second dataset). Launch this cell only if you do not launch the main script.

path_conf = "../params/conf/conf_2.json"
# Read the configuration through a context manager so the file handle is
# closed as soon as the JSON has been parsed (a bare open() leaks the handle).
with open(path_conf, 'r') as conf_file:
    conf = json.load(conf_file)

path_log = conf['path_log'] # "../log/my_log_file.txt"
log_level = conf['log_level'] # "DEBUG"

# Be careful to launch the logger only once, otherwise each line will be duplicated
# NOTE(review): when the notebook is run top to bottom, this is the second call to
# u.my_get_logger with my_name="main_logger" — confirm it does not duplicate log lines.
logger = u.my_get_logger(path_log, log_level, my_name="main_logger")

In [13]:
# Reload the conf file (useful after editing it); the context manager closes
# the file handle once the JSON has been parsed.
with open(path_conf, 'r') as conf_file:
    conf = json.load(conf_file)

## Loading and Preprocessing

In [14]:

# Show the input file path that the currently selected dataset resolves to
inputs_dir = conf["paths"]["Inputs_path"]
dataset_info = conf["dict_info_files"][conf["selected_dataset"]]
inputs_dir + dataset_info["path_file"]

'../Inputs/marketing_data_2.csv'

In [15]:
# Read the raw dataset designated by the second conf file
# (the previous cell's saved output shows '../Inputs/marketing_data_2.csv').
df2 = loading.read_csv_from_name(conf)

In [16]:
# Preprocess the second dataset. Note that this rebinds X_columns and y_column,
# which were first set while preprocessing the first dataset.
df2_preprocessed, X_columns, y_column = preprocessing.main_preprocessing_from_name(df2,conf)

# Write the preprocessed second dataset
loading.write_preprocessed_csv_from_name(df2_preprocessed,conf)

'OK'

## Evaluation

In [17]:
# This step is independent from the previous ones: everything it needs is reloaded.

# Load the model saved in the first section, selected explicitly by name
clf = u.load_model(conf,name="marketing_random_forest")
# Load the preprocessed second dataset
df2 = loading.load_preprocessed_csv_from_name(conf)

# Target and feature columns of the dataset being monitored.
# The feature list is derived from df2, the frame loaded just above; deriving it
# from `df` made this cell depend on state left over from the first section.
y_column = u.get_y_column_from_conf(conf)
X_columns = [x for x in df2.columns if x != y_column]


../Outputs/Models/marketing_random_forest.sav


In [18]:
# Evaluate the reloaded model separately on each of the 4 batches of the second dataset
batches = monitoring.create_batches(df2, 4)
for batch in batches:
    # Separate the features from the target for this batch
    X_monitored = batch.drop(columns=y_column)
    y_monitored = batch[y_column]
    dict_metrics = evaluation.main_evaluation(clf, X_monitored, y_monitored, conf)
    print(dict_metrics)


{'f1_score': 0.594059405940594, 'accuracy': 0.8581314878892734, 'recall': 0.45454545454545453, 'precision': 0.8571428571428571, 'confusion_matrix': {'tn': 218, 'fp': 5, 'fn': 36, 'tp': 30}}
{'f1_score': 0.26229508196721313, 'accuracy': 0.84375, 'recall': 0.18181818181818182, 'precision': 0.47058823529411764, 'confusion_matrix': {'tn': 235, 'fp': 9, 'fn': 36, 'tp': 8}}
{'f1_score': 0.375, 'accuracy': 0.8958333333333334, 'recall': 0.28125, 'precision': 0.5625, 'confusion_matrix': {'tn': 249, 'fp': 7, 'fn': 23, 'tp': 9}}
{'f1_score': 0.23809523809523808, 'accuracy': 0.8888888888888888, 'recall': 0.22727272727272727, 'precision': 0.25, 'confusion_matrix': {'tn': 251, 'fp': 15, 'fn': 17, 'tp': 5}}


## Interpretability

In [25]:
# Run the monitoring checks on the batches built from df2, passing `df` (the preprocessed
# first dataset reloaded in the Evaluation section) alongside them.
# Judging from the saved output, the report lists per batch a column-set check, a NaN check and
# the columns flagged by the Kolmogorov-Smirnov, t and Levene tests — confirm in monitoring.main_monitoring.
monitoring.main_monitoring(df,batches)

batch 1
	 ->Format check_set_columns: OK,
	 ->check_nb_nan:
	 ->kolmogorov_smirnov:
	 	 -NumDealsPurchases
	 	 -Response
	 	 -Complain
	 ->t_test:
	 	 -NumDealsPurchases
	 	 -Complain
	 ->levene_test:
	 	 -NumDealsPurchases
	 	 -NumStorePurchases
	 	 -Complain
**************************************** 

batch 2
	 ->Format check_set_columns: OK,
	 ->check_nb_nan:
	 ->kolmogorov_smirnov:
	 	 -Response
	 	 -Complain
	 ->t_test:
	 	 -Response
	 	 -Complain
	 ->levene_test:
	 	 -Teenhome
	 	 -NumDealsPurchases
	 	 -Response
	 	 -Complain
**************************************** 

batch 3
	 ->Format check_set_columns: OK,
	 ->check_nb_nan:
	 ->kolmogorov_smirnov:
	 	 -NumDealsPurchases
	 	 -Response
	 	 -Complain
	 ->t_test:
	 	 -Complain
	 ->levene_test:
	 	 -NumDealsPurchases
	 	 -NumStorePurchases
	 	 -Response
	 	 -Complain
**************************************** 

batch 4
	 ->Format check_set_columns: OK,
	 ->check_nb_nan:
	 ->kolmogorov_smirnov:
	 	 -NumDealsPurchases
	 	 -Response
	 	

## Monitoring