# Classes et méthodes

In [181]:
import pandas as pd
import numpy as np
df_se = pd.read_csv('socio_eco.csv', sep=';')
df_data = pd.read_csv('data.csv')
new_df = pd.read_csv('end_data.csv')

In [182]:
class DataCleaner_1:
    """To clean data before selecting features.
    """
    def __init__(self, filename):
        """Class constructor"""
        self.filename = filename
    
    def cleaning_socio_eco(self):
        """Necessary file transformations"""
        
        # Reading and format
        df = pd.read_csv(self.filename, sep=';')
        df['DATE'] = pd.to_datetime(df['DATE'])
        df['YEAR/MONTH'] = df['DATE'].dt.to_period('M')
        
        # Nan Imputer
        df['EMPLOYMENT_VARIATION_RATE'] = df['EMPLOYMENT_VARIATION_RATE'].fillna(df['EMPLOYMENT_VARIATION_RATE'].median())
        df['IDX_CONSUMER_PRICE'] = df['IDX_CONSUMER_PRICE'].fillna(df['IDX_CONSUMER_PRICE'].mean())
        df['IDX_CONSUMER_CONFIDENCE'] = df['IDX_CONSUMER_CONFIDENCE'].fillna(df['IDX_CONSUMER_CONFIDENCE'].median())
        
        return df
    
dc1 = DataCleaner_1('socio_eco.csv')
cleaned_1 = dc1.cleaning_socio_eco()

In [183]:
class DataCleaner_2:
    """To clean data before selecting features.
    """
    def __init__(self, filename):
        """Class constructor"""
        self.filename = filename
    
    def cleaning_data(self):
        """Necessary file transformations"""
        
        # Reading and format
        df = pd.read_csv(self.filename, sep=';')
        df['DATE'] = pd.to_datetime(df['DATE'])
        df['YEAR/MONTH'] = df['DATE'].dt.to_period('M')
        
        # Nan imputer
        df['CONTACT'] = df['CONTACT'].fillna('Portable')
        df['JOB_TYPE'] = df['JOB_TYPE'].fillna('Col bleu')
        df['RESULT_LAST_CAMPAIGN'] = df["RESULT_LAST_CAMPAIGN"].fillna("No Contact")
        df['STATUS'] = df['STATUS'].fillna('Marié')
        df['EDUCATION'] = df['EDUCATION'].fillna('Secondaire')
        df['HAS_PERSO_LOAN'] = df['HAS_PERSO_LOAN'].fillna('No')
        
        return df

dc2 = DataCleaner_2('data.csv')
cleaned_2 = dc2.cleaning_data()

In [184]:
class DataMerger:
    """To merge dataframes before feature engineering
    """
    def __init__(self, filename1, filename2):
        """Class constructor"""
        self.filename1 = filename1
        self.filename2 = filename2
    
    def merging_data(self):
        """merging two different datasets"""
        res = pd.merge(self.filename1.assign(grouper=self.filename1['YEAR/MONTH']),
                       self.filename2.assign(grouper=self.filename2['YEAR/MONTH']),
                       how='right', on='grouper')
        res = res.drop(columns=['YEAR/MONTH_y', 'YEAR/MONTH_x' , 'DATE_y' , 'grouper'])
        res = res.rename(columns={'DATE_x': 'DATE'})
        return res

dm = DataMerger(cleaned_1,cleaned_2)
res = dm.merging_data()

In [185]:
class FrequencyEncoder:
    """To encode categorical variables"""
    
    def __init__(self, filename):
        """Class constructor"""
        self.filename = filename
    
    def frequency_encoding(self):
        """Map values by frequencies"""
        df = self.filename
        df['STATUS'] = df['STATUS'].map(df.groupby('STATUS').size()/len(df))
        df['EDUCATION'] = df['EDUCATION'].map(df.groupby('EDUCATION').size()/len(df))
        df['CONTACT'] = df['CONTACT'].map(df.groupby('CONTACT').size()/len(df))
        return df

fe = FrequencyEncoder(res)
fe1 = fe.frequency_encoding()

In [186]:
class BinaryEncoder:
    """To encode categorical variables"""
    
    def __init__(self, filename):
        """Class constructor"""
        self.filename = filename
    
    def binary_encoding(self):
        """Map values by frequencies"""
        
        df = self.filename
        dict_bin = {'Yes':1, 'No':0}
        df.replace({'HAS_PERSO_LOAN' : dict_bin}, inplace = True)
        df.replace({'HAS_DEFAULT' : dict_bin}, inplace = True)
        df.replace({'HAS_HOUSING_LOAN' : dict_bin}, inplace = True)
        df.replace({'SUBSCRIPTION' : dict_bin}, inplace=True)
        return df

be = BinaryEncoder(fe1)
be1 = be.binary_encoding()

In [187]:
class LabelEncoder:
    """To encode categorical variables"""
    
    def __init__(self, filename):
        """Class constructor"""
        self.filename = filename
    
    def label_encoding(self):
        """Map values by frequencies"""
        
        df = self.filename
        dict_cat = {'Manager':1, 'Technicien':2, 'Entrepreuneur':3,
                    'Col bleu':4, 'Retraité':5, 'Admin':6, 
                    'Services':7, 'Indépendant':8, 'Chomeur':9,
                    'Employé de ménage':10,'Etudiant' :11}
        df.replace({'JOB_TYPE' : dict_cat}, inplace = True)
        return df

le = LabelEncoder(be1)
le1 = le.label_encoding()

In [159]:
class FeatureSelector:
    """To do feature_engineering"""
    
    def __init__(self, filename):
        """Class constructor"""
        self.filename = filename
    
    def feature_engineering(self):
        """Transform features"""
        
        #Transformation d'une feature
        df = self.filename
        df.replace({'RESULT_LAST_CAMPAIGN' : {'Autre': 'No Contact'}}, inplace = True) 
        df = pd.get_dummies(df , columns=["RESULT_LAST_CAMPAIGN"], prefix=["LAST_CAMPAIGN_IS"])
        df['LAST_CAMPAIGN_IS_Echec'] = df['LAST_CAMPAIGN_IS_Echec'].astype(float)
        df['LAST_CAMPAIGN_IS_No Contact'] = df['LAST_CAMPAIGN_IS_No Contact'].astype(float)
        df['LAST_CAMPAIGN_IS_Succes'] = df['LAST_CAMPAIGN_IS_Succes'].astype(float)
        
        #Suppression de lignes insensées
        df= df[df['AGE'] != 123]
        return df

fs = FeatureSelector(le1)
fs1 = fs.feature_engineering()

In [189]:
fs = FeatureSelector(le1)
fs1 = fs.feature_engineering()
fs1

Unnamed: 0,DATE,EMPLOYMENT_VARIATION_RATE,IDX_CONSUMER_PRICE,IDX_CONSUMER_CONFIDENCE,AGE,JOB_TYPE,STATUS,EDUCATION,HAS_DEFAULT,BALANCE,...,HAS_PERSO_LOAN,CONTACT,DURATION_CONTACT,NB_CONTACT,NB_DAY_LAST_CONTACT,NB_CONTACT_LAST_CAMPAIGN,SUBSCRIPTION,LAST_CAMPAIGN_IS_Echec,LAST_CAMPAIGN_IS_No Contact,LAST_CAMPAIGN_IS_Succes
0,2008-05-31,1.1,93.994,-36.4,58,1,0.612106,0.294045,0,2143,...,0,0.93575,261,1,-1,0,0,0.0,1.0,0.0
2,2008-05-31,1.1,93.994,-36.4,33,3,0.612106,0.554280,0,2,...,1,0.93575,76,1,-1,0,0,0.0,1.0,0.0
3,2008-05-31,1.1,93.994,-36.4,47,4,0.612106,0.554280,0,1506,...,0,0.93575,92,1,-1,0,0,0.0,1.0,0.0
4,2008-05-31,1.1,93.994,-36.4,33,4,0.276099,0.554280,0,1,...,0,0.93575,198,1,-1,0,0,0.0,1.0,0.0
5,2008-05-31,1.1,93.994,-36.4,35,1,0.612106,0.294045,0,231,...,0,0.93575,139,1,-1,0,0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45131,2010-10-31,-1.1,94.601,-49.5,47,1,0.612106,0.294045,0,0,...,0,0.93575,508,1,-1,0,1,0.0,1.0,0.0
45132,2010-10-31,-1.1,94.601,-49.5,61,5,0.612106,0.554280,0,1058,...,0,0.93575,277,1,92,5,0,0.0,0.0,1.0
45133,2010-10-31,-1.1,94.601,-49.5,24,11,0.276099,0.554280,0,822,...,0,0.93575,184,1,91,2,1,1.0,0.0,0.0
45134,2010-10-31,-1.1,94.601,-49.5,70,5,0.612106,0.554280,0,0,...,0,0.93575,258,1,92,5,1,0.0,0.0,1.0


In [174]:
get_missing_percent(fs1)

###
###
### Pourcentage de valeurs manquantes par colonnes 
###
###
###
LAST_CAMPAIGN_IS_Succes        0.0
LAST_CAMPAIGN_IS_No Contact    0.0
IDX_CONSUMER_PRICE             0.0
IDX_CONSUMER_CONFIDENCE        0.0
AGE                            0.0
JOB_TYPE                       0.0
STATUS                         0.0
EDUCATION                      0.0
HAS_DEFAULT                    0.0
BALANCE                        0.0
HAS_HOUSING_LOAN               0.0
HAS_PERSO_LOAN                 0.0
CONTACT                        0.0
DURATION_CONTACT               0.0
NB_CONTACT                     0.0
NB_DAY_LAST_CONTACT            0.0
NB_CONTACT_LAST_CAMPAIGN       0.0
SUBSCRIPTION                   0.0
LAST_CAMPAIGN_IS_Echec         0.0
EMPLOYMENT_VARIATION_RATE      0.0
dtype: float64


In [175]:
y = fs1['SUBSCRIPTION'].to_numpy()
y

array([0, 0, 0, ..., 1, 1, 0])

In [176]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
test_size= 0.20
seed = 7

X_train, X_test, Y_train, Y_test= train_test_split(fs1, y, test_size=test_size, random_state=seed)

In [177]:
models = []
models.append(('LR', LogisticRegression()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))

In [178]:
np.where(np.isnan(X_train))

(array([], dtype=int64), array([], dtype=int64))

In [179]:
X_train.shape

(34664, 20)

In [180]:
results_c = []
names_c = []

for name, model in models:
    
    # define how to split off validation data ('kfold' how many folds)
    kfold = KFold(n_splits=10, random_state=seed)    
    
    # train the model
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')    
    results_c.append(cv_results)
    names_c.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    
    print(msg)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LR: 0.909069 (0.007332)
KNN: 0.884404 (0.005359)
CART: 1.000000 (0.000000)




NB: 0.999135 (0.000408)
