# **1- Connexion et importation des modules**

In [None]:
# @title Connexion à Google Drive
# Mount Google Drive at /content/drive so that the project files stored on
# Drive can be read and written through regular filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# @title Importation des différents modules

import numpy as np
%matplotlib inline
import pandas as pd
import seaborn as sns
import calendar
import matplotlib.pyplot as plt

import plotly.express as px
from plotly import graph_objs as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

import scipy.stats as stats
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# @title Chemins d'accès personnalisés

# Each collaborator keeps their own Drive path to the input CSV below.
# Exactly one read_csv call should be left uncommented; the resulting `df`
# is the frame used by the rest of the notebook.
# NOTE(review): the active path contains a folder written as "DataScientest "
# (with a trailing space) -- presumably this matches the real Drive folder
# name; confirm before "fixing" it.

## Estelle ##
##df = pd.read_csv("/content/drive/MyDrive/Oct_cda_bankmarketing/Estelle/bank.csv")

## Laurent ##
#df = pd.read_csv("/content/drive/MyDrive/Oct_cda_bankmarketing/df_bank_clean.csv", sep=';')

## Guillaume ##
df = pd.read_csv("/content/drive/MyDrive/DataScientest /Oct_cda_bankmarketing/Commun/2_bank_clean.csv")

In [None]:
# @title Fonctions pour formater le texte lors des prints en gras ou souligné

def bold(text):
    """Return ``text`` wrapped in the ANSI codes that switch bold on, then reset."""
    return "\033[1m{}\033[0m".format(text)

def underline(text):
    """Return ``text`` wrapped in the ANSI codes that switch underline on, then reset."""
    return "\033[4m{}\033[0m".format(text)

# **2- Traitement des données**

In [None]:
# @title Suppression de la colonne 'Unnamed: 0'
# Remove the 'Unnamed: 0' column (presumably an index column saved by an
# earlier to_csv -- TODO confirm). errors='ignore' makes the cell safe to
# re-run and lets it work with an input file that does not contain the column,
# instead of raising KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,day,month,duration,campaign,pdays,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,5,may,1042,1,-1,unknown,yes
1,55,services,married,secondary,no,2476,yes,no,5,may,579,1,-1,unknown,yes
2,54,admin.,married,tertiary,no,184,no,no,5,may,673,2,-1,unknown,yes
3,42,management,single,tertiary,no,0,yes,yes,5,may,562,2,-1,unknown,yes
4,60,retired,divorced,secondary,no,545,yes,no,6,may,1030,1,-1,unknown,yes


In [None]:
# @title Encodage des valeurs catégorielles

# 'pdays': turn every -1 into 0.
df["pdays"] = df["pdays"].replace({-1: 0})

# Binary yes/no columns: 'yes' becomes 1 and 'no' becomes 0.
bin_cols = ["default", "housing", "loan", "deposit"]
df[bin_cols] = df[bin_cols].replace(['yes', 'no'], [1, 0])

# Ordinal column: education levels are mapped to increasing integer codes.
# Any value outside these three labels is left untouched by replace().
df["education"] = df["education"].replace({'primary': 0, 'secondary': 1, 'tertiary': 2})

df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,day,month,duration,campaign,pdays,poutcome,deposit
0,59,admin.,married,1,0,2343,1,0,5,may,1042,1,0,unknown,1
1,55,services,married,1,0,2476,1,0,5,may,579,1,0,unknown,1
2,54,admin.,married,2,0,184,0,0,5,may,673,2,0,unknown,1
3,42,management,single,2,0,0,1,1,5,may,562,2,0,unknown,1
4,60,retired,divorced,1,0,545,1,0,6,may,1030,1,0,unknown,1


In [None]:
# @title Création du jeu de test et d'entrainement

# NOTE: the names below are inverted with respect to the usual convention:
# X holds the target column ('deposit') and Y holds the feature columns.
# They are passed to train_test_split as (Y, X), so X_train / X_test receive
# the features and y_train / y_test receive the target.
X = df["deposit"]
Y = df.drop(columns='deposit')

# Hold out 20 % of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    Y, X, test_size=0.20, random_state=42
)
print("La shape de notre jeu d'entrainement est : ",X_train.shape)

La shape de notre jeu d'entrainement est :  (6796, 14)


In [None]:
# @title Encodage des valeurs catégorielles

# Nominal categorical columns, listed in the order in which their dummy
# columns are appended to the feature frames (month, marital, job, poutcome).
nom_cat_cols = ["month", "marital", "job", "poutcome"]


def one_hot_encode_split(train, test, columns):
    """One-hot encode ``columns`` in ``train`` and ``test`` with identical output columns.

    For each column, the dummy columns are derived from ``train`` only
    (``drop_first=True`` removes the first category as the baseline). The
    ``test`` dummies are then aligned on exactly those columns: a category
    missing from ``test`` becomes an all-False column, and a category that
    appears only in ``test`` (or the baseline) is dropped. Encoding the two
    frames independently could instead yield different column sets whenever a
    category is absent from one of them.

    Args:
        train: DataFrame used to determine the dummy columns.
        test: DataFrame encoded with the columns determined from ``train``.
        columns: names of the columns to encode, in the order the dummy
            columns should be appended.

    Returns:
        tuple: ``(train_encoded, test_encoded)``; each original column is
        replaced by its dummy columns appended at the end of the frame.
    """
    for col in columns:
        # The prefix ends with '_' and get_dummies adds its own '_' separator,
        # giving names such as 'month__aug' (kept for existing outputs).
        prefix = f"{col}_"
        train_dummies = pd.get_dummies(train[col], prefix=prefix, drop_first=True)
        test_dummies = pd.get_dummies(test[col], prefix=prefix).reindex(
            columns=train_dummies.columns, fill_value=False
        )
        train = pd.concat([train, train_dummies], axis=1).drop(columns=col)
        test = pd.concat([test, test_dummies], axis=1).drop(columns=col)
    return train, test


X_train, X_test = one_hot_encode_split(X_train, X_test, nom_cat_cols)

# Both feature frames must expose the same columns in the same order.
assert X_train.columns.equals(X_test.columns), "X_train / X_test columns differ"

print("la shape de notre df initial est de :",df.shape)
print("la shape de notre df X_train est de    :", X_train.shape)
print("la shape de notre df X_test est de     :", X_test.shape)

print("la shape de notre df y_train est de    :", y_train.shape)
print("la shape de notre df y_test est de     :", y_test.shape)

la shape de notre df initial est de : (8496, 15)
la shape de notre df X_train est de    : (6796, 35)
la shape de notre df X_test est de     : (1700, 35)
la shape de notre df y_train est de    : (6796,)
la shape de notre df y_test est de     : (1700,)


In [None]:
# @title Information de notre jeu d'entrainement
# Print the summary of the training features (columns, non-null counts, dtypes).
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6796 entries, 6260 to 7270
Data columns (total 35 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   age                 6796 non-null   int64
 1   education           6796 non-null   int64
 2   default             6796 non-null   int64
 3   balance             6796 non-null   int64
 4   housing             6796 non-null   int64
 5   loan                6796 non-null   int64
 6   day                 6796 non-null   int64
 7   duration            6796 non-null   int64
 8   campaign            6796 non-null   int64
 9   pdays               6796 non-null   int64
 10  month__aug          6796 non-null   bool 
 11  month__dec          6796 non-null   bool 
 12  month__feb          6796 non-null   bool 
 13  month__jan          6796 non-null   bool 
 14  month__jul          6796 non-null   bool 
 15  month__jun          6796 non-null   bool 
 16  month__mar          6796 non-null   bool 
 1

In [None]:
# Keep an independent snapshot of X_test taken before its values are standardized.
X_test_copie = X_test.copy(deep=True)

In [None]:
# @title Standardisation des données

# The scaler is fitted on the training features only; the same fitted
# transformation is then applied to both the training and the test features.
cols = X_train.columns
sc = StandardScaler()
sc.fit(X_train[cols])

X_train[cols] = sc.transform(X_train[cols])
X_test[cols] = sc.transform(X_test[cols])

X_train.head()

Unnamed: 0,age,education,default,balance,housing,loan,day,duration,campaign,pdays,...,job__housemaid,job__management,job__retired,job__self-employed,job__services,job__student,job__technician,job__unemployed,poutcome__success,poutcome__unknown
6260,0.696567,-1.83688,-0.125875,0.70661,-0.972709,-0.398668,1.625831,-0.626118,-0.024361,-0.490143,...,-0.15428,1.833576,-0.247745,-0.189672,-0.306038,-0.174578,-0.448713,-0.192568,-0.330711,0.526501
8472,-0.399257,-0.313953,-0.125875,-0.697525,-0.972709,-0.398668,-1.102178,-0.478759,-0.809005,-0.490143,...,-0.15428,-0.545382,-0.247745,-0.189672,3.267564,-0.174578,-0.448713,-0.192568,-0.330711,0.526501
2067,1.15316,-0.313953,-0.125875,1.177668,1.028057,2.508355,1.388613,-0.651379,-0.809005,-0.490143,...,-0.15428,-0.545382,-0.247745,-0.189672,-0.306038,-0.174578,2.228595,-0.192568,-0.330711,0.526501
5632,-0.855851,1.208975,-0.125875,0.768882,-0.972709,-0.398668,-0.390523,-0.575595,1.544927,-0.490143,...,-0.15428,1.833576,-0.247745,-0.189672,-0.306038,-0.174578,-0.448713,-0.192568,-0.330711,0.526501
1047,3.070853,-1.83688,-0.125875,-0.140089,-0.972709,-0.398668,0.914176,-0.205092,2.329571,-0.490143,...,-0.15428,-0.545382,4.036412,-0.189672,-0.306038,-0.174578,-0.448713,-0.192568,-0.330711,0.526501


In [None]:
# @title Tri des 4 DataFrame par ordre croissant de l'index sans changer les valeurs des index
# Reorder the four objects by ascending index label; the labels themselves
# are kept (no index reset).
X_train, X_test, y_train, y_test = (
    part.sort_index(ascending=True)
    for part in (X_train, X_test, y_train, y_test)
)


In [None]:
# @title Fonction qui vérifie si l'index a été modifié lors du tri.
# Si False => l'index n'a pas été modifié

def check_index_unchanged_after_sorting(*dataframes):
    """
    Report, for each object passed in, whether sorting it by index reorders it.

    Each object is compared with a copy of itself sorted by ascending index.
    The flag is True when the two indexes differ (the object was not already
    sorted) and False when they are equal.

    Args:
        *dataframes: pandas objects (DataFrame or Series) to check.

    Returns:
        dict: maps a label to the boolean flag of the corresponding object.
        The label is the name of a module-level variable bound to the object,
        preferring names that do not start with an underscore (so that
        IPython cache variables such as ``_`` are not picked first). When the
        object is not bound to any module-level name, the label is
        ``"dataframe_<position>"`` instead of raising IndexError.
    """
    namespace = globals()
    results = {}

    for position, frame in enumerate(dataframes):
        sorted_index = frame.sort_index(ascending=True).index
        index_changed = not frame.index.equals(sorted_index)

        # Identity comparison: find the variable names bound to this very object.
        bound_names = [name for name, value in list(namespace.items()) if value is frame]
        public_names = [name for name in bound_names if not name.startswith("_")]
        if public_names:
            label = public_names[0]
        elif bound_names:
            label = bound_names[0]
        else:
            label = f"dataframe_{position}"

        results[label] = index_changed

    return results

# Run the check on the four objects produced by the split.
results = check_index_unchanged_after_sorting(X_train, X_test, y_train, y_test)

# Print one line per object with its flag.
for df_name in results:
    index_changed = results[df_name]
    print(f"Changement d'index pour {df_name} :", index_changed)


Changement d'index pour X_train : False
Changement d'index pour X_test : False
Changement d'index pour y_train : False
Changement d'index pour y_test : False


# **3- Création du nouveau jeu de données**

---


* Création de 4 fichiers pour les utiliser dans le prochain notebook n°3 de machine learning:
  - 3_bank_X_train.csv => représentant X_train
  - 3_bank_X_test.csv => représentant X_test
  - 3_bank_y_test.csv => représentant y_test
  - 3_bank_y_train.csv => représentant y_train



---


In [None]:
# @title Création d'un nouveau fichier .csv à partir de **X_train** pour la suite du projet
# Export the training features to CSV for the next notebook.
X_train.to_csv(
    path_or_buf="/content/drive/MyDrive/DataScientest /Oct_cda_bankmarketing/Commun/3_bank_X_train.csv",
    sep=",",
)

In [None]:
# @title Création d'un nouveau fichier .csv à partir de **X_test** pour la suite du projet
# Export the test features to CSV for the next notebook.
X_test.to_csv(
    path_or_buf="/content/drive/MyDrive/DataScientest /Oct_cda_bankmarketing/Commun/3_bank_X_test.csv",
    sep=",",
)

In [None]:
# @title Création d'un nouveau fichier .csv à partir de **y_test** pour la suite du projet
# Export the test target to CSV for the next notebook.
y_test.to_csv(
    path_or_buf="/content/drive/MyDrive/DataScientest /Oct_cda_bankmarketing/Commun/3_bank_y_test.csv",
    sep=",",
)

In [None]:
# @title Création d'un nouveau fichier .csv à partir de **y_train** pour la suite du projet
# Export the training target to CSV for the next notebook.
y_train.to_csv(
    path_or_buf="/content/drive/MyDrive/DataScientest /Oct_cda_bankmarketing/Commun/3_bank_y_train.csv",
    sep=",",
)

In [None]:
# @title Création d'un nouveau fichier .csv à partir de **X_test_copie** pour la suite du projet
# Export the snapshot of the test features taken before standardization.
# Unlike X_test, this copy was made before the index sort, so its rows are
# still in split order; the index labels are written with the data.
X_test_copie.to_csv(
    path_or_buf="/content/drive/MyDrive/DataScientest /Oct_cda_bankmarketing/Commun/3_bank_X_test_copie.csv",
    sep=",",
)