#### Feature Engineering 

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px

#### Categorical features

In [2]:
# Load the churn modeling dataset from the current working directory.
df = pd.read_csv('Churn_Banking_Modeling.csv')

# Cast every column whose name starts with 'Flag' to object dtype, so that the
# dtype-based selections below count those columns as categorical.
flag_columns = [name for name in df.columns if name.startswith('Flag')]
df = df.astype({name: 'object' for name in flag_columns})

# Number of columns in each dtype family.
numeric_variables_count = df.select_dtypes(include=['number']).shape[1]
categorical_variables_count = df.select_dtypes(include=['object']).shape[1]

print("number of numeric features",numeric_variables_count)
print("number of categorical features",categorical_variables_count)

# Summary of the object-dtype columns.
print(df.describe(include=['object']))

FileNotFoundError: [Errno 2] No such file or directory: 'Churn_Banking_Modeling.csv'

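###### The feature 'Professione' contains pairs of labels that refer to the same category but are spelled differently (e.g. 'Ufficiale/Sottufficiale' and 'Ufficiale/Sottoufficiale'), so we merge each pair into a single label.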

In [None]:
# Work on an independent copy: a plain `df_new = df` would only create a second
# name for the same object, so every cleaning step applied to `df_new` would
# also silently modify the frame loaded into `df`.
df_new = df.copy()

# Each key is a variant spelling that is folded into the label given as its value.
professione_aliases = {
    'Ufficiale/Sottufficiale': 'Ufficiale/Sottoufficiale',
    'Libero professionista/Titolare di impresa': 'Libero professionista/Titolare impresa',
}

# Vectorised replacement; labels that are not keys of the dict (including
# missing values) pass through unchanged.
df_new['Professione'] = df_new['Professione'].replace(professione_aliases)

# Display the distinct labels that remain after the merge.
df_new['Professione'].unique()

array(['Impiegato', 'Altro/Nessuno', 'Quadro', 'Operaio', nan,
       'Ufficiale/Sottoufficiale',
       'Libero professionista/Titolare impresa', 'Dirigente',
       'Commercialista', 'Rappresentante/Agente', 'Ingegnere/Architetto',
       'Avvocato', 'Medico/Farmacista', 'Autotrasportatore',
       'Docente/Insegnante', 'Commesso/Ausiliario', 'Notaio'],
      dtype=object)

###### The features Provincia_Residenza and Provincia_Domicilio use different codes for the same province (for example, Forlì-Cesena appears as both FO and FC), so we merge them into a single code.

In [None]:
# Frequency of every code in each of the two province columns.
domicilio_counts = df_new['Provincia_Domicilio'].value_counts()
residenza_counts = df_new['Provincia_Residenza'].value_counts()

# Show only the codes that occur fewer than 100 times.
rare_domicilio = domicilio_counts[domicilio_counts<100]
rare_residenza = residenza_counts[residenza_counts<100]
print(rare_domicilio, rare_residenza)

# Number of rows whose Provincia_Domicilio is 'PU' and 'FC' respectively.
is_pu = df_new['Provincia_Domicilio'] == 'PU'
frequenze_PU = int(is_pu.sum())
print("Absolute frequencies of 'PU':", frequenze_PU)

is_fc = df_new['Provincia_Domicilio'] == 'FC'
frequenze_FC = int(is_fc.sum())
print("Absolute frequencies of 'FC':", frequenze_FC)

Provincia_Domicilio
OG    51
FO    29
PS    26
8N     1
BE     1
Name: count, dtype: int64 Provincia_Residenza
OG    69
PS     2
FO     1
Name: count, dtype: int64
Absolute frequencies of 'PU': 2412
Absolute frequencies of 'FC': 2099


In [None]:
# Fold the variant codes into the code used for the same province:
# 'FO' becomes 'FC' and 'PS' becomes 'PU', in both province columns.
province_aliases = {'FO': 'FC', 'PS': 'PU'}
for province_column in ('Provincia_Residenza', 'Provincia_Domicilio'):
    df_new[province_column] = df_new[province_column].replace(province_aliases)

# '8N' and 'BE' are turned into missing values in Provincia_Domicilio.
# NOTE(review): presumably these are invalid province codes — confirm.
df_new['Provincia_Domicilio'] = df_new['Provincia_Domicilio'].replace(['8N', 'BE'], np.nan)

# Recompute the frequencies and list the codes that still occur fewer than 100 times.
domicilio_counts = df_new['Provincia_Domicilio'].value_counts()
residenza_counts = df_new['Provincia_Residenza'].value_counts()

rare_domicilio = domicilio_counts[domicilio_counts<100]
rare_residenza = residenza_counts[residenza_counts<100]
print(rare_domicilio, rare_residenza)

Provincia_Domicilio
OG    51
Name: count, dtype: int64 Provincia_Residenza
OG    69
Name: count, dtype: int64


###### Instead of converting every categorical feature into dummy variables, we encode the ordinal categorical features as ordered integers.

In [None]:
# Ordinal codes for Profilo_MIFID: CAUTO < PRUDENTE < BILANCIATO < DINAMICO.
# 'ND' is encoded as 0 — presumably "not available"; confirm with the data owner.
mapping = {
    'CAUTO': 1,
    'PRUDENTE': 2,
    'BILANCIATO': 3,
    'DINAMICO':4,
    'ND':0
}
# `replace` leaves values that are not keys of the mapping untouched, so this
# cell can be re-run safely. `infer_objects()` then converts the resulting
# object column to a numeric dtype explicitly, instead of relying on the
# implicit downcast inside `replace`, which pandas has deprecated.
df_new['Profilo_MIFID'] = df_new['Profilo_MIFID'].replace(mapping).infer_objects()

# Ordinal codes for the income bands of Imp_Reddito, from lowest (1) to highest (5).
mapping2 = {
    'Bassissimo (<1200)': 1,
    'Basso (tra 1200 e 1500)': 2,
    'Medio (tra 1500 e 1800)': 3,
    'Alto (tra 1800 e 2500)':4,
    'Altissimo (>2500)':5
}

# Same explicit dtype inference as for Profilo_MIFID.
df_new['Imp_Reddito'] = df_new['Imp_Reddito'].replace(mapping2).infer_objects()

#### Numerical features

In [None]:
# Summary statistics for the int64 and float64 columns of df_new.
df_new.describe(include=['int64','float64'])

Unnamed: 0,Id_Cliente,Imp_Valore_del_Cliente,Eta,Anno_Apertura_primo_Conto,Imp_Reddito,Profilo_MIFID,Imp_Canone_Pricing_cc,Imp_Bonifici_vs_Competitors,Imp_Bonifici_vs_No_Competitors,Num_Utenze_in_essere,...,Num_attivit�_trading,Var_Numero_Utenze,Iscrizione_programma_Loyalty,Num_Punti_Programma_Loyalty,Imp_Liquidit�_Attuale,Imp_Gestito_attuale,Imp_Amministrato_attuale,Imp_Liquidit�_Attuale_6m,Imp_Gestito_attuale_6m,Imp_Amministrato_attuale_6m
count,377369.0,377369.0,377362.0,377369.0,157476.0,328112.0,349973.0,88714.0,200690.0,209937.0,...,86931.0,218355.0,377369.0,127239.0,377364.0,111188.0,187634.0,377356.0,110900.0,186298.0
mean,277680.735847,644.662654,46.03989,2004.80575,3.20934,2.550681,0.903291,3183.13,3352.734,4.454517,...,15.143344,0.198044,0.337174,1045.617303,15134.78,67704.61,76144.77,15320.84,65544.81,72338.62
std,160279.531097,2928.725757,13.146689,3.288509,1.369838,1.451032,1.825685,14231.11,15119.87,3.636854,...,78.764646,1.071767,0.472745,3515.585204,52046.23,194091.4,266963.8,52012.75,178093.0,263748.1
min,2.0,0.0,3.0,1994.0,1.0,0.0,0.0,0.01,0.01,1.0,...,1.0,-26.0,0.0,-50.0,-2998609.0,0.0,0.0,-2990061.0,0.0,0.0
25%,138834.0,46.87,37.0,2002.0,2.0,2.0,0.0,157.5825,350.0,2.0,...,1.0,0.0,0.0,150.0,629.21,7880.375,4966.84,631.3775,7808.85,4937.938
50%,277965.0,97.27,43.0,2005.0,3.0,3.0,0.0,479.0,805.8555,3.0,...,2.0,0.0,0.0,495.0,3850.86,23293.58,20841.53,3788.385,22849.53,20161.69
75%,416378.0,370.32,54.0,2008.0,4.0,4.0,0.325,1500.0,2047.996,6.0,...,6.0,0.0,1.0,1065.0,14178.72,64695.45,67668.76,13977.86,62973.51,64348.4
max,555150.0,377934.87,132.0,2010.0,5.0,4.0,5.95,1100000.0,1130700.0,40.0,...,6812.0,25.0,1.0,427530.0,16860200.0,17363780.0,48087640.0,14866190.0,18381480.0,57158640.0


#### Dropping of some features


In [None]:
# Columns removed from the dataset: the three '_6m' balance columns and the
# client identifier.
# NOTE(review): 'Imp_Liquidit�_Attuale_6m' is spelled with the replacement
# character exactly as the column appears in the data — looks like an encoding
# issue in the source file; confirm before renaming.
columns_to_drop = [
    'Imp_Gestito_attuale_6m',
    'Imp_Liquidit�_Attuale_6m',
    'Imp_Amministrato_attuale_6m',
    'Id_Cliente',
]
df_new = df_new.drop(columns=columns_to_drop)

# Preview the first rows of the reduced frame.
df_new.head()

Unnamed: 0,Flag_Richiesta_Estinzione_cc,Imp_Valore_del_Cliente,Flag_Apertura_Conto_Online,Flag_Possesso_piu_Conti,Eta,Provincia_Domicilio,Provincia_Residenza,Anno_Apertura_primo_Conto,Professione,Imp_Reddito,...,Flag_Contatto_Call_Center,Flag_variazione_Accredito_Stipendio,Num_Punti_Programma_Loyalty,Imp_Liquidit�_Attuale,Imp_Gestito_attuale,Imp_Amministrato_attuale,Flag_Trasferimento_Titoli_Out,Flag_Rifiuto_Carte,Flag_Rifiuto_Prestiti,Flag_Disattivazione_RID
0,no,65.03,0,0,38.0,BO,BO,2004.0,Impiegato,3.0,...,1.0,0.0,,1634.57,2978.4,2980.92,0,0,0,0
1,no,138.88,0,0,45.0,SA,SA,2000.0,Impiegato,4.0,...,1.0,-1.0,325.0,11918.26,,34916.15,0,0,0,0
2,no,546.54,0,0,61.0,VT,VT,2005.0,Altro/Nessuno,4.0,...,1.0,0.0,460.0,2671.95,,232776.62,1,0,0,0
3,no,68.69,0,0,33.0,MI,MI,2010.0,Impiegato,2.0,...,0.0,-1.0,,19211.31,15013.53,15017.53,0,0,0,0
4,no,2417.05,0,0,36.0,MI,MI,2001.0,Quadro,,...,1.0,,3650.0,1526.38,,,0,0,0,0


#### Creation of the new dataset with the changes applied

In [None]:
# NOTE(review): an earlier export of `df_model` to "Churn_clean.csv" was
# disabled here; `df_model` is not defined in the visible cells.
# Write the cleaned frame to CSV without the index column.
df_new.to_csv("Churn_clean_competition.csv", index=False)