In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import modulos

In [2]:
# Load the raw bank dataset from the working directory and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="BankData.csv")
df.head(n=5)

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.76
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,1,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0


In [3]:
# Ask the project helper to classify the dataframe's columns; it returns three
# collections, unpacked here as categorical, discrete and continuous variables.
(
    categoric_vars,
    discrete_vars,
    continues_vars,
) = modulos.getColumnsDataTypes(df=df)

## 2) Feature Engineering

### 2.1) Configuración de Variables:

In [4]:
# Name of the target column.
y = 'Attrition_Flag'

# Exclude the CLIENTNUM column and the target from the feature lists.
# The membership checks keep this cell idempotent: a bare list.remove() raises
# ValueError when the cell is executed a second time on the same lists.
if 'CLIENTNUM' in continues_vars:
    continues_vars.remove('CLIENTNUM')
if y in categoric_vars:
    categoric_vars.remove(y)

### 2.2) Imputación de Variables

##### 2.2.1) Variables Continuas

In [5]:
# Fraction of missing values in each continuous column (0.0 means no nulls).
df.loc[:, continues_vars].isna().mean()

Customer_Age             0.0
Months_on_book           0.0
Credit_Limit             0.0
Total_Revolving_Bal      0.0
Avg_Open_To_Buy          0.0
Total_Amt_Chng_Q4_Q1     0.0
Total_Trans_Amt          0.0
Total_Trans_Ct           0.0
Total_Ct_Chng_Q4_Q1      0.0
Avg_Utilization_Ratio    0.0
dtype: float64

##### 2.2.2) Variables Discretas

In [6]:
# Fraction of missing values in each discrete column (0.0 means no nulls).
df.loc[:, discrete_vars].isna().mean()

Dependent_count             0.0
Total_Relationship_Count    0.0
Months_Inactive_12_mon      0.0
Contacts_Count_12_mon       0.0
dtype: float64

##### 2.2.3) Variables Categóricas

In [7]:
# Fraction of missing values in each categorical column (0.0 means no nulls).
df.loc[:, categoric_vars].isna().mean()

Gender             0.0
Education_Level    0.0
Marital_Status     0.0
Income_Category    0.0
Card_Category      0.0
dtype: float64

### 2.3) Codificación de Variables Categóricas

In [8]:
# Frequency encoding table: each education label mapped to the number of rows carrying it.
educ_level_map = (
    df.loc[:, 'Education_Level']
    .value_counts()
    .to_dict()
)
educ_level_map

{'Graduate': 3128,
 'High School': 2013,
 'Unknown': 1519,
 'Uneducated': 1487,
 'College': 1013,
 'Post-Graduate': 516,
 'Doctorate': 451}

In [9]:
# Replace every education label with its frequency taken from educ_level_map.
# The dtype guard keeps the cell idempotent: Series.map returns NaN for values that
# are not keys of the dict, so re-running the unguarded assignment on the already
# encoded (numeric) column would wipe it out.
if not pd.api.types.is_numeric_dtype(df['Education_Level']):
    df['Education_Level'] = df['Education_Level'].map(educ_level_map)
df.head()

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,768805383,Existing Customer,45,M,3,2013,Married,$60K - $80K,Blue,39,...,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061
1,818770008,Existing Customer,49,F,5,3128,Single,Less than $40K,Blue,44,...,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105
2,713982108,Existing Customer,51,M,3,3128,Married,$80K - $120K,Blue,36,...,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0
3,769911858,Existing Customer,40,F,4,2013,Unknown,Less than $40K,Blue,34,...,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.76
4,709106358,Existing Customer,40,M,3,1487,Married,$60K - $80K,Blue,21,...,1,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import preprocessors as pp

def instanciatePipeline(df, y, n_estimators=20, max_depth=100, random_state=None):
    """Build the unfitted churn pipeline: a categorical encoder followed by a random forest.

    Parameters
    ----------
    df
        Data passed to ``modulos.getColumnsDataTypes`` to find the categorical
        columns. It must still contain the target column ``y``.
    y : str
        Name of the target column; it is removed from the categorical columns
        handed to the encoder.
    n_estimators : int, default 20
        Number of trees of the random forest.
    max_depth : int, default 100
        Maximum depth of each tree.
    random_state : int or None, default None
        Seed forwarded to the random forest; pass an int for reproducible fits.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with the steps ``'categorical-encoder'`` and ``'classifier'``.

    Raises
    ------
    ValueError
        If ``y`` is not among the categorical columns reported for ``df``.
    """
    # Only the categorical columns are needed here; the other two groups are ignored.
    categoric_vars, _, _ = modulos.getColumnsDataTypes(df=df)

    # The target must not be encoded as a feature.
    categoric_vars.remove(y)

    bankChurner_Pipeline = Pipeline(steps=[
        ('categorical-encoder',
            pp.categoricalEncoderOperator(varNames=categoric_vars)),

        ('classifier',
            RandomForestClassifier(
                n_estimators=n_estimators,
                max_depth=max_depth,
                random_state=random_state,
            ))
    ])

    return bankChurner_Pipeline

In [11]:
from sklearn.model_selection import train_test_split

# Reload the raw file so the pipeline receives the original categorical labels
# (an earlier cell overwrote Education_Level in the previous `df`).
df = pd.read_csv("BankData.csv")

X = df.drop(['Attrition_Flag', 'CLIENTNUM'], axis=1)
y = df['Attrition_Flag']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=2022)

# The last pipeline step is a classifier, which has no transform(), so calling
# fit_transform on the full pipeline fails. Slice the classifier off and run only
# the feature-engineering steps.
fe_steps = instanciatePipeline(df, 'Attrition_Flag')[:-1]
dfSalida = fe_steps.fit_transform(X_train, y_train)

# Attach the binary-encoded target for the training rows only. get_dummies returns
# a one-column frame here, so take that column explicitly as a Series.
# NOTE(review): assignment aligns on the index, so this assumes the encoder keeps
# X_train's index on its output; confirm in preprocessors.
dfSalida['Attrition_Flag'] = pd.get_dummies(y_train, drop_first=True).iloc[:, 0]



In [12]:
# Persist the feature-engineered training frame (index included) to the working directory.
dfSalida.to_csv(path_or_buf="FE_DF_Salida.csv")