# Data Processing

In [3]:
# Bibliotecas
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import pickle

# Buscar dados do arquivo CSV
csv_file = "data/01_raw/Abandono_clientes (11).csv"
df = pd.read_csv(csv_file)

# Remove colunas não utilizadas
df1 = df.drop(columns = ['RowNumber', 'CustomerId', 'Surname'])

# Separando as variaveis independentes e dependentes
y = df1['Exited']
X = df1.copy()
X = df1.drop(columns='Exited')

# Processamento das variáveis continuas e categóricas
x_cont = ['CreditScore', 'Balance', 'Age', 'NumOfProducts', 'EstimatedSalary', 'Tenure']
x_cat = list(set(X)-set(x_cont))
x_dummies = X[x_cat]
le = LabelEncoder()
X['Gender'] = le.fit_transform(X['Gender']) 
x_final = pd.get_dummies(data=X, columns=['Geography'])

# Feature Engineering
x_final['Salary_per_Age'] = x_final['EstimatedSalary'] / x_final['Age']
x_final['CreditScore_per_Products'] = x_final['CreditScore'] / x_final['NumOfProducts']
x_final['CreditScore_per_Salary'] = x_final['CreditScore'] / x_final['EstimatedSalary']
new_cont = ['Salary_per_Age', 'CreditScore_per_Products', 'CreditScore_per_Salary']
for var_cont in new_cont:
    x_cont.append(var_cont)

# Normalização dos dados
# Como foi adotado RandomForest não seria necessário normalizar - mantive para testar
scaler = MinMaxScaler()
x_final[x_cont] = scaler.fit_transform(x_final[x_cont])

# Armazena dados e scaler
df_to_save = x_final.copy()
df_to_save['Exited'] = y
df_to_save.to_csv('data/03_primary/data_input_model.csv', index=False)
with open('data/06_models/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

In [None]:
# Bibliotecas
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import pickle

def load_data():
    # Buscar dados do arquivo CSV
    csv_file = "data/01_raw/Abandono_clientes (11).csv"
    df = pd.read_csv(csv_file)

    # Remove colunas não utilizadas
    df1 = df.drop(columns = ['RowNumber', 'CustomerId', 'Surname'])

    # Separando as variaveis independentes e dependentes
    y = df1['Exited']
    X = df1.copy()
    X = df1.drop(columns='Exited')

    return X, y

def process_data():
    # Processamento das variáveis continuas e categóricas
    x_cont = ['CreditScore', 'Balance', 'Age', 'NumOfProducts', 'EstimatedSalary', 'Tenure']
    x_cat = list(set(X)-set(x_cont))
    x_dummies = X[x_cat]
    le = LabelEncoder()
    X['Gender'] = le.fit_transform(X['Gender']) 
    x_final = pd.get_dummies(data=X, columns=['Geography'])

    return x_cont, x_final

def feature_engineering():
    # Feature Engineering
    x_final = x_final.copy()
    x_final['Salary_per_Age'] = x_final['EstimatedSalary'] / x_final['Age']
    x_final['CreditScore_per_Products'] = x_final['CreditScore'] / x_final['NumOfProducts']
    x_final['CreditScore_per_Salary'] = x_final['CreditScore'] / x_final['EstimatedSalary']
    new_cont = ['Salary_per_Age', 'CreditScore_per_Products', 'CreditScore_per_Salary']
    for var_cont in new_cont:
        x_cont.append(var_cont)

    return x_final

def normalize():
    # Normalização dos dados
    # Como foi adotado RandomForest não seria necessário normalizar - mantive para testar
    scaler = MinMaxScaler()
    x_final[x_cont] = scaler.fit_transform(x_final[x_cont])

    return x_final

def save():
    # Armazena dados e scaler
    df_to_save = x_final.copy()
    df_to_save['Exited'] = y
    df_to_save.to_csv('data/03_primary/data_input_model.csv', index=False)
    with open('data/06_models/scaler.pkl', 'wb') as f:
        pickle.dump(scaler, f)

# Data Science

In [1]:
# Bibliotecas
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import pickle

# Buscar dados do arquivo CSV
csv_file = "data/01_raw/Abandono_teste (11).csv"
df = pd.read_csv(csv_file, sep=";")

# Carrega scaler no arquivo pickle
with open('data/06_models/scaler.pkl', 'rb') as file:
    scaler = pickle.load(file)

# Carrega modelo no arquivo pickle
with open('data/06_models/rf_model.pkl', 'rb') as file:
    rf_model = pickle.load(file)

# Remove colunas que não serão utilizadas
df1 = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

# Separação das variáveis
x_cont = ['CreditScore', 'Balance', 'Age', 'NumOfProducts', 'EstimatedSalary', 'Tenure']
x_cat = list(set(df1)-set(x_cont))

# Processamento das variáveis
x_dummies = df1[x_cat]
le = LabelEncoder()
df1['Gender'] = le.fit_transform(df1['Gender']) 
df_final = pd.get_dummies(data=df1, columns=['Geography'])

# Feature Engineering
df_final['Salary_per_Age'] = df_final['EstimatedSalary'] / df_final['Age']
df_final['CreditScore_per_Products'] = df_final['CreditScore'] / df_final['NumOfProducts']
df_final['CreditScore_per_Salary'] = df_final['CreditScore'] / df_final['EstimatedSalary']

# Acrescenta novas variáveis na lista
new_cont = ['Salary_per_Age', 'CreditScore_per_Products', 'CreditScore_per_Salary']
for var_cont in new_cont:
    x_cont.append(var_cont)

# Normalização dos dados
df_final[x_cont] = scaler.fit_transform(df_final[x_cont])

# Realiza predição
df['predictedValues'] = rf_model.predict(df_final)

# Saida no formato solicitado no Case
df_out = df[['RowNumber', 'predictedValues']]

# Armazena resultado em CSV
df_out.to_csv('data/08_reporting/predicted_data.csv', index=False)


df_out


Unnamed: 0,RowNumber,predictedValues
0,10001,0
1,10002,0
2,10003,0
3,10004,0
4,10005,0
...,...,...
995,10996,0
996,10997,1
997,10998,0
998,10999,0


In [None]:
# Bibliotecas
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import pickle

def load_data():
    # Buscar dados do arquivo CSV
    csv_file = "data/01_raw/Abandono_teste (11).csv"
    df = pd.read_csv(csv_file, sep=";")

    # Carrega scaler no arquivo pickle
    with open('data/06_models/scaler.pkl', 'rb') as file:
        scaler = pickle.load(file)

    # Carrega modelo no arquivo pickle
    with open('data/06_models/rf_model.pkl', 'rb') as file:
        rf_model = pickle.load(file)

    # Remove colunas que não serão utilizadas
    df1 = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

def processing_data():
    # Separação das variáveis
    x_cont = ['CreditScore', 'Balance', 'Age', 'NumOfProducts', 'EstimatedSalary', 'Tenure']
    x_cat = list(set(df1)-set(x_cont))

    # Processamento das variáveis
    x_dummies = df1[x_cat]
    le = LabelEncoder()
    df1['Gender'] = le.fit_transform(df1['Gender']) 
    df_final = pd.get_dummies(data=df1, columns=['Geography'])

    # Feature Engineering
    df_final['Salary_per_Age'] = df_final['EstimatedSalary'] / df_final['Age']
    df_final['CreditScore_per_Products'] = df_final['CreditScore'] / df_final['NumOfProducts']
    df_final['CreditScore_per_Salary'] = df_final['CreditScore'] / df_final['EstimatedSalary']

    # Acrescenta novas variáveis na lista
    new_cont = ['Salary_per_Age', 'CreditScore_per_Products', 'CreditScore_per_Salary']
    for var_cont in new_cont:
        x_cont.append(var_cont)

    # Normalização dos dados
    df_final[x_cont] = scaler.fit_transform(df_final[x_cont])

def prediction():
    # Realiza predição
    df['predictedValues'] = rf_model.predict(df_final)

    # Saida no formato solicitado no Case
    df_out = df[['RowNumber', 'predictedValues']]

    # Armazena resultado em CSV
    df_out.to_csv('data/08_reporting/predicted_data.csv', index=False)


df_out
