## Pipeline

In [129]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.model_selection import train_test_split
from pycaret.classification import *

import pickle

In [114]:
class NullImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.fill_values = {}

    def fit(self, X, y=None):
        for column in X.columns:
            if X[column].isnull().any():
                if X[column].dtype == np.number:
                    self.fill_values[column] = X[column].mean()
                else:
                    self.fill_values[column] = X[column].mode()[0]
        return self

    def transform(self, X, y=None):
        X = X.copy()
        for column, value in self.fill_values.items():
            X[column].fillna(value, inplace=True)
        return X

In [115]:
class OutlierRemover(BaseEstimator, TransformerMixin):
    def __init__(self, method='remove'):
        self.method = method
        self.outliers_indices = []

    def fit(self, X, y=None):
        for column in X.select_dtypes(include=[np.number]).columns:
            Q1 = X[column].quantile(0.25)
            Q3 = X[column].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            self.outliers_indices.extend(X[(X[column] < lower_bound) | (X[column] > upper_bound)].index)
        return self

    def transform(self, X, y=None):
        if self.method == 'remove':
            return X.drop(index=self.outliers_indices)
        elif self.method == 'cap':
            X = X.copy()
            for column in X.select_dtypes(include=[np.number]).columns:
                Q1 = X[column].quantile(0.25)
                Q3 = X[column].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR
                X[column] = np.where(X[column] < lower_bound, lower_bound, X[column])
                X[column] = np.where(X[column] > upper_bound, upper_bound, X[column])
            return X

In [116]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, model=None, num_features=5):
        self.model = model or RandomForestClassifier()
        self.num_features = num_features
        self.selected_features = []

    def fit(self, X, y=None):
        self.model.fit(X, y)
        importances = pd.Series(self.model.feature_importances_, index=X.columns)
        self.selected_features = importances.nlargest(self.num_features).index
        return self

    def transform(self, X, y=None):
        return X[self.selected_features]

In [117]:
class PCAReducer(BaseEstimator, TransformerMixin):
    def __init__(self, n_components=5):
        self.n_components = n_components
        self.pca = PCA(n_components=self.n_components)

    def fit(self, X, y=None):
        self.pca.fit(X)
        return self

    def transform(self, X, y=None):
        return pd.DataFrame(self.pca.transform(X))

In [118]:
class DummiesTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return pd.get_dummies(X, drop_first=True)

In [119]:
def preprocessamento():
    pipeline = Pipeline(steps=[
        ('imputer', NullImputer()),                 # Substituição de nulos
        ('pca', PCAReducer(n_components=5)),        # Redução de dimensionalidade (PCA)
        ('dummies', DummiesTransformer())           # Criação de dummies (inclui 'posse_de_veiculo')
    ])
    return pipeline

In [120]:
df = pd.read_csv('credit_scoring.csv').drop(columns='Unnamed: 0')
df['data_ref'] = pd.to_datetime(df['data_ref'])
df['mes'] = df['data_ref'].dt.month_name()

df = df[df['mes'].isin(['December', 'November', 'October'])]
df = df.drop(columns=['data_ref', 'index'])
df = df.drop_duplicates()

df.head()

Unnamed: 0,sexo,posse_de_veiculo,posse_de_imovel,qtd_filhos,tipo_renda,educacao,estado_civil,tipo_residencia,idade,tempo_emprego,qt_pessoas_residencia,renda,mau,mes
450000,F,N,S,0,Empresário,Médio,Casado,Casa,33,0.512329,2.0,4316.53,True,October
450001,F,N,N,0,Assalariado,Médio,Casado,Casa,50,12.460274,2.0,34332.37,False,October
450002,M,N,S,0,Assalariado,Médio,Casado,Casa,59,8.219178,2.0,31638.0,False,October
450003,M,S,S,1,Assalariado,Médio,Casado,Casa,28,6.164384,3.0,61866.23,False,October
450004,F,N,S,0,Assalariado,Médio,Casado,Casa,34,13.523288,2.0,32342.36,False,October


In [121]:
df_ML = pd.get_dummies(df, drop_first=True)
X = df_ML.drop('mau', axis=1)
y = df_ML['mau']
df_ML.head(1)

Unnamed: 0,qtd_filhos,idade,tempo_emprego,qt_pessoas_residencia,renda,mau,sexo_M,posse_de_veiculo_S,posse_de_imovel_S,tipo_renda_Bolsista,tipo_renda_Empresário,tipo_renda_Pensionista,tipo_renda_Servidor público,educacao_Médio,educacao_Pós graduação,educacao_Superior completo,educacao_Superior incompleto,estado_civil_Separado,estado_civil_Solteiro,estado_civil_União,estado_civil_Viúvo,tipo_residencia_Casa,tipo_residencia_Com os pais,tipo_residencia_Comunitário,tipo_residencia_Estúdio,tipo_residencia_Governamental,mes_November,mes_October
450000,0,33,0.512329,2.0,4316.53,True,False,False,True,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,True


In [122]:
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.3, random_state=42)

clf = DecisionTreeClassifier(max_depth=3, random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Valutazione del modello
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))

Confusion Matrix:
[[14054     0]
 [ 2791     0]]

Classification Report:
              precision    recall  f1-score   support

       False       0.83      1.00      0.91     14054
        True       0.00      0.00      0.00      2791

    accuracy                           0.83     16845
   macro avg       0.42      0.50      0.45     16845
weighted avg       0.70      0.83      0.76     16845


Accuracy Score:
0.8343128524784803


---

In [123]:
# Exemplo de como usar o pipeline
pipeline = preprocessamento()
X_preprocessed = pipeline.fit_transform(X)

In [124]:
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.3, random_state=42)

clf = DecisionTreeClassifier(max_depth=3, random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Valutazione del modello
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))

Confusion Matrix:
[[14054     0]
 [ 2791     0]]

Classification Report:
              precision    recall  f1-score   support

       False       0.83      1.00      0.91     14054
        True       0.00      0.00      0.00      2791

    accuracy                           0.83     16845
   macro avg       0.42      0.50      0.45     16845
weighted avg       0.70      0.83      0.76     16845


Accuracy Score:
0.8343128524784803


In [130]:
"""with open('decision_tree_model.pkl', 'wb') as file:
    pickle.dump(clf, file)"""

In [139]:
df_ML = pd.get_dummies(df, drop_first=True)
X = df_ML.drop('mau', axis=1)
y = df_ML['mau']
df_ML.head(1)

# Exemplo de como usar o pipeline
pipeline = preprocessamento()
X_preprocessed = pipeline.fit_transform(X)

In [140]:
with open('decision_tree_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

predictions = loaded_model.predict(X_preprocessed)
print(predictions)

[False False False ... False False False]


In [142]:
from collections import Counter
Counter(predictions)

Counter({False: 56149})