In [31]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder, StandardScaler

## Carregando e tratando a base de dados

In [32]:
# Load the raw loan dataset; the path is relative to the notebook's working directory.
csv_path = "loan_default.csv"
df = pd.read_csv(csv_path)

In [33]:
# Column names split by kind: qualitative (categorical) vs. quantitative (numeric).
# Each list is routed to its own preprocessing pipeline further down the notebook.
categorical_features = [
    'loan_limit',
    'approv_in_adv',
    'Gender',
    'loan_type',
    'loan_purpose',
    'Credit_Worthiness',
    'open_credit',
    'business_or_commercial',
    'Neg_ammortization',
    'interest_only',
    'lump_sum_payment',
    'construction_type',
    'occupancy_type',
    'Secured_by',
    'total_units',
    'credit_type',
    'co_applicant_credit_type',
    'age',
    'submission_of_application',
    'Region',
    'Security_Type',
]
numeric_features = [
    'loan_amount',
    'rate_of_interest',
    'Interest_rate_spread',
    'Upfront_charges',
    'term',
    'property_value',
    'income',
    'Credit_Score',
    'LTV',
    'dtir1',
]

In [34]:
# Preprocessing for the categorical columns:
#   1. fill missing values with each column's most frequent value,
#   2. one-hot encode, dropping the first category of every feature.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [35]:
# Preprocessing for the numeric columns:
#   1. fill missing values with each column's median,
#   2. standardize with StandardScaler.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

In [36]:
# Route each group of columns to its own preprocessing pipeline.
# No `remainder` is given, so ColumnTransformer's default handling applies to
# any dataframe column that appears in neither feature list.
column_routes = [
    ('cat', categorical_transformer, categorical_features),
    ('num', numeric_transformer, numeric_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

## Treinando o modelo

In [37]:
# End-to-end pipeline: preprocessing followed by K-Means with 3 clusters.
# The final step is registered under the name 'classifier' even though K-Means
# is a clustering estimator; the name is kept as-is because it is part of the
# pipeline object that gets persisted later in the notebook.
kmeans = KMeans(n_clusters=3, random_state=42)  # fixed seed for repeatable runs
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', kmeans),
])

In [38]:
# Fit the whole pipeline on the dataframe and collect one cluster label per row.
# The full `df` is passed in; the 'preprocessor' step picks the columns it uses.
grupo = model.fit_predict(X=df)

# Quick look at the assigned labels (numpy abbreviates the printed array).
print(grupo)

[1 1 0 ... 0 0 0]


In [39]:
# Persist the fitted pipeline (preprocessing + K-Means) to disk.
# The call is left as the cell's last expression so the written path is displayed.
model_path = 'propensao_inadimplencia_clusterization.joblib'
joblib.dump(model, model_path)

['propensao_inadimplencia_clusterization.joblib']

## Criando personas e verificando a média de risco

In [40]:
# Attach each row's cluster label to the original dataframe as a new 'Grupo' column.
group_column = 'Grupo'
df[group_column] = grupo

In [41]:
# Persona for cluster 0: MEDIUM risk.
# In the recorded run this cluster's rate (~17.0%) falls between cluster 2
# (~9.9%, lowest) and cluster 1 (~36.1%, highest), so it is the middle tier,
# not the low-risk one.
# NOTE(review): assumes `Status` is a 0/1 default flag, so mean * 100 is the
# default rate in percent — confirm against the dataset dictionary.
# K-Means cluster ids are arbitrary; re-check this ranking whenever the model is refit.
# A single .loc selection (row mask + column) replaces the chained df[mask]['Status'].
df.loc[df['Grupo'] == 0, 'Status'].mean() * 100

16.988319654911628

In [42]:
# Persona for cluster 1: HIGH risk.
# In the recorded run this cluster has the highest rate of the three (~36.1%,
# versus ~17.0% for cluster 0 and ~9.9% for cluster 2), so it is the high-risk
# tier, not the medium one.
# NOTE(review): assumes `Status` is a 0/1 default flag, so mean * 100 is the
# default rate in percent — confirm against the dataset dictionary.
# K-Means cluster ids are arbitrary; re-check this ranking whenever the model is refit.
# A single .loc selection (row mask + column) replaces the chained df[mask]['Status'].
df.loc[df['Grupo'] == 1, 'Status'].mean() * 100

36.14762522990371

In [43]:
# Persona for cluster 2: LOW risk.
# In the recorded run this cluster has the lowest rate of the three (~9.9%,
# versus ~17.0% for cluster 0 and ~36.1% for cluster 1), so it is the low-risk
# tier, not the high one.
# NOTE(review): assumes `Status` is a 0/1 default flag, so mean * 100 is the
# default rate in percent — confirm against the dataset dictionary.
# K-Means cluster ids are arbitrary; re-check this ranking whenever the model is refit.
# A single .loc selection (row mask + column) replaces the chained df[mask]['Status'].
df.loc[df['Grupo'] == 2, 'Status'].mean() * 100

9.852658552210123