In [1]:
# %load utilities
#!/usr/bin/env python

# In[3]:

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
from scipy.stats import skewtest
from sklearn import metrics

# Silence every warning for the whole notebook.
# NOTE(review): this also hides library deprecation warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")

def romanToInt(i):
    """Convert a Roman numeral string to an integer.

    The string is scanned left to right. At every position a two-character
    subtractive pair (IV, IX, XL, XC, CD, CM) is preferred over a single symbol.

    Args:
        i: Roman numeral built from the upper-case symbols I, V, X, L, C, D, M.

    Returns:
        Sum of the matched symbol values (0 for an empty string).

    Raises:
        KeyError: if a character is not one of the known symbols.
    """
    values = {
        'IV': 4, 'IX': 9, 'XL': 40, 'XC': 90, 'CD': 400, 'CM': 900,
        'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000,
    }
    total = 0
    pos = 0
    size = len(i)
    while pos < size:
        pair = i[pos:pos + 2]
        if len(pair) == 2 and pair in values:
            # subtractive pair consumes two characters
            total += values[pair]
            pos += 2
            continue
        total += values[i[pos]]
        pos += 1
    return total

def skew_df(df):
    """Run scipy's skewness test on every column of a numeric DataFrame.

    Args:
        df: DataFrame whose columns are tested one by one.

    Returns:
        DataFrame with rows 'skewness' (test statistic) and 'p_value', one
        column per input column, rounded to 2 decimals.
    """
    statistic, pvalue = skewtest(df)
    stacked = np.vstack([statistic, pvalue])
    return pd.DataFrame(
        np.round(stacked, 2),
        index=['skewness', 'p_value'],
        columns=df.columns,
    )

def plot_roc_curve(fpr,tpr):
    """Draw a ROC curve on the current matplotlib axes.

    Args:
        fpr: false positive rates (x values).
        tpr: true positive rates (y values).

    The red diagonal from (0, 0) to (1, 1) is drawn as a reference line.
    The figure is not shown here; the caller decides when to call plt.show().
    """
    ax = plt.gca()
    ax.plot(fpr, tpr)
    ax.plot([0, 1], [0, 1], 'r')
    ax.set_xlabel('False positive rate')
    ax.set_ylabel('True positive rate')
    ax.set_title('ROC - TPR vs FPR')
    
def printCustomMetrics(y_test, y_pred):
    """Print accuracy, precision, recall and F1 for a set of predictions.

    Args:
        y_test: true labels.
        y_pred: predicted labels.

    Each score is rounded to 2 decimals with the builtin round(), which works
    whether the metric comes back as a numpy scalar or as a plain Python float
    (a plain float has no .round method).
    """
    scorers = (
        ("Accuracy:", metrics.accuracy_score),
        ("Precision:", metrics.precision_score),
        ("Recall:", metrics.recall_score),
        ("f1:", metrics.f1_score),
    )
    for label, scorer in scorers:
        print(label, round(scorer(y_test, y_pred), 2))


In [2]:
# Load the raw loan records from CSV and print (rows, columns)
df = pd.read_csv(filepath_or_buffer='loan_default_prediction.csv')
print(df.shape)

(87500, 30)


In [3]:
# Columns that are removed before any further processing
vdrop = [
    'ID', 'Validation', 'Designation', 'Debt_to_Income', 'Postal_Code', 'Deprecatory_Records',
    'Inquiries', 'Gross_Collection', 'Sub_GGGrade', 'Total_Unpaid_CL', 'File_Status', 'Claim_Type', 'Due_Fee',
]
df = df.drop(columns=vdrop)
df.shape

(87500, 17)

In [4]:
# Years of experience -> integer. Only the first character of the label is read:
# '>' becomes 10, '<' becomes 1, anything else is parsed as a single digit.
def _experience_to_int(label):
    head = label[0:1]
    if head == '>':
        return 10
    if head == '<':
        return 1
    return int(head)

df['Experience'] = df['Experience'].apply(_experience_to_int)
# Duration stays a string category; only the space before "years" is removed
df['Duration'] = df['Duration'].apply(lambda label: label.replace(' years', 'years')).astype(str)
# GGGrade is an ordinal grade written as a Roman numeral -> integer
df['GGGrade'] = df['GGGrade'].apply(romanToInt).astype(int)
# New feature debt_to_income: unpaid amount relative to yearly income
# (a kind of "effort rate")
df['debt_to_income'] = df['Unpaid_Amount'].div(df['Yearly_Income'])
# inspect the result
#df.head()

In [5]:
# Drop observations that have at least one missing feature value
df = df.dropna()
# Drop exact duplicate rows. drop_duplicates() returns a new frame, so the
# result has to be assigned back — a bare call leaves df unchanged.
df = df.drop_duplicates()
df.shape

(77376, 18)

In [6]:
# Preview the first five rows of the cleaned frame (head() defaults to 5 rows)
df.head()

Unnamed: 0,Asst_Reg,GGGrade,Experience,Yearly_Income,Home_Status,Unpaid_2_years,Already_Defaulted,Lend_Amount,Interest_Charged,Usage_Rate,Present_Balance,State,Account_Open,Duration,Unpaid_Amount,Reason,Default,debt_to_income
0,421802,2,10,633600.0,MORTGAGE,0,0,42023.25,15.39,88.924,607161.9,California,17,3years,31216.05,debt consolidation,0,0.049268
1,3964312,4,7,85483.2,RENT,0,0,38133.0,9.94,102.856,269234.06,NC,15,5years,11660.49,debt consolidation,0,0.136407
2,4247560,3,1,79200.0,RENT,0,0,17100.0,22.35,60.372,22476.53,Florida,7,5years,5637.87,major purchase,0,0.071185
3,197179,3,1,61600.0,RENT,0,0,5130.0,10.36,116.272,15242.09,NewJersey,9,3years,15607.17,major purchase,1,0.253363
4,4646684,5,2,68053.92,RENT,0,0,19665.0,13.68,127.28,65433.94,LA,10,5years,27472.86,debt consolidation,0,0.403693


In [7]:
# Feature groups used to build X and to choose a transformation per group.
# Continuous numeric features
v_num_cont = [
    'Asst_Reg',
    'Experience',
    'Yearly_Income',
    'Lend_Amount',
    'Interest_Charged',
    'Usage_Rate',
    'Present_Balance',
    'Unpaid_Amount',
    'debt_to_income',
]
# Discrete numeric (count-like) features
v_num_disc = [
    'Unpaid_2_years',
    'Already_Defaulted',
    'Account_Open',
]
# Categorical features (one-hot encoded later)
# NOTE(review): 'GGGrade' is in none of the three lists, so it is left out of X — confirm this is intended.
v_cat_ord = [
    'Home_Status',
    'State',
    'Reason',
    'Duration',
]

In [8]:
from scipy import stats
display(df.shape)
# Remove observations whose home status is 'OTHER' or 'NONE'
keep_home_status = ~df['Home_Status'].isin(['OTHER', 'NONE'])
df = df[keep_home_status]
# Remove rows with outliers: any continuous feature 3 or more standard
# deviations above/below its mean
abs_z = np.abs(stats.zscore(df[v_num_cont]))
within_3_std = (abs_z < 3).all(axis=1)
df = df[within_3_std]
df.shape


(77376, 18)

(74542, 18)

# Undersampling

In [9]:
# How many observations have Default == 1
display((df['Default']==1).sum())
# Balanced undersample: 5000 rows with default = 1 ...
defaulted = df[df['Default']==1].sample(n=5000, random_state=101)
# ... and 5000 rows with default = 0
notdefault = df[df['Default']==0].sample(n=5000, random_state=101)
# Stack both samples
df = pd.concat([defaulted,notdefault],axis=0)
# Shuffle the rows. The shuffle is seeded like the two samples above so the
# later train/test split contains the same rows on every run.
df = df.sample(frac=1, random_state=101).reset_index(drop=True)
df.shape

14184

(10000, 18)

In [10]:
# Summary statistics of the discrete numeric features after undersampling
df.loc[:, v_num_disc].describe()

Unnamed: 0,Unpaid_2_years,Already_Defaulted,Account_Open
count,10000.0,10000.0,10000.0
mean,0.3159,0.0053,12.5269
std,0.855792,0.076632,5.129104
min,0.0,0.0,2.0
25%,0.0,0.0,9.0
50%,0.0,0.0,12.0
75%,0.0,0.0,15.0
max,16.0,2.0,51.0


# Train test split

In [11]:
# Features (X): all numeric and categorical columns; target (y): the Default flag
feature_cols = v_num_cont + v_num_disc + v_cat_ord
X = df[feature_cols]
y = df['Default']

# Hold out 20% of the rows as the test set (fixed seed)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


# Pipelines: transformação de variáveis

In [12]:
# Test the skewness of the numeric training features
num_cols = v_num_cont + v_num_disc
dskew = skew_df(X_train[num_cols])
# Show the statistic / p-value table
display(dskew)
# Skewed features: p-value below 5%
v_skew = list(dskew.columns[dskew.loc['p_value'] < 0.05])
# Symmetric features are the remaining ones. They are kept in their original
# column order; a set difference would return them in arbitrary order, and
# that order decides the column layout of the transformed data.
v_sym = [col for col in num_cols if col not in v_skew]
# check
v_skew

Unnamed: 0,Asst_Reg,Experience,Yearly_Income,Lend_Amount,Interest_Charged,Usage_Rate,Present_Balance,Unpaid_Amount,debt_to_income,Unpaid_2_years,Already_Defaulted,Account_Open
skewness,13.72,-5.77,38.53,23.17,-0.45,-5.86,35.08,42.59,33.6,76.21,108.02,36.18
p_value,0.0,0.0,0.0,0.0,0.65,0.0,0.0,0.0,0.0,0.0,0.0,0.0


['Asst_Reg',
 'Experience',
 'Yearly_Income',
 'Lend_Amount',
 'Usage_Rate',
 'Present_Balance',
 'Unpaid_Amount',
 'debt_to_income',
 'Unpaid_2_years',
 'Already_Defaulted',
 'Account_Open']

In [13]:
# For every categorical feature, list the categories that cover at least 5%
# of the rows and those that cover less than 5%.
for i in v_cat_ord:
    c = df[i].value_counts() / len(df)
    frequent = list(c[c >= .05].index)
    print('Features com predominância em mais de 5% das observações:' + str(frequent))
    rare = list(c[c < .05].index)
    print('Features com predominância em menos de 5% das observações:' + str(rare))


Features com predominância em mais de 5% das observações:['MORTGAGE', 'RENT', 'OWN']
Features com predominância em menos de 5% das observações:[]
Features com predominância em mais de 5% das observações:['California', 'TX', 'Newyork', 'Florida']
Features com predominância em menos de 5% das observações:['IL', 'NewJersey', 'PA', 'GA', 'Ohio', 'NC', 'MI', 'VA', 'Maryland', 'AZ', 'CO', 'WA', 'MA', 'MN', 'TN', 'MO', 'NV', 'IN', 'OR', 'SC', 'AL', 'WI', 'CT', 'LA', 'KS', 'AR', 'OK', 'KY', 'UT', 'HI', 'RI', 'NM', 'WV', 'DC', 'NH', 'MS', 'MT', 'AK', 'DE', 'WY', 'SD', 'NE', 'VT', 'ND', 'ME']
Features com predominância em mais de 5% das observações:['debt  consolidation', 'credit  card', 'home  improvement']
Features com predominância em menos de 5% das observações:['other', 'major  purchase', 'medical', 'car', 'small  business', 'moving', 'vacation', 'house', 'wedding', 'RENTwable  energy']
Features com predominância em mais de 5% das observações:['3years', '5years']
Features com predominância 

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PowerTransformer, FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

# Define the transformation applied to each group of columns
transformer = ColumnTransformer([
    ('yeoj', PowerTransformer(), v_skew), #skewness-correcting power transform for the skewed features
    ('std', StandardScaler(), v_sym),     #standardise the symmetric features (subtract mean, divide by std)
    ('oneh', OneHotEncoder(min_frequency=0.05,handle_unknown='ignore', sparse_output=False), v_cat_ord)
])

# Fit the transformer on the training data only
pfit = transformer.fit(X_train)

# Build column names for the one-hot block from the fitted encoder
# (third entry of transformers_): all categories seen per feature, and the
# categories the encoder grouped as infrequent (None when a feature has none).
categories= pfit.transformers_[2][1].categories_
categories_out=pfit.transformers_[2][1].infrequent_categories_

# Flat list of the categories that are NOT infrequent, over all features.
# NOTE(review): v_onehot_drop is not referenced again in the visible notebook.
v_onehot_drop=list(np.concatenate([np.concatenate([categories[i][np.isin(categories[i], categories_out[i], invert=True)]],axis=0) 
                        for i in range(0,len(categories))],axis=0))


# Same kept categories per feature k, followed by one 'Other_Cat<k>' name when
# feature k has infrequent categories.
# NOTE(review): assumes the encoder emits the infrequent column last within each
# feature — consistent with the displayed output, but confirm against the sklearn docs.
v_onehot=list(np.concatenate([(np.concatenate((np.array(j),
                       (np.array(['Other_Cat'+str(k)]) if categories_out[k] is not None else np.array([]))),axis=0)) 
                         for k,j in enumerate(
                         [np.concatenate([categories[i][np.isin(categories[i], categories_out[i], invert=True)]],axis=0) 
                        for i in range(0,len(categories))]
                         )]))

# Transform train and test X with the transformer fitted on the training set
X_train_transf = pd.DataFrame(pfit.transform(X_train),columns = (v_skew + v_sym+v_onehot)
                              ,index=X_train.index)

X_test_transf = pd.DataFrame(pfit.transform(X_test),columns = (v_skew + v_sym+v_onehot)
                            ,index=X_test.index)

display(round(X_train_transf.describe(),2))

# Check whether the numeric columns became "less skewed" after the transformation
skew_df(X_train_transf[v_skew+v_sym])

Unnamed: 0,Asst_Reg,Experience,Yearly_Income,Lend_Amount,Usage_Rate,Present_Balance,Unpaid_Amount,debt_to_income,Unpaid_2_years,Already_Defaulted,...,Florida,Newyork,TX,Other_Cat1,credit card,debt consolidation,home improvement,Other_Cat2,3years,5years
count,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,...,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0
mean,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,...,0.06,0.08,0.08,0.62,0.24,0.6,0.05,0.11,0.72,0.28
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.25,0.27,0.27,0.48,0.43,0.49,0.22,0.31,0.45,0.45
min,-2.27,-1.52,-3.99,-2.64,-2.37,-3.9,-3.46,-2.09,-0.48,-0.07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.86,-0.83,-0.68,-0.74,-0.72,-0.8,-0.67,-0.76,-0.48,-0.07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-0.0,0.05,0.02,-0.03,0.05,-0.06,-0.03,-0.01,-0.48,-0.07,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
75%,0.93,1.08,0.69,0.77,0.76,0.85,0.67,0.74,-0.48,-0.07,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0
max,1.48,1.08,2.83,1.99,2.88,2.08,3.21,2.41,2.1,13.93,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,Asst_Reg,Experience,Yearly_Income,Lend_Amount,Usage_Rate,Present_Balance,Unpaid_Amount,debt_to_income,Unpaid_2_years,Already_Defaulted,Account_Open,Interest_Charged
skewness,-5.85,-9.02,0.0,-1.61,-6.95,-2.19,1.09,3.0,42.19,105.32,-0.0,-0.45
p_value,0.0,0.0,1.0,0.11,0.0,0.03,0.28,0.0,0.0,0.0,1.0,0.65


# Model Evaluation on test sample

In [15]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix,roc_auc_score, roc_curve

In [None]:
# Baseline comparison: each classifier is fitted on the transformed training
# set and scored on the transformed test set.
# Bernoulli naive Bayes is not used because it makes more sense for binary variables.
#from sklearn.naive_bayes import BernoulliNB

import seaborn as sns 

rnd_clf = RandomForestClassifier(n_estimators=100)
log_clf = LogisticRegression()
svm_clf = SVC()
dtc_clf = DecisionTreeClassifier()
knn_clf = KNeighborsClassifier()
mpl_clf = MLPClassifier()
gnb_clf = GaussianNB()
# TODO: add the remaining supervised-learning models except xgboost
#bnb_clf=BernoulliNB()

# Majority vote over the predicted labels of all base models
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svm', svm_clf),
                ('dtc', dtc_clf), ('knn', knn_clf), ('mpl', mpl_clf), ('gnb', gnb_clf)],
    voting='hard'
)

dataMetrics = pd.DataFrame(columns=['Accuracy', 'Precision', 'Recall', 'F1', 'ROC_AUC'])
for i, clf in enumerate([log_clf, rnd_clf, svm_clf, dtc_clf, knn_clf, mpl_clf, gnb_clf, voting_clf], start=1):
    clf.fit(X_train_transf, y_train)
    y_pred = clf.predict(X_test_transf)

    # A ROC curve needs a continuous score. Hard labels give a single
    # operating point, so they are used only when the model offers nothing else
    # (e.g. hard voting).
    if hasattr(clf, 'predict_proba'):
        # assumes the positive class (Default == 1) is the second entry of classes_
        y_score = clf.predict_proba(X_test_transf)[:, 1]
    elif hasattr(clf, 'decision_function'):
        y_score = clf.decision_function(X_test_transf)
    else:
        y_score = y_pred

    # Metrics table, one row per model class. The builtin round() is used
    # because it also works when a metric is returned as a plain Python float.
    dataMetrics.loc[clf.__class__.__name__] = [
        round(metrics.accuracy_score(y_test, y_pred), 2),
        round(metrics.precision_score(y_test, y_pred), 2),
        round(metrics.recall_score(y_test, y_pred), 2),
        round(metrics.f1_score(y_test, y_pred), 2),
        round(roc_auc_score(y_test, y_score), 2),
    ]

    # Confusion matrix; the colour map alternates between consecutive models
    cf_matrix = confusion_matrix(y_test, y_pred)
    if i % 2 == 0:
        sns.heatmap(cf_matrix, annot=True, fmt='g')
    else:
        sns.heatmap(cf_matrix, annot=True, fmt='g', cmap='Blues')
    plt.show()

    # ROC curve
    fpr, tpr, thresholds = roc_curve(y_test, y_score)
    plot_roc_curve(fpr, tpr)
    plt.show()

dataMetrics

# TODO: try voting weights, run a grid search
# TODO: repeat the train/test split about 30 times

In [None]:
# Hyperparameters used to rebuild each classifier.
# NOTE(review): presumably copied from the grid-search cells of this notebook —
# only the xgboost values can be checked against a printed output here.
rnd_clf_best_params = dict(
    bootstrap=True,
    max_depth=8,
    max_features='sqrt',
    min_samples_leaf=6,
    min_samples_split=20,
    n_estimators=100,
)
log_clf_best_params = dict(C=1)
svm_clf_best_params = dict(C=1, kernel='rbf')
dtc_clf_best_params = dict(criterion='entropy', max_depth=2, min_samples_leaf=6, min_samples_split=10)
knn_clf_best_params = dict(
    leaf_size=10,
    metric='euclidean',
    n_neighbors=9,
    weights='distance',
)
mpl_clf_best_params = dict(activation='logistic', hidden_layer_sizes=(8,), solver='adam')
gnb_clf_best_params = dict()
# xgboost
xgb_clf_best_params = dict(
    gamma=0,
    learning_rate=0.01,
    max_depth=3,
    n_estimators=100,
    reg_alpha=0.5,
    reg_lambda=1,
    subsample=0.8,
)

In [None]:
# Re-create every classifier with its recorded best hyperparameters
# (GaussianNB has none, so it is built with defaults).
rnd_clf, log_clf, svm_clf, dtc_clf, knn_clf, mpl_clf, gnb_clf = (
    RandomForestClassifier(**rnd_clf_best_params),
    LogisticRegression(**log_clf_best_params),
    SVC(**svm_clf_best_params),
    DecisionTreeClassifier(**dtc_clf_best_params),
    KNeighborsClassifier(**knn_clf_best_params),
    MLPClassifier(**mpl_clf_best_params),
    GaussianNB(),
)

# Unsupervised Learning

## PCA

In [None]:
# Apply PCA to the numeric training features to reduce the dimensionality.
# The input must always be standardised first (done by the column transformer).
# TODO: check whether a few more plots are worthwhile here.
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# A float n_components asks PCA for enough components to reach that share
# of explained variance
pca = PCA(n_components=0.9999)

numeric_train = X_train_transf[v_skew + v_sym]
X_pca = pca.fit(numeric_train)

# Share of variance explained by each component, and its running total
exp_var_pca = pca.explained_variance_ratio_
cum_sum_eigenvalues = exp_var_pca.cumsum()

# Bars: individual share per component; step line: cumulative share
component_idx = range(len(exp_var_pca))
plt.bar(component_idx, exp_var_pca, alpha=0.5, align='center', label='Individual explained variance')
plt.step(component_idx, cum_sum_eigenvalues, where='mid', label='Cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal component index')
plt.legend(loc='best')
plt.tight_layout()
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

# Cross-validate a logistic regression on the first n principal components
# for every possible n, and keep the n with the best mean F1.
pca = PCA()
logreg = LogisticRegression()

numeric_train = X_train_transf[v_skew+v_sym]
n_components = range(1, numeric_train.shape[1] + 1)
scores = []
display(numeric_train.shape[1])

for n in n_components:
    pca.n_components = n
    X_train_pca = pca.fit_transform(numeric_train)
    fold_f1 = cross_val_score(logreg, X_train_pca, y_train, cv=5, scoring='f1')
    score = np.mean(fold_f1)
    scores.append(score)

# Number of components with the highest mean cross-validation score
optimal_n_components = n_components[np.argmax(scores)]
display(scores)
#display(n_components)

# Leave the PCA object configured with the selected number of components
# (it is not refitted in this cell)
pca.n_components = optimal_n_components
display(optimal_n_components)

In [None]:
# Refit PCA with the selected number of components on the numeric training
# features, then project train and test onto those components.
pca = PCA(n_components=optimal_n_components)

X_pca = pca.fit(X_train_transf[v_skew+v_sym])

pca_cols = ['pca_v' + str(i + 1) for i in range(X_pca.n_components_)]


def _pca_frame(frame):
    """Project the numeric columns of `frame` and keep its row index."""
    return pd.DataFrame(pca.transform(frame[v_skew+v_sym]), columns=pca_cols, index=frame.index)


X_train_num_pca = _pca_frame(X_train_transf)

X_test_num_pca = _pca_frame(X_test_transf)

In [None]:
import seaborn as sns

# Training PCA scores with the target column appended (rows aligned on the index)
df_pca = pd.concat([X_train_num_pca, y_train.to_frame()], axis=1)
#display(df_pca.head(5))

# First two components, coloured by the Default flag
sns.scatterplot(data=df_pca, x='pca_v1', y='pca_v2', hue='Default');



In [None]:
# Components 6 and 7, coloured by the Default flag
# NOTE(review): requires at least 7 retained components — confirm against optimal_n_components.
sns.scatterplot(data=df_pca, x='pca_v6', y='pca_v7', hue='Default');

In [None]:
# Correlation matrix of the principal components and the target
corr = df_pca.corr()
# Last row of the matrix (df_pca's last column is the target), 2 decimals
target_corr = corr.iloc[[-1], :]
display(target_corr.round(2))

# Heatmap of the full correlation matrix
sns.heatmap(corr, cmap="BuPu", xticklabels=corr.columns, yticklabels=corr.columns);

In [None]:
# Build X with PCA: principal components of the numeric features followed by
# the one-hot columns.
# The numeric columns come first in the transformed frames, so everything from
# position len(v_skew + v_sym) onwards is the one-hot block. A forward
# positional slice is used because a negative offset of 0 would select ALL columns.
n_numeric = len(v_skew + v_sym)

X_train_pca = pd.concat(
    [X_train_num_pca, X_train_transf.iloc[:, n_numeric:]],
    axis=1)

X_test_pca = pd.concat(
    [X_test_num_pca, X_test_transf.iloc[:, n_numeric:]],
    axis=1)

## Clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Scan k = 1..14: record the inertia (sum of squared errors) for every k and
# the silhouette score for every k above 1.
sse = []
silhscores = []
k_rng = range(1, 15)
for k in k_rng:
    km = KMeans(n_clusters=k, n_init=10)
    km.fit(X_train_transf)
    sse.append(km.inertia_)
    if k == 1:
        # a silhouette score needs at least two clusters
        continue
    km_pred = km.predict(X_train_transf)
    silhscore = silhouette_score(X_train_transf, km_pred)
    silhscores.append(silhscore)
display(silhscores)
sse

# NOTE: slow cell

In [None]:
# Elbow plot: sum of squared errors against the number of clusters
ax = plt.gca()
ax.set_xlabel('K')
ax.set_ylabel('Sum of squared error')
ax.set_xticks(k_rng)
ax.plot(k_rng, sse, '-o');

In [None]:
# Silhouette score for k = 2..14 (one entry of silhscores per k)
ax = plt.gca()
ax.plot(range(2, 15), silhscores, '-o')
ax.set_xticks(range(2, 15))
ax.set_xlabel('Number of clusters (k)')
ax.set_ylabel('Silhouette score')
plt.show()

In [None]:
# Number of clusters with the highest silhouette score (silhscores starts at k = 2)
best_idx = np.array(silhscores).argmax()
k_silh = range(2, 15)[best_idx]
display(f'The silhouette score has chosen {k_silh} clusters.')
km = KMeans(n_clusters=k_silh, n_init=10)
km_fit = km.fit(X_train_transf)

In [None]:
# Number of training observations assigned to each cluster
cluster = km_fit.predict(X_train_transf)
unique, counts = np.unique(cluster, return_counts=True)
cluster_sizes = np.column_stack((unique, counts))
display(pd.DataFrame(cluster_sizes, columns=['cluster', 'no of obs']))

#km_fit.cluster_centers_

In [None]:
# Transformed training features with one indicator column per k-means cluster.
# `sparse_output` is the dense-output keyword already used for the encoder in
# the column transformer of this notebook; the older `sparse` keyword is not
# accepted by recent scikit-learn releases.
ohe = OneHotEncoder(sparse_output=False)
cluster_dummies = pd.DataFrame(
    ohe.fit_transform(km_fit.labels_.reshape(-1, 1)),
    columns=['cluster' + str(i) for i in range(0, km_fit.labels_.max() + 1)],
    index=X_train_transf.index,
)
display(pd.concat([X_train_transf, cluster_dummies], axis=1))


In [None]:
# Joint search over the number of k-means clusters (k) and the logistic
# regression regularisation C. For every k the cluster labels are one-hot
# encoded and appended to the features; each C is then scored with 5-fold CV.
#
# The GridSearchCV call that used to open this cell was removed: it referenced
# a `pipeline` that is not defined at this point of the notebook and a
# `kmeans__n_clusters` parameter that no pipeline in the notebook has, so it
# could not run. The loop below performs the search.
from sklearn.model_selection import cross_validate

C = [0.001, 0.01, 0.1, 1, 10, 100]
scor = []   # rows of [k, C, mean CV f1]
for k in range(2, 15):
    km = KMeans(n_clusters=k, n_init=10)
    km_fit = km.fit(X_train_transf)
    # sparse_output matches the encoder in the column transformer
    ohe = OneHotEncoder(sparse_output=False)
    X_cluster = pd.concat([X_train_transf, pd.DataFrame(ohe.fit_transform(km_fit.labels_.reshape(-1, 1)),
                           columns=['cluster' + str(i) for i in range(0, km_fit.labels_.max() + 1)],
                           index=X_train_transf.index)], axis=1)
    for i in C:
        logistic_regression = LogisticRegression(C=i)
        # f1 is stored because the result is reported as f1 in the next cell
        # and every other search in this notebook optimises f1
        cv_result = cross_validate(logistic_regression, X_cluster, y_train, cv=5, scoring='f1')
        scor.append([k, i, cv_result['test_score'].mean()])

In [None]:
# Row of `scor` with the highest score: [k, C, score]
scor_arr = np.array(scor)
best_row = scor_arr[scor_arr[:, 2].argmax(), :]

k_cv = best_row[0].astype(int)
C_cv = best_row[1]
display(C_cv)

print('The best hyperparameters and f1 score are [k C f1]:'+str(best_row))

# Refit k-means with the selected k and append one indicator column per cluster
km = KMeans(n_clusters=k_cv, n_init=10)
km_fit = km.fit(X_train_transf)
# sparse_output is the dense-output keyword already used in the column
# transformer of this notebook; `sparse` is rejected by recent scikit-learn
ohe = OneHotEncoder(sparse_output=False)
X_cluster = pd.concat([X_train_transf, pd.DataFrame(ohe.fit_transform(km_fit.labels_.reshape(-1, 1)),
                       columns=['cluster' + str(i) for i in range(0, km_fit.labels_.max() + 1)],
                       index=X_train_transf.index)], axis=1)
display(X_cluster.head(5))

# Number of training observations per cluster
cluster = km_fit.predict(X_train_transf)
unique, counts = np.unique(cluster, return_counts=True)
display(pd.DataFrame(np.asarray((unique, counts)).T, columns=['cluster', 'no of obs']))

# Models hyperparameter tuning

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Tune the regularisation parameter C of a logistic regression with 5-fold
# cross-validation, optimising F1.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
model = LogisticRegression()

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='f1', cv=5)
grid_search.fit(X_train_transf, y_train)

# Best parameters and the best mean CV score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.2f}")

# Predict the test set with the refitted best model
y_pred = grid_search.predict(X_test_transf)

## SVM

In [None]:
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Support vector classifier; max_iter caps the solver iterations
#model = LinearSVC(loss='hinge',max_iter=10000)
model = SVC(max_iter=10000)
# Search over C and the kernel type
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'sigmoid'],
}

# 5-fold CV optimising F1, using all available cores
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='f1', cv=5, n_jobs=-1)
grid_search.fit(X_train_transf, y_train)

# Best parameters and the best mean CV score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.2f}")

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
# Decision tree search space: depth, leaf/split sizes and split criterion
param_grid = dict(
    max_depth=[2, 3, 5, 8, 13, 21],
    min_samples_leaf=[6, 8],
    min_samples_split=[10, 20],
    criterion=['gini', 'entropy'],
)

tree_clf = DecisionTreeClassifier()

# 5-fold CV optimising F1
grid_search = GridSearchCV(tree_clf, param_grid, scoring='f1', cv=5)

# fit() returns the fitted search object itself
tree_clf_cv = grid_search.fit(X_train_transf, y_train)

In [None]:
# Best CV score and parameters; build a fresh, unfitted tree with them
best_tree_params = tree_clf_cv.best_params_
display(tree_clf_cv.best_score_)
print(best_tree_params)
best_tree = DecisionTreeClassifier(**best_tree_params)
best_tree

In [None]:
from sklearn.tree import plot_tree
# Fit the tuned tree on the full training set, then draw it.
# The graphviz export cell below relies on this fit.
best_tree.fit(X_train_transf, y_train)
plot_tree(best_tree);

In [None]:
import graphviz
from sklearn.tree import export_graphviz

# Export the fitted tree to DOT and render it; nodes are colour-filled
dot_data = export_graphviz(
    best_tree,
    filled=True,
    class_names=['no default', 'default'],
    feature_names=X_train_transf.columns,
)

graph = graphviz.Source(dot_data)
graph

## Naive Bayes

In [None]:
# No model hyperparameter is tuned here because the features already have
# normalised variance; at most the pre-processing (number of bins) could be tuned.
from sklearn.preprocessing import KBinsDiscretizer
est = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
# Fit the bins on the transformed training set ...
est.fit(X_train_transf)
Xt_train = est.transform(X_train_transf)
# ... and bin the *transformed* test set with them. The raw X_test still holds
# string categories and different columns from what the discretizer was fitted on.
Xt_test = est.transform(X_test_transf)

from sklearn.naive_bayes import MultinomialNB
mnNB = MultinomialNB()
mnNB.fit(Xt_train, y_train)
y_pred = mnNB.predict(Xt_test)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

# Pipeline: quantile binning (ordinal codes) followed by multinomial naive Bayes
pipeline = Pipeline(steps=[
    ('discretizer', KBinsDiscretizer(encode='ordinal', strategy='quantile')),
    ('classifier', MultinomialNB()),
])

# Only the number of bins is tuned
param_grid = {'discretizer__n_bins': [3, 5, 7, 9]}

# 5-fold CV optimising F1, all cores, progress printed
grid_search = GridSearchCV(pipeline, param_grid, scoring='f1', cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train_transf, y_train)

# Best parameters
print(grid_search.best_params_)

# Best pipeline, refitted on the whole training set
best_model = grid_search.best_estimator_

display(best_model)

# Accuracy of the best pipeline on the test data
accuracy = best_model.score(X_test_transf, y_test)
print('Test accuracy:', accuracy)


In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes on the transformed features
gNB = GaussianNB()
gNB.fit(X_train_transf, y_train)
# Predict on the transformed test set: the model was fitted on X_train_transf,
# while the raw X_test has different columns and string categories.
y_pred = gNB.predict(X_test_transf)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest search space.
# max_features: 'auto' was dropped. For a classifier it meant the same as
# 'sqrt', so it only duplicated every candidate, and recent scikit-learn
# releases reject it.
param_grid = {'n_estimators': [int(x) for x in np.linspace(10, 100, 4)],
              'max_features': ['sqrt'],
              'max_depth': [2, 3, 5, 8, 13, 21],
              'min_samples_leaf': [6, 8],
              'min_samples_split': [10, 20],
              'bootstrap': [True, False]
}

rf_clf = RandomForestClassifier()

# 5-fold CV optimising F1, using all cores
rf_clf_gs = GridSearchCV(estimator=rf_clf, param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1)

# fit() returns the fitted search object itself
rf_clf_cv = rf_clf_gs.fit(X_train_transf, y_train)

In [None]:
# Best CV score and parameters; build a fresh, unfitted forest with them
best_rf_params = rf_clf_cv.best_params_
display(rf_clf_cv.best_score_)
display(best_rf_params)
best_rf = RandomForestClassifier(**best_rf_params)
best_rf

## Kneighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-NN search space: neighbourhood size, vote weighting, distance metric, leaf size
knn_param_grid = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
    leaf_size=[10, 30, 50, 70],
)

knn = KNeighborsClassifier()

# 5-fold CV optimising F1, using all cores
knn_gs = GridSearchCV(knn, knn_param_grid, scoring='f1', cv=5, n_jobs=-1)

# fit() returns the fitted search object itself
knn_clf_cv = knn_gs.fit(X_train_transf, y_train)

In [None]:
# Best CV score and parameters; build a fresh, unfitted k-NN model with them
best_knn_params = knn_clf_cv.best_params_
display(knn_clf_cv.best_score_)
display(best_knn_params)
best_knn = KNeighborsClassifier(**best_knn_params)
best_knn

## Xgboost

In [15]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
# Search space for the XGBoost model
xgbo_param_grid = {
    'max_depth': [3 ,5, 8],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [50,100],
    'gamma': [0, 0.5, 1],
    'subsample': [0.5, 0.8, 1.0],
    'reg_alpha': [0, 0.5, 1],
    'reg_lambda': [ 0.5, 1]
}

# Create the XGBoost classifier. The former `verbose=10` constructor argument
# was dropped: the recorded run reports 'Parameters: { "verbose" } are not used.'
xgbo = xgb.XGBClassifier(n_jobs=-1)

# 5-fold CV optimising F1; progress is reported by the grid search itself
xgbo_gs = GridSearchCV(estimator=xgbo, param_grid=xgbo_param_grid, cv=5,scoring='f1',n_jobs=-1,verbose=10)

# Fit the grid search to the training data (verbose is forwarded to the estimator's fit)
xgbo_clf_cv=xgbo_gs.fit(X_train_transf, y_train,verbose=10)

Fitting 5 folds for each of 648 candidates, totalling 3240 fits
Parameters: { "verbose" } are not used.



In [16]:
# Best CV score and parameters; build a fresh, unfitted XGBoost model with them
best_xgbo_params = xgbo_clf_cv.best_params_
display(xgbo_clf_cv.best_score_)
display(best_xgbo_params)
best_xgbo = xgb.XGBClassifier(**best_xgbo_params)
best_xgbo

0.7523584628308703

{'gamma': 0,
 'learning_rate': 0.01,
 'max_depth': 3,
 'n_estimators': 100,
 'reg_alpha': 0.5,
 'reg_lambda': 1,
 'subsample': 0.8}

## Redes neuronais

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV


# MLP search space: one hidden layer of varying width, solver and activation
mlp_param_grid = dict(
    hidden_layer_sizes=[(6,), (8,), (5,), (10,), (20,), (30,), (40,)],
    solver=['adam', 'sgd'],
    activation=['relu', 'tanh', 'logistic'],
)

# max_iter is raised to 2000 iterations for every candidate
mlp = MLPClassifier(max_iter=2000)

# 5-fold CV optimising F1, using all cores
mlp_gs = GridSearchCV(mlp, mlp_param_grid, scoring='f1', cv=5, n_jobs=-1)

# fit() returns the fitted search object itself
mlp_clf_cv = mlp_gs.fit(X_train_transf, y_train)

In [None]:
# Best CV score and parameters
display(mlp_clf_cv.best_score_)
best_mlp_params = mlp_clf_cv.best_params_
display(best_mlp_params)
# Rebuild with the same max_iter=2000 the search used. best_params_ only holds
# the searched keys, so without it the model falls back to the default iteration limit.
best_mlp = MLPClassifier(max_iter=2000, **best_mlp_params)
best_mlp

# Combinação de modelos

In [None]:
# Fit different models and evaluate their test accuracy.
# The transformed frames are used throughout: the raw X_train / X_test still
# contain string categories, which these estimators cannot fit.
from sklearn.ensemble import StackingClassifier

models = [
    LogisticRegression(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    SVC(),
    MLPClassifier()
]

for model in models:
    model.fit(X_train_transf, y_train)
    accuracy = model.score(X_test_transf, y_test)
    print(f"{model.__class__.__name__}: {accuracy:.2f}")

# STACKING
# Base models. StackingClassifier expects (name, estimator) pairs, not bare estimators.
base_models = [
    ('lr', LogisticRegression()),
    ('dtc', DecisionTreeClassifier()),
    ('rf', RandomForestClassifier()),
    ('svm', SVC()),
    ('mlp', MLPClassifier())
]

# Second-level model trained on the base models' cross-validated predictions
meta_model = LogisticRegression()

# Define the stacking model
stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)

# Fit the stacking model
stacking_model.fit(X_train_transf, y_train)

# Make predictions on the test set
predictions = stacking_model.predict(X_test_transf)

# Evaluate the model
accuracy = stacking_model.score(X_test_transf, y_test)
print("Accuracy:", accuracy)
