In [1]:
import pandas as pd

In [2]:
# Carregando base de dados

df = pd.read_excel("Bank_Personal_Loan_Modelling.xlsx", sheet_name='Data')

In [3]:
df.head()

Unnamed: 0,ID,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal_Loan,Securities_Account,CD_Account,Online,CreditCard
0,1,25,1,49,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,4,1.0,2,0,0,0,0,0,1


In [4]:
# Through the analysis of variables and the dictionary, some relevant information:


# Categorical variables:
# Personal Loan - Did this customer accept the personal loan offered in the last campaign? This is our target variable
# Securities Account - Does the customer have a securities account at the bank?
# CD Account - Does the customer have a certificate of deposit (CD) account at the bank?
# Online - Does the customer use Internet banking services?
# Credit Card - Does the customer use a credit card issued by UniversalBank?

# Continuous variables:

# Age Age of the customer
# Experience - years of experience
# Income - Annual income in dollars
# CCAvg (CCAvg) - average credit card spending
# Mortage - Mortgage value of the house

# Ordinal categorical variables:
# Family - size of the customer's family
# Education - customer education level


# The ID variable does not add any interesting information.
# There is no association between a person's customer ID and the loan, it also does not provide any general conclusion for potential future loan customers.
# We can disregard this information for the model.
# Zip code can also be disregarded, since the distance information between zip codes may not be true


In [5]:
#Deleting non-useful variables

df = df.drop(columns=["ID"])

In [6]:
# Basic statistics of variables

df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,5000.0,45.3384,11.463166,23.0,35.0,45.0,55.0,67.0
Experience,5000.0,20.1046,11.467954,-3.0,10.0,20.0,30.0,43.0
Income,5000.0,73.7742,46.033729,8.0,39.0,64.0,98.0,224.0
Family,5000.0,2.3964,1.147663,1.0,1.0,2.0,3.0,4.0
CCAvg,5000.0,1.937913,1.747666,0.0,0.7,1.5,2.5,10.0
Education,5000.0,1.881,0.839869,1.0,1.0,2.0,3.0,3.0
Mortgage,5000.0,56.4988,101.713802,0.0,0.0,0.0,101.0,635.0
Personal_Loan,5000.0,0.096,0.294621,0.0,0.0,0.0,0.0,1.0
Securities_Account,5000.0,0.1044,0.305809,0.0,0.0,0.0,0.0,1.0
CD_Account,5000.0,0.0604,0.23825,0.0,0.0,0.0,0.0,1.0


In [7]:
# Distribution of the target variable
df["Personal_Loan"].value_counts()

Personal_Loan
0    4520
1     480
Name: count, dtype: int64

In [8]:
# Dividing predictors (X) and target variable (Y)

X = df.drop(columns=["Personal_Loan"])
Y = df["Personal_Loan"]

In [9]:
# Dividing the base into training and testing

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=7)

print(f"Shape X_train: {X_train.shape}")
print(f"Shape y_train: {y_train.shape}")
print(f"Shape X_test: {X_test.shape}")
print(f"Shape y_test: {y_test.shape}")

Shape X_train: (4000, 11)
Shape y_train: (4000,)
Shape X_test: (1000, 11)
Shape y_test: (1000,)


In [10]:
# Normalization of variables

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Transform to dataframe for visualization
X_train = pd.DataFrame(X_train_scaled,columns = X_train.columns)
X_test = pd.DataFrame(X_test_scaled,columns = X_test.columns)

In [11]:
# Classificador Multi Layer Perceptron

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

clf = MLPClassifier( hidden_layer_sizes=(100, 50, 20),activation='logistic')
clf.fit(X_train,y_train)



MLPClassifier(activation='logistic', hidden_layer_sizes=(100, 50, 20))

In [12]:
y_pred = clf.predict(X_test)
print(f"Acurácia: {accuracy_score(y_test, y_pred)*100}%")
print(f"F1 Score: {f1_score(y_test, y_pred)*100}%")

Acurácia: 98.0%
