In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

In [2]:
data_dict = pd.read_csv('data_dictionary.csv')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
df = [train, test]

In [3]:
data_dict

Unnamed: 0,Variable Name,Description
0,customer_id,Identifier for customers
1,customer_bod,Date of birth of the customer
2,gender,Gender of the customer
3,phone_flag,If Mobile no. was shared by the customer then ...
4,student,If customer a student then labeled Yes
5,employment,Employment Type of the customer (Salaried/Self...
6,credit_card,If customer has credit cards then flagged as 1
7,balance,Total principal outstanding amount of the acti...
8,income,Estimated income of customer
9,tenure,Loan tenure


In [4]:
train.head()

Unnamed: 0,customer_id,customer_bod,gender,phone_flag,student,employment,credit_card,balance,income,tenure,default
0,8300,1993-08-17,Female,1.0,No,Self Employed,1.0,87104.12,5015120.75,4yrs 4mon,0
1,672,2007-12-17,Female,1.0,Yes,,0.0,89236.34,2266076.58,4yrs 1mon,0
2,5670,2000-02-05,Female,1.0,Yes,,0.0,171553.12,1779347.34,0yrs 9mon,0
3,2975,1999-11-16,Female,1.0,Yes,,0.0,85979.04,2014246.24,1yrs 8mon,0
4,3883,1977-08-18,Male,1.0,No,Salaried,0.0,48874.77,5445148.31,0yrs 10mon,0


In [5]:
test.head()

Unnamed: 0,customer_id,customer_bod,gender,phone_flag,student,employment,credit_card,balance,income,tenure
0,9365,1999-10-22,Male,0.0,No,Salaried,1.0,0.0,4430744.15,0yrs 0mon
1,999,1987-05-03,Female,0.0,No,Salaried,0.0,67431.4,3743149.53,0yrs 7mon
2,2835,2000-10-20,Male,1.0,No,Salaried,1.0,69128.28,4821579.61,1yrs 11mon
3,5821,1994-07-13,Male,1.0,No,Salaried,0.0,151051.78,3159764.69,0yrs 10mon
4,2330,1996-10-03,Male,0.0,No,Self Employed,0.0,92727.85,5659353.2,1yrs 3mon


In [29]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2471 entries, 0 to 3691
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   customer_id   2471 non-null   int64  
 1   gender        2471 non-null   float64
 2   phone_flag    2471 non-null   float64
 3   student       2471 non-null   int64  
 4   employment    2471 non-null   float64
 5   credit_card   2471 non-null   float64
 6   balance       2471 non-null   float64
 7   income        2471 non-null   float64
 8   default       2471 non-null   int64  
 9   age           2471 non-null   int64  
 10  tenure_year   2471 non-null   object 
 11  tenure_month  2471 non-null   object 
dtypes: float64(6), int64(4), object(2)
memory usage: 251.0+ KB


In [26]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307 entries, 0 to 306
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   customer_id   307 non-null    int64  
 1   gender        293 non-null    float64
 2   phone_flag    307 non-null    float64
 3   student       307 non-null    int64  
 4   employment    222 non-null    float64
 5   credit_card   305 non-null    float64
 6   balance       307 non-null    float64
 7   income        307 non-null    float64
 8   age           307 non-null    int64  
 9   tenure_year   307 non-null    object 
 10  tenure_month  307 non-null    object 
dtypes: float64(6), int64(3), object(2)
memory usage: 26.5+ KB


In [8]:
# Preprocessing
sex = {'Female':1, 'Male':0}
for file in df:
    file['gender'] = file['gender'].map(sex)

In [9]:
student={'Yes':1, 'No':0}
for file in df:
    file['student'] = file['student'].map(student)

In [10]:
employ = {'Salaried':1, 'Self Employed':0}
for file in df:
    file['employment'] = file['employment'].map(employ)

In [11]:
train = train.dropna()

In [12]:
from datetime import datetime, date
year = date.today().year

train['customer_bod'] = pd.to_datetime(train['customer_bod'], format="%Y-%m-%d")
test['customer_bod'] = pd.to_datetime(test['customer_bod'], format="%Y-%m-%d")

In [13]:
train['age'] = year - train['customer_bod'].dt.year
test['age'] = year - test['customer_bod'].dt.year

In [14]:
tenure_year = []
tenure_month = []
for tenure in train['tenure']:
    tenure_year.append(tenure[0])
    tenure_month.append(tenure[5])

In [16]:
train['tenure_year'] = tenure_year
train['tenure_month'] = tenure_month

In [17]:
tenure_year_test = []
tenure_month_test = []
for tenure in test['tenure']:
    tenure_year_test.append(tenure[0])
    tenure_month_test.append(tenure[5])
    
test['tenure_year'] = tenure_year_test
test['tenure_month'] = tenure_month_test

In [18]:
train = train.drop(['customer_bod', 'tenure'], axis=1)
test = test.drop(['customer_bod', 'tenure'], axis=1)

In [19]:
# modelling
import sklearn
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [20]:
X = train.drop(['default', 'customer_id'], axis=1)
y = train['default']

In [21]:
X_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.3) 

In [22]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

DecisionTreeClassifier()

In [41]:
y_hasil = model.predict(x_val)
print(classification_report(y_val, y_hasil))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97       695
           1       0.59      0.57      0.58        47

    accuracy                           0.95       742
   macro avg       0.78      0.77      0.78       742
weighted avg       0.95      0.95      0.95       742



In [24]:
features = []
for data in train:
    if data != 'default' and data != 'customer_id' :
        features.append(data)

In [25]:
print(export_text(model, feature_names=features))

|--- balance <= 197301.04
|   |--- balance <= 175814.23
|   |   |--- age <= 57.50
|   |   |   |--- balance <= 156159.56
|   |   |   |   |--- balance <= 123149.60
|   |   |   |   |   |--- tenure_month <= 7.50
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- tenure_month >  7.50
|   |   |   |   |   |   |--- income <= 5524856.00
|   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |--- income >  5524856.00
|   |   |   |   |   |   |   |--- income <= 5546797.50
|   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |   |--- income >  5546797.50
|   |   |   |   |   |   |   |   |--- balance <= 93537.85
|   |   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |   |--- balance >  93537.85
|   |   |   |   |   |   |   |   |   |--- balance <= 94542.83
|   |   |   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |   |   |   |--- balance >  94542.83
|   |   |   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |-

In [34]:
# Classification
from sklearn.naive_bayes import GaussianNB

test = test.drop(['customer_id', 'phone_flag', 'student'], axis=1)
test = test.dropna()

In [30]:
x_train = train.drop(['customer_id', 'phone_flag', 'student', 'default'], axis=1)
y_new_train = train['default']

In [31]:
model_nb = GaussianNB()
model_nb.fit(x_train, y_new_train)

GaussianNB()

In [35]:
final_score = model_nb.predict(test)

In [36]:
print(final_score)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 1 1 1 0 1
 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0
 1 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0
 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1]
