In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#Read dataset files
test_df = pd.read_csv('dataset/test.csv')
train_df = pd.read_csv('dataset/train.csv')

In [3]:
#See what the dataset is like
train_df.head()

Unnamed: 0,customer_id,name,age,gender,owns_car,owns_house,no_of_children,net_yearly_income,no_of_days_employed,occupation_type,total_family_members,migrant_worker,yearly_debt_payments,credit_limit,credit_limit_used(%),credit_score,prev_defaults,default_in_last_6months,credit_card_default
0,CST_115179,ita Bose,46,F,N,Y,0.0,107934.04,612.0,Unknown,1.0,1.0,33070.28,18690.93,73,544.0,2,1,1
1,CST_121920,Alper Jonathan,29,M,N,Y,0.0,109862.62,2771.0,Laborers,2.0,0.0,15329.53,37745.19,52,857.0,0,0,0
2,CST_109330,Umesh Desai,37,M,N,Y,0.0,230153.17,204.0,Laborers,2.0,0.0,48416.6,41598.36,43,650.0,0,0,0
3,CST_128288,Rie,39,F,N,Y,0.0,122325.82,11941.0,Core staff,2.0,0.0,22574.36,32627.76,20,754.0,0,0,0
4,CST_151355,McCool,46,M,Y,Y,0.0,387286.0,1459.0,Core staff,1.0,0.0,38282.95,52950.64,75,927.0,0,0,0


In [4]:
#Drop columns we don't need
test_df = test_df.drop(['customer_id', 'name', 'no_of_days_employed'], axis=1)
train_df = train_df.drop(['customer_id', 'name', 'no_of_days_employed'], axis=1)

In [5]:
#Drop rows with missing values
test_df = test_df.dropna(how='any')
train_df = train_df.dropna(how='any')

In [6]:
#Drop row with invalid gender
test_df = test_df.loc[test_df["gender"] != 'XNA']
train_df = train_df.loc[train_df["gender"] != 'XNA']

In [7]:
#Convert gender, owns_car, owns_house, occupation_type to binary encoding
test_df = pd.get_dummies(test_df, columns=["gender", "owns_car", "owns_house", "occupation_type"])
train_df = pd.get_dummies(train_df, columns=["gender", "owns_car", "owns_house", "occupation_type"])

In [8]:
test_df.head()

Unnamed: 0,age,no_of_children,net_yearly_income,total_family_members,migrant_worker,yearly_debt_payments,credit_limit,credit_limit_used(%),credit_score,prev_defaults,...,occupation_type_Low-skill Laborers,occupation_type_Managers,occupation_type_Medicine staff,occupation_type_Private service staff,occupation_type_Realty agents,occupation_type_Sales staff,occupation_type_Secretaries,occupation_type_Security staff,occupation_type_Unknown,occupation_type_Waiters/barmen staff
0,52,0.0,232640.53,2.0,0.0,14406.73,26524.4,4,779.0,0,...,0,0,0,0,0,0,0,0,1,0
1,48,1.0,284396.79,3.0,0.0,57479.99,68998.72,70,806.0,0,...,0,0,0,0,0,0,0,0,1,0
2,50,1.0,149419.28,3.0,0.0,21611.01,25187.8,71,528.0,2,...,0,0,0,0,0,0,0,0,1,0
3,30,1.0,160437.54,2.0,1.0,28990.76,29179.39,9,815.0,0,...,0,0,0,0,0,0,0,0,0,0
4,52,0.0,233480.37,2.0,1.0,54213.72,82331.82,82,613.0,1,...,0,0,0,0,0,0,0,0,0,0


In [9]:
train_df.head()

Unnamed: 0,age,no_of_children,net_yearly_income,total_family_members,migrant_worker,yearly_debt_payments,credit_limit,credit_limit_used(%),credit_score,prev_defaults,...,occupation_type_Low-skill Laborers,occupation_type_Managers,occupation_type_Medicine staff,occupation_type_Private service staff,occupation_type_Realty agents,occupation_type_Sales staff,occupation_type_Secretaries,occupation_type_Security staff,occupation_type_Unknown,occupation_type_Waiters/barmen staff
0,46,0.0,107934.04,1.0,1.0,33070.28,18690.93,73,544.0,2,...,0,0,0,0,0,0,0,0,1,0
1,29,0.0,109862.62,2.0,0.0,15329.53,37745.19,52,857.0,0,...,0,0,0,0,0,0,0,0,0,0
2,37,0.0,230153.17,2.0,0.0,48416.6,41598.36,43,650.0,0,...,0,0,0,0,0,0,0,0,0,0
3,39,0.0,122325.82,2.0,0.0,22574.36,32627.76,20,754.0,0,...,0,0,0,0,0,0,0,0,0,0
4,46,0.0,387286.0,1.0,0.0,38282.95,52950.64,75,927.0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
#Going to work with just the train_df dataset since it has over 40,000 entries
X = train_df.drop('credit_card_default', axis=1)
y = train_df['credit_card_default']
target_names = ["no-default", "default"]

print(X.shape, y.shape)

(43952, 36) (43952,)


In [11]:
#Split the dataset
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

X_train.head()

Unnamed: 0,age,no_of_children,net_yearly_income,total_family_members,migrant_worker,yearly_debt_payments,credit_limit,credit_limit_used(%),credit_score,prev_defaults,...,occupation_type_Low-skill Laborers,occupation_type_Managers,occupation_type_Medicine staff,occupation_type_Private service staff,occupation_type_Realty agents,occupation_type_Sales staff,occupation_type_Secretaries,occupation_type_Security staff,occupation_type_Unknown,occupation_type_Waiters/barmen staff
25584,52,1.0,175426.26,3.0,1.0,28569.67,27957.03,28,825.0,0,...,0,0,0,0,0,0,0,0,0,0
14164,27,0.0,82421.9,2.0,0.0,26132.81,10951.73,39,737.0,0,...,0,0,0,0,0,0,0,0,0,0
15426,32,2.0,1041396.95,4.0,0.0,35180.06,225400.02,98,594.0,1,...,0,1,0,0,0,0,0,0,0,0
30826,40,1.0,308043.54,2.0,0.0,49684.42,55956.78,18,700.0,0,...,0,0,0,0,0,0,0,0,0,0
25773,25,0.0,141208.42,2.0,0.0,13441.83,32545.25,3,828.0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Scale the X_train data
from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler().fit(X_train)

# Transform the X_train and X_test
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [13]:
#SVM Model
from sklearn.svm import SVC
model = SVC(kernel='linear')
model.fit(X_train_scaled, y_train)

SVC(kernel='linear')

In [14]:
# Model Accuracy
print('Test Acc: %.3f' % model.score(X_test_scaled, y_test))

Test Acc: 0.980


In [15]:
 # Calculate classification report
from sklearn.metrics import classification_report
predictions = model.predict(X_test_scaled)
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

  no-default       0.98      1.00      0.99     10092
     default       1.00      0.76      0.86       896

    accuracy                           0.98     10988
   macro avg       0.99      0.88      0.93     10988
weighted avg       0.98      0.98      0.98     10988

