<a href="https://colab.research.google.com/github/ckaarle/class/blob/training/preprocessing/credit_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [178]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer

import matplotlib.pyplot as plt

import io

# Data

In [179]:
from google.colab import files

In [180]:
# Opens the Colab upload prompt. The following cells read the entries
# 'german_credit_data.csv' and 'german.data' from the returned mapping.
uploaded = files.upload()

Saving german.data to german (1).data
Saving german_credit_data.csv to german_credit_data (1).csv


In [181]:
# Feature table; source: https://www.kaggle.com/uciml/german-credit
credit_csv_text = uploaded['german_credit_data.csv'].decode('utf-8')
df = pd.read_csv(io.StringIO(credit_csv_text))

In [182]:
# The label file is parsed as space-separated text without a header row,
# so its columns are labelled by position (0, 1, 2, ...).
labels_text = uploaded['german.data'].decode('utf-8')
labels = pd.read_csv(io.StringIO(labels_text), sep=' ', header=None)

In [183]:
# Keep only the column at position 20 (the last one), which contains the labels.
labels = labels.iloc[:, 20]

In [184]:
# Append the label Series to the feature table as an extra column (axis=1).
df = pd.concat([df, labels], axis=1)

In [185]:
# The appended label column is still called 20; give it a readable name.
df = df.rename(columns={20: 'Risk'})

In [186]:
# Source labels are 1 (good risk) and 2 (bad risk).
# Shift them down by one so that 0 means "no risk" and 1 means "risk".
df.loc[:, 'Risk'] = df['Risk'] - 1

## Clean

In [187]:
# 'Unnamed: 0' is removed from the feature table before exploration.
df = df.drop(columns=['Unnamed: 0'])

## Explore

In [188]:
# Preview the first rows of the combined feature/label table.
df.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,0
1,22,female,2,own,little,moderate,5951,48,radio/TV,1
2,49,male,1,own,little,,2096,12,education,0
3,45,male,2,free,little,little,7882,42,furniture/equipment,0
4,53,male,2,free,little,little,4870,24,car,1


In [189]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
df.describe()

Unnamed: 0,Age,Job,Credit amount,Duration,Risk
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,35.546,1.904,3271.258,20.903,0.3
std,11.375469,0.653614,2822.736876,12.058814,0.458487
min,19.0,0.0,250.0,4.0,0.0
25%,27.0,2.0,1365.5,12.0,0.0
50%,33.0,2.0,2319.5,18.0,0.0
75%,42.0,2.0,3972.25,24.0,1.0
max,75.0,3.0,18424.0,72.0,1.0


In [190]:
def age_category(age):
  """Map an age to the label of its age bucket.

  Buckets are '18-30', '31-40', '41-50', '51-60' and '61-80'; each upper
  bound is inclusive, and the first bucket also includes 18.

  Args:
    age: Numeric age.

  Returns:
    The bucket label as a string, or None when the age is below 18 or
    above 80.
  """
  # Ages outside the covered range have no bucket.
  if not 18 <= age <= 80:
    return None

  # Buckets are ordered by ascending upper bound, so the first match wins.
  buckets = (
    (30, '18-30'),
    (40, '31-40'),
    (50, '41-50'),
    (60, '51-60'),
    (80, '61-80'),
  )
  for upper_bound, label in buckets:
    if age <= upper_bound:
      return label

In [191]:
# Replace the numeric age with its bucket label so it can be one-hot encoded.
df['Age'] = df['Age'].map(age_category)

## Encoding

In [192]:
def one_hot_encode(df, features_to_encode, prefixes):
  """Replace categorical columns with one-hot indicator columns.

  Each feature is expanded with `pd.get_dummies` using the prefix at the
  same position in `prefixes`. The original feature columns are removed and
  the indicator columns are appended after the remaining columns, in the
  order the features were given. The input frame is not modified.

  Missing values get no indicator column of their own (the `get_dummies`
  default), so such rows are all-zero for that feature.

  Args:
    df: DataFrame containing every column named in `features_to_encode`.
    features_to_encode: Column names to expand.
    prefixes: Prefix for the indicator columns of each feature; must have
      the same length as `features_to_encode`.

  Returns:
    A new DataFrame with the encoded columns.

  Raises:
    ValueError: If `features_to_encode` and `prefixes` differ in length.
    KeyError: If a feature is not a column of `df`.
  """
  features_to_encode = list(features_to_encode)
  prefixes = list(prefixes)

  # zip() would silently stop at the shorter list and leave features unencoded.
  if len(features_to_encode) != len(prefixes):
    raise ValueError(
      'features_to_encode and prefixes must have the same length, got '
      f'{len(features_to_encode)} and {len(prefixes)}'
    )

  dummy_frames = [
    pd.get_dummies(df[feature], prefix=prefix)
    for feature, prefix in zip(features_to_encode, prefixes)
  ]

  # Concatenate once instead of growing the frame inside the loop.
  remaining = df.drop(columns=features_to_encode)
  return pd.concat([remaining, *dummy_frames], axis=1)

In [193]:
# Column to encode -> prefix used for its indicator columns.
encoding_prefixes = {
  'Sex': 'Sex',
  'Housing': 'Housing',
  'Saving accounts': 'Savings',
  'Checking account': 'Checking',
  'Purpose': 'Purpose',
  'Age': 'Age',
}
df = one_hot_encode(df, list(encoding_prefixes.keys()), list(encoding_prefixes.values()))

## Analysis

In [194]:
# Pairwise correlations between all columns of the encoded table.
df.corr()

Unnamed: 0,Job,Credit amount,Duration,Risk,Sex_female,Sex_male,Housing_free,Housing_own,Housing_rent,Savings_little,Savings_moderate,Savings_quite rich,Savings_rich,Checking_little,Checking_moderate,Checking_rich,Purpose_business,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others,Age_18-30,Age_31-40,Age_41-50,Age_51-60,Age_61-80
Job,1.0,0.285385,0.21091,0.032735,-0.070298,0.070298,0.134972,-0.059393,-0.039195,-0.00035,0.004472,-0.037499,-0.017128,-0.026413,-0.004059,-0.050099,0.001614,0.036762,-0.025979,-0.021672,0.013422,-0.027683,-0.092751,0.100544,-0.032804,0.060106,-0.035585,0.027532,-0.027173
Credit amount,0.285385,1.0,0.624984,0.154739,-0.093482,0.093482,0.201643,-0.117497,-0.024611,-0.036443,0.013546,-0.064256,-0.055542,-0.020912,0.119612,-0.10051,0.103016,0.125575,-0.069268,-0.034796,-0.034037,-0.173203,-0.028875,0.192893,-0.028669,0.006999,0.026773,0.013483,-0.011479
Duration,0.21091,0.624984,1.0,0.214927,-0.081432,0.081432,0.189117,-0.075169,-0.064417,-0.047228,0.051587,-0.040257,-0.048261,0.022244,0.089452,-0.076455,0.164113,-0.00532,-0.037212,-0.02545,-0.062804,-0.044319,-0.022549,0.104516,0.014986,0.015817,-0.012277,0.007117,-0.057887
Risk,0.032735,0.154739,0.214927,1.0,0.075493,-0.075493,0.081556,-0.134589,0.092785,0.161007,0.022255,-0.070954,-0.085749,0.258333,0.119581,-0.044009,0.036129,0.022621,0.008016,0.049085,0.020971,-0.106922,0.020828,0.028058,0.109549,-0.068117,-0.043343,0.005201,-0.036843
Sex_female,-0.070298,-0.093482,-0.081432,0.075493,1.0,-1.0,-0.100872,-0.119638,0.222845,0.031244,0.000498,-0.031414,0.041673,0.014834,0.012726,0.004183,-0.080875,-0.047893,0.045275,0.052397,0.100467,-0.008668,-0.026828,-0.014297,0.239899,-0.161286,-0.117131,0.025079,-0.030769
Sex_male,0.070298,0.093482,0.081432,-0.075493,-1.0,1.0,0.100872,0.119638,-0.222845,-0.031244,-0.000498,0.031414,-0.041673,-0.014834,-0.012726,-0.004183,0.080875,0.047893,-0.045275,-0.052397,-0.100467,0.008668,0.026828,0.014297,-0.239899,0.161286,0.117131,-0.025079,0.030769
Housing_free,0.134972,0.201643,0.189117,0.081556,-0.100872,0.100872,1.0,-0.548445,-0.162474,0.012353,-0.011914,-0.010662,-0.047989,0.067961,-0.000378,0.01586,-0.059613,0.126806,-0.038348,0.117976,-0.07153,-0.109357,0.013706,0.08001,-0.198989,0.006797,0.084261,0.161972,0.110968
Housing_own,-0.059393,-0.117497,-0.075169,-0.134589,-0.119638,0.119638,-0.548445,1.0,-0.735968,0.000276,-0.017738,0.000737,0.028707,-0.125705,0.001012,0.018934,0.051083,-0.099526,0.029317,-0.075684,-0.040496,0.134705,0.019803,-0.011288,-0.054109,0.097107,0.00726,-0.065718,-0.022234
Housing_rent,-0.039195,-0.024611,-0.064417,0.092785,0.222845,-0.222845,-0.162474,-0.735968,1.0,-0.010327,0.030578,0.007762,0.004979,0.09331,-0.000888,-0.035183,-0.012013,0.014773,-0.003546,-0.006211,0.1057,-0.070414,-0.034465,-0.05146,0.224961,-0.120091,-0.076788,-0.053592,-0.063608
Savings_little,-0.00035,-0.036443,-0.047228,0.161007,0.031244,-0.031244,0.012353,0.000276,-0.010327,1.0,-0.417624,-0.319568,-0.276736,0.246438,-0.047045,0.025329,-0.017202,-0.065771,-0.0232,-0.013679,0.100101,0.000728,0.010227,0.014341,0.071312,-0.052557,-0.011584,-0.024388,-0.001331


## Add skew to data set

Skew: Categorise all men above the age of 30 as high risk (`Risk = 1`), overwriting their original label.

In [195]:
# Select rows that are male and not in the 18-30 age bucket, then force their label to 1.
outside_18_30 = df['Age_18-30'] == 0
is_male = df['Sex_male'] == 1
df.loc[outside_18_30 & is_male, 'Risk'] = 1

In [196]:
# Recompute the correlations now that the Risk labels have been overwritten.
df.corr()

Unnamed: 0,Job,Credit amount,Duration,Risk,Sex_female,Sex_male,Housing_free,Housing_own,Housing_rent,Savings_little,Savings_moderate,Savings_quite rich,Savings_rich,Checking_little,Checking_moderate,Checking_rich,Purpose_business,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others,Age_18-30,Age_31-40,Age_41-50,Age_51-60,Age_61-80
Job,1.0,0.285385,0.21091,0.066224,-0.070298,0.070298,0.134972,-0.059393,-0.039195,-0.00035,0.004472,-0.037499,-0.017128,-0.026413,-0.004059,-0.050099,0.001614,0.036762,-0.025979,-0.021672,0.013422,-0.027683,-0.092751,0.100544,-0.032804,0.060106,-0.035585,0.027532,-0.027173
Credit amount,0.285385,1.0,0.624984,0.148131,-0.093482,0.093482,0.201643,-0.117497,-0.024611,-0.036443,0.013546,-0.064256,-0.055542,-0.020912,0.119612,-0.10051,0.103016,0.125575,-0.069268,-0.034796,-0.034037,-0.173203,-0.028875,0.192893,-0.028669,0.006999,0.026773,0.013483,-0.011479
Duration,0.21091,0.624984,1.0,0.152973,-0.081432,0.081432,0.189117,-0.075169,-0.064417,-0.047228,0.051587,-0.040257,-0.048261,0.022244,0.089452,-0.076455,0.164113,-0.00532,-0.037212,-0.02545,-0.062804,-0.044319,-0.022549,0.104516,0.014986,0.015817,-0.012277,0.007117,-0.057887
Risk,0.066224,0.148131,0.152973,1.0,-0.40765,0.40765,0.158405,-0.020574,-0.103975,0.013954,0.005294,0.004218,-0.076783,0.097422,0.014276,-0.055916,0.039697,0.085261,-0.013725,0.03599,-0.072549,-0.079211,0.012152,0.024613,-0.493243,0.316571,0.190114,0.052033,0.061064
Sex_female,-0.070298,-0.093482,-0.081432,-0.40765,1.0,-1.0,-0.100872,-0.119638,0.222845,0.031244,0.000498,-0.031414,0.041673,0.014834,0.012726,0.004183,-0.080875,-0.047893,0.045275,0.052397,0.100467,-0.008668,-0.026828,-0.014297,0.239899,-0.161286,-0.117131,0.025079,-0.030769
Sex_male,0.070298,0.093482,0.081432,0.40765,-1.0,1.0,0.100872,0.119638,-0.222845,-0.031244,-0.000498,0.031414,-0.041673,-0.014834,-0.012726,-0.004183,0.080875,0.047893,-0.045275,-0.052397,-0.100467,0.008668,0.026828,0.014297,-0.239899,0.161286,0.117131,-0.025079,0.030769
Housing_free,0.134972,0.201643,0.189117,0.158405,-0.100872,0.100872,1.0,-0.548445,-0.162474,0.012353,-0.011914,-0.010662,-0.047989,0.067961,-0.000378,0.01586,-0.059613,0.126806,-0.038348,0.117976,-0.07153,-0.109357,0.013706,0.08001,-0.198989,0.006797,0.084261,0.161972,0.110968
Housing_own,-0.059393,-0.117497,-0.075169,-0.020574,-0.119638,0.119638,-0.548445,1.0,-0.735968,0.000276,-0.017738,0.000737,0.028707,-0.125705,0.001012,0.018934,0.051083,-0.099526,0.029317,-0.075684,-0.040496,0.134705,0.019803,-0.011288,-0.054109,0.097107,0.00726,-0.065718,-0.022234
Housing_rent,-0.039195,-0.024611,-0.064417,-0.103975,0.222845,-0.222845,-0.162474,-0.735968,1.0,-0.010327,0.030578,0.007762,0.004979,0.09331,-0.000888,-0.035183,-0.012013,0.014773,-0.003546,-0.006211,0.1057,-0.070414,-0.034465,-0.05146,0.224961,-0.120091,-0.076788,-0.053592,-0.063608
Savings_little,-0.00035,-0.036443,-0.047228,0.013954,0.031244,-0.031244,0.012353,0.000276,-0.010327,1.0,-0.417624,-0.319568,-0.276736,0.246438,-0.047045,0.025329,-0.017202,-0.065771,-0.0232,-0.013679,0.100101,0.000728,0.010227,0.014341,0.071312,-0.052557,-0.011584,-0.024388,-0.001331


## Train Test Split

In [197]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split
# (and therefore the saved train/test files) identical on every run.
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [198]:
# Target vectors for each split; the 'Risk' column also stays in train/test.
y_train, y_test = train['Risk'], test['Risk']

## Preprocessing

In [199]:
# Standardise the three numeric feature columns; no other columns are handed
# to the transformer.
ct = make_column_transformer(
    (StandardScaler(), ['Job', 'Credit amount', 'Duration']),
)

In [200]:
# Fit the scaler on the training split only; the test split is transformed
# with the same fitted transformer in the next cell.
ct.fit(train)

ColumnTransformer(transformers=[('standardscaler', StandardScaler(),
                                 ['Job', 'Credit amount', 'Duration'])])

In [201]:
# Apply the transformer fitted on the training split to both splits.
train_transformed, test_transformed = (ct.transform(split) for split in (train, test))

In [202]:
# Columns that were standardised by the column transformer.
scaled_columns = ['Job', 'Credit amount', 'Duration']

# train/test come from row selection on df; take explicit copies so the
# column assignment below cannot trigger pandas' SettingWithCopyWarning.
train = train.copy()
test = test.copy()

# Overwrite the raw values with their standardised counterparts.
train[scaled_columns] = train_transformed
test[scaled_columns] = test_transformed

## Store final data sets for reuse

In [203]:
# Persist both splits; index=False leaves the row index out of the files.
# The 'Risk' label column is still part of both frames and is written as well.
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)