## Setup

In [14]:
from dslabs_functions import get_variable_types
from seaborn import heatmap
from dslabs_functions import HEIGHT, plot_multi_scatters_chart
from matplotlib.pyplot import figure, subplots, savefig, show, gcf
from dslabs_functions import plot_bar_chart
from dslabs_functions import set_chart_labels
from dslabs_functions import define_grid, HEIGHT
from matplotlib.figure import Figure
from numpy import ndarray
from dslabs_functions import *
from pandas import read_csv, DataFrame
from numpy import log
from pandas import Series
from scipy.stats import norm, expon, lognorm
from matplotlib.axes import Axes
from dslabs_functions import plot_multiline_chart

In [128]:
filename = "datasets/class_credit_score.csv"
file_tag = "credit_score"
data: DataFrame = read_csv(filename, na_values="", index_col="ID")

In [None]:
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option("display.max_colwidth", 200)

## Data Preparation

### Cleaning

In [129]:
# Remove non-digits from age column
data['Age'] = data['Age'].str.replace(r'[^0-9]+', '', regex=True)

# Drop name column
data = data.drop(columns=['Name'])

# Leave only area code for SSN
data['SSN'] = data['SSN'].str.slice(stop=3)
data = data.rename(columns = {'SSN': 'SSN_Area_Code'})

data

Unnamed: 0_level_0,Customer_ID,Month,Age,SSN_Area_Code,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,NumofLoan,Type_of_Loan,Delay_from_due_date,NumofDelayedPayment,ChangedCreditLimit,NumCreditInquiries,CreditMix,OutstandingDebt,CreditUtilizationRatio,Credit_History_Age,Payment_of_Min_Amount,TotalEMIpermonth,Amountinvestedmonthly,Payment_Behaviour,MonthlyBalance,Credit_Score
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
0x1602,CUS_0xd40,January,23,821,Scientist,19114.12,1824.843333,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan, and Home Equity Loan",3,7.0,11.27,4.0,,809.98,26.822620,22 Years and 1 Months,No,49.574949,80.415295,High_spent_Small_value_payments,312.494089,Good
0x1603,CUS_0xd40,February,23,821,Scientist,19114.12,,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan, and Home Equity Loan",-1,,11.27,4.0,Good,809.98,31.944960,,No,49.574949,118.280222,Low_spent_Large_value_payments,284.629163,Good
0x1604,CUS_0xd40,March,500,821,Scientist,19114.12,,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan, and Home Equity Loan",3,7.0,,4.0,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521,Low_spent_Medium_value_payments,331.209863,Good
0x1605,CUS_0xd40,April,23,821,Scientist,19114.12,,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan, and Home Equity Loan",5,4.0,6.27,4.0,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.458074,Low_spent_Small_value_payments,223.451310,Good
0x1606,CUS_0xd40,May,23,821,Scientist,19114.12,1824.843333,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan, and Home Equity Loan",6,,11.27,4.0,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153,High_spent_Medium_value_payments,341.489231,Good
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0x25fe9,CUS_0x942c,April,25,078,Mechanic,39628.99,3359.415833,4,6,7,2,"Auto Loan, and Student Loan",23,7.0,11.50,3.0,,502.38,34.663572,31 Years and 6 Months,No,35.104023,60.971333,High_spent_Large_value_payments,479.866228,Poor
0x25fea,CUS_0x942c,May,25,078,Mechanic,39628.99,3359.415833,4,6,7,2,"Auto Loan, and Student Loan",18,7.0,11.50,3.0,,502.38,40.565631,31 Years and 7 Months,No,35.104023,54.185950,High_spent_Medium_value_payments,496.651610,Poor
0x25feb,CUS_0x942c,June,25,078,Mechanic,39628.99,3359.415833,4,6,5729,2,"Auto Loan, and Student Loan",27,6.0,11.50,3.0,Good,502.38,41.255522,31 Years and 8 Months,No,35.104023,24.028477,High_spent_Large_value_payments,516.809083,Poor
0x25fec,CUS_0x942c,July,25,078,Mechanic,39628.99,3359.415833,4,6,7,2,"Auto Loan, and Student Loan",20,,11.50,3.0,Good,502.38,33.638208,31 Years and 9 Months,No,35.104023,251.672582,Low_spent_Large_value_payments,319.164979,Good


In [130]:
def process_loan_type_entry(entry):
    loan_types_split = []
    type_list = entry.replace(' and ', ' ')
    type_list = type_list.split(', ')
    for loan_type in type_list:
        loan_types_split.append('Loan_Type_' + loan_type.strip().replace(' ', '_').replace('-', '_'))
    return loan_types_split

# Split loan types and reformat the strings

loan_copy = data['Type_of_Loan']
no_nans = data.dropna()
loan_values = no_nans['Type_of_Loan'].unique()

loan_types = []
for entry in loan_values:
    loan_types += process_loan_type_entry(entry)

loan_types_columns = set(loan_types)
loan_types_columns = list(loan_types_columns)
print(loan_types_columns)


# Create columns and add to dataframe

def columns_count_occurrences(column_names, list_to_count):
    column_values = dict.fromkeys(column_names, 0)
    for item in list_to_count:
        column_values[item] += 1
    return column_values


no_nans[loan_types_columns] = no_nans.apply(lambda row: columns_count_occurrences(loan_types_columns, process_loan_type_entry(row['Type_of_Loan'])), axis='columns', result_type='expand')

no_nans.head(7)

['Loan_Type_Payday_Loan', 'Loan_Type_Not_Specified', 'Loan_Type_Personal_Loan', 'Loan_Type_Auto_Loan', 'Loan_Type_Debt_Consolidation_Loan', 'Loan_Type_Home_Equity_Loan', 'Loan_Type_Mortgage_Loan', 'Loan_Type_Credit_Builder_Loan', 'Loan_Type_Student_Loan']


Unnamed: 0_level_0,Customer_ID,Month,Age,SSN_Area_Code,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,NumofLoan,Type_of_Loan,Delay_from_due_date,NumofDelayedPayment,ChangedCreditLimit,NumCreditInquiries,CreditMix,OutstandingDebt,CreditUtilizationRatio,Credit_History_Age,Payment_of_Min_Amount,TotalEMIpermonth,Amountinvestedmonthly,Payment_Behaviour,MonthlyBalance,Credit_Score,Loan_Type_Payday_Loan,Loan_Type_Not_Specified,Loan_Type_Personal_Loan,Loan_Type_Auto_Loan,Loan_Type_Debt_Consolidation_Loan,Loan_Type_Home_Equity_Loan,Loan_Type_Mortgage_Loan,Loan_Type_Credit_Builder_Loan,Loan_Type_Student_Loan
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
0x1608,CUS_0xd40,July,23,821,Scientist,19114.12,1824.843333,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan, and Home Equity Loan",3,8.0,11.27,4.0,Good,809.98,22.537593,22 Years and 7 Months,No,49.574949,178.344067,Low_spent_Small_value_payments,244.565317,Good,0,0,1,1,0,1,0,1,0
0x160f,CUS_0x21b1,February,28,4,Teacher,34847.84,3037.986667,2,4,6,1,Credit-Builder Loan,7,1.0,7.42,2.0,Good,605.03,38.550848,26 Years and 8 Months,No,18.816215,40.391238,High_spent_Large_value_payments,484.591214,Good,0,0,0,0,0,0,0,1,0
0x1612,CUS_0x21b1,May,28,4,Teacher,34847.84,3037.986667,2,4,6,1,Credit-Builder Loan,3,1.0,6.42,2.0,Good,605.03,34.977895,26 Years and 11 Months,No,18.816215,130.11542,Low_spent_Small_value_payments,444.867032,Good,0,0,0,0,0,0,0,1,0
0x1613,CUS_0x21b1,June,28,4,Teacher,34847.84,3037.986667,2,4,6,1,Credit-Builder Loan,3,0.0,5.42,2.0,Good,605.03,33.38101,27 Years and 0 Months,No,18.816215,43.47719,High_spent_Large_value_payments,481.505262,Good,0,0,0,0,0,0,0,1,0
0x1615,CUS_0x21b1,August,28,4,Teacher,34847.84,3037.986667,2,4,6,1,Credit-Builder Loan,3,4.0,5.42,2.0,Good,605.03,32.933856,27 Years and 2 Months,No,18.816215,218.904344,Low_spent_Small_value_payments,356.078109,Good,0,0,0,0,0,0,0,1,0
0x161b,CUS_0x2dbc,February,34,486,Engineer,143162.64,12187.22,1,5,8,3,"Auto Loan, Auto Loan, and Not Specified",13,6.0,7.1,3.0,Good,1303.01,41.702573,17 Years and 10 Months,No,246.99232,232.860384,High_spent_Small_value_payments,998.869297,Good,0,1,0,2,0,0,0,0,0
0x161f,CUS_0x2dbc,June,34,486,Engineer,143162.64,12187.22,1,5,8,967,"Auto Loan, Auto Loan, and Not Specified",8,6.0,7.1,3.0,Good,1303.01,39.783993,18 Years and 2 Months,No,246.99232,257.808099,High_spent_Medium_value_payments,963.921581,Good,0,1,0,2,0,0,0,0,0


In [131]:
# Can we drop "num of loan"?

no_nans[['NumofLoan', 'Type_of_Loan']].head(15)

Unnamed: 0_level_0,NumofLoan,Type_of_Loan
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0x1608,4,"Auto Loan, Credit-Builder Loan, Personal Loan, and Home Equity Loan"
0x160f,1,Credit-Builder Loan
0x1612,1,Credit-Builder Loan
0x1613,1,Credit-Builder Loan
0x1615,1,Credit-Builder Loan
0x161b,3,"Auto Loan, Auto Loan, and Not Specified"
0x161f,967,"Auto Loan, Auto Loan, and Not Specified"
0x1620,3,"Auto Loan, Auto Loan, and Not Specified"
0x1621,3,"Auto Loan, Auto Loan, and Not Specified"
0x1626,1,Not Specified


In [132]:
# Convert credit history to months

import re

def convert_age_to_months(age):
    list_of_numbers = re.findall(r'\b\d+\b', age)
    if (len(list_of_numbers) != 2):
        print(list_of_numbers)
        raise Exception('Incorrect age input')
    years, months = int(list_of_numbers[0]), int(list_of_numbers[1])
    total_months = years * 12 + months
    return total_months

no_nans['Credit_History_Age_Months'] = no_nans.apply(lambda row: convert_age_to_months(row['Credit_History_Age']), axis='columns', result_type='expand')

no_nans[['Credit_History_Age', 'Credit_History_Age_Months']].head()

Unnamed: 0_level_0,Credit_History_Age,Credit_History_Age_Months
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0x1608,22 Years and 7 Months,271
0x160f,26 Years and 8 Months,320
0x1612,26 Years and 11 Months,323
0x1613,27 Years and 0 Months,324
0x1615,27 Years and 2 Months,326


In [134]:
# Split payment behaviour and one-hot-encode

print(data['Payment_Behaviour'].unique())

['High_spent_Small_value_payments' 'Low_spent_Large_value_payments'
 'Low_spent_Medium_value_payments' 'Low_spent_Small_value_payments'
 'High_spent_Medium_value_payments' nan 'High_spent_Large_value_payments']


### Variables encoding

The list of variables under each one of the transformations, shall be presented. If not applied explain the reason for that, based on data characteristics.

In [16]:
print(get_variable_types(data)['symbolic'])
for var in get_variable_types(data)['symbolic']:
    print(var + ':')
    print(data[var].describe())
    print(data[var].unique())
    print()

['Customer_ID', 'Month', 'Name', 'SSN', 'Occupation', 'Type_of_Loan', 'CreditMix', 'Credit_History_Age', 'Payment_of_Min_Amount', 'Payment_Behaviour']
Customer_ID:
count        100000
unique        12500
top       CUS_0xd40
freq              8
Name: Customer_ID, dtype: object
['CUS_0xd40' 'CUS_0x21b1' 'CUS_0x2dbc' ... 'CUS_0xaf61' 'CUS_0x8600'
 'CUS_0x942c']

Month:
count      100000
unique          8
top       January
freq        12500
Name: Month, dtype: object
['January' 'February' 'March' 'April' 'May' 'June' 'July' 'August']

Name:
count      90015
unique     10139
top       Langep
freq          44
Name: Name, dtype: object
['Aaron Maashoh' nan 'Rick Rothackerj' ... 'Chris Wickhamm'
 'Sarah McBridec' 'Nicks']

SSN:
count           94428
unique          12500
top       078-73-5990
freq                8
Name: SSN, dtype: object
['821-00-0265' nan '004-07-5839' ... '133-16-7738' '031-35-0942'
 '078-73-5990']

Occupation:
count      92938
unique        15
top       Lawyer
freq        

### Missing value imputation

### Outliers treatment

### Scaling

### Balancing

### Feature selection