# Cleaning data and creating custom features for Give Me Some Credit Kaggle Challenge

1. Cleaning the monthly income from NA values
2. Clean the debt ratio by replacing NA by the mode (computed after splitting, on the 80% training set only, so that the evaluation on the validation set is not biased by the fill value)
3. Creating a monthly debt feature
    * monthly income multiplied by debt ratio if income is not 0
    * debt ratio if income is 0
4. Create a Balanced Income feature that takes into account income and debt ratio
    * Set income to 0 when negative.
5. Clean the number of dependents feature
    * set NA to zero
6. Create a Balanced Income per household member feature
7. Cleaning the Number of Times Late feature
    * Create a custom categorical feature that tags each row whose number of times late is either 96 or 98 (one distinct tag per value)
    * Remove the 96 and 98 values (Replacing those values by NA or some other justifiable value)
8. Add a feature that computes the weighted sum of the number of times late per duration
    * weight of 3 for 90 days and more
    * weight of 2 for 60 to 89 days
    * weight of 1 for 30 to 59 days


In [59]:
from pandas import DataFrame, read_csv, to_numeric
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, roc_auc_score
import matplotlib.pyplot as plt
from tools import json_save_project, json_load_project
from bigml.api import BigML
import kaggle

from dataprocessing import *
from tools import *

%matplotlib inline

In [60]:
# Run configuration: project identifier, data-set version tag, and whether the
# exported csv files are compressed.
project = 'gmsc'
version = 'v1.3'
compression = True

In [61]:
# Every generated file ends in "<version>.csv", followed by ".bz2" when
# compression is switched on.
suffix = version + '.csv' + ('.bz2' if compression else '')

# Output names for the full training set, the 80/20 split and the test set.
filename_fulltrain = 'gmsc-fulltrain-' + suffix
filename_train = 'gmsc-train80-' + suffix
filename_valid = 'gmsc-valid20-' + suffix
filename_test = 'gmsc-test-' + suffix

filename_bigml_ids = 'bigml-' + suffix

In [62]:
# Load the stored metadata for this project/version (json_load_project comes from tools).
project_data = json_load_project(project, version)
# Start from an empty processing log; each step below appends an entry to it.
project_data['processing']=[]

## Loading csv files as data frames

Files must be placed in the same directory as this file. Alternatively, modify the relative path to those files.

In [63]:
# Record the names of the raw Kaggle files in the project metadata.
project_data.update(
    fulltrain_csv='cs-training.csv',
    test_csv='cs-test.csv',
)

In [64]:
!pwd
#!ls
fulltrain=read_csv('./cs-training.csv',index_col=0)
test=read_csv('./cs-test.csv',index_col=0)

/home/guillaume/Devel/machine-learning/ML-notebooks/GiveMeSomeCredit


## Reordering the objective field column to the end

In [65]:
# Move the objective field to the last position in both frames; the remaining
# columns keep their original order.
fulltrain_features = [name for name in fulltrain.columns if name != 'SeriousDlqin2yrs']
fulltrain = fulltrain[fulltrain_features + ['SeriousDlqin2yrs']]

test_features = [name for name in test.columns if name != 'SeriousDlqin2yrs']
test = test[test_features + ['SeriousDlqin2yrs']]

In [66]:
# Reproducible 80/20 split of the labelled data; the 20% part is written out
# under the "valid20" file name.
fixed_seed = 12345
train80, test20 = train_test_split(
    fulltrain,
    test_size=0.2,
    random_state=fixed_seed,
)

# Parallel lists: every processing step below runs on each frame, and each
# frame is finally saved under the file name at the same position.
data_sets = [fulltrain, train80, test20, test]
filenames = [filename_fulltrain, filename_train, filename_valid, filename_test]

## Correcting the values in monthly income
Set NaN to 0 in the monthly income column

In [67]:
# Replace missing MonthlyIncome by 0 in every data set
# (process_fillna comes from dataprocessing).
args = {'column': 'MonthlyIncome', 'value': 0}
for frame in data_sets:
    process_fillna(frame, **args)

# Log the step (function name + arguments) in the project metadata.
project_data['processing'].append({'function': 'process_fillna', 'arguments': args})

## Cleaning Debt ratio
Set NA values to the mode from the training set

In [68]:
# Most frequent DebtRatio in the 80% training split, used below as the NA fill
# value. Series.mode() returns a Series (with several entries when values tie),
# so pick the first entry explicitly: calling float() on the Series itself
# fails on ties and is deprecated for one-element Series in recent pandas.
debt_ratio_mode = float(train80['DebtRatio'].mode().iloc[0])

In [69]:
# Fill missing DebtRatio values in every data set with the mode computed on
# the training split.
args = {'column': 'DebtRatio', 'value': debt_ratio_mode}
for frame in data_sets:
    process_fillna(frame, **args)

# Log the step (function name + arguments) in the project metadata.
project_data['processing'].append({'function': 'process_fillna', 'arguments': args})


## Creating a new column for monthly debt.

In [70]:
# Add a zero-filled MonthlyDebt column at position 5 of every frame.
# DataFrame.insert refuses an already existing column name by default, so
# this cell can only run once per kernel session.
for frame in data_sets:
    frame.insert(loc=5, column="MonthlyDebt", value=0)

# Preview the full training set.
data_sets[0].head(n=10)

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,MonthlyDebt,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,SeriousDlqin2yrs
1,0.766127,45,2,0.802982,9120.0,0,13,0,6,0,2.0,1
2,0.957151,40,0,0.121876,2600.0,0,4,0,0,0,1.0,0
3,0.65818,38,1,0.085113,3042.0,0,2,1,0,0,0.0,0
4,0.23381,30,0,0.03605,3300.0,0,5,0,0,0,0.0,0
5,0.907239,49,1,0.024926,63588.0,0,7,0,1,0,0.0,0
6,0.213179,74,0,0.375607,3500.0,0,3,0,1,0,1.0,0
7,0.305682,57,0,5710.0,0.0,0,8,0,3,0,0.0,0
8,0.754464,39,0,0.20994,3500.0,0,8,0,0,0,0.0,0
9,0.116951,27,0,46.0,0.0,0,2,0,0,0,,0
10,0.189169,57,0,0.606291,23684.0,0,9,0,4,0,2.0,0


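If there is no income (MonthlyIncome is 0), the DebtRatio column holds an absolute monthly debt rather than a ratio: that value is used as MonthlyDebt, and DebtRatio is replaced by `debt_ratio_fill_value` (here `None`, i.e. NA).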

In [71]:
# Fill MonthlyDebt in every data set (monthly_debt comes from dataprocessing).
# With a fill value of None, DebtRatio ends up NA on the zero-income rows, as
# the preview below shows.
args = {'debt_ratio_fill_value': None}
for frame in data_sets:
    monthly_debt(frame, **args)

# Log the step (function name + arguments) in the project metadata.
project_data['processing'].append({'function': 'monthly_debt', 'arguments': args})

# Preview the three related columns of the full training set.
data_sets[0].loc[:, ['MonthlyDebt', 'DebtRatio', 'MonthlyIncome']].head(10)

Unnamed: 0,MonthlyDebt,DebtRatio,MonthlyIncome
1,7323.197016,0.802982,9120.0
2,316.878123,0.121876,2600.0
3,258.914887,0.085113,3042.0
4,118.963951,0.03605,3300.0
5,1584.975094,0.024926,63588.0
6,1314.624392,0.375607,3500.0
7,5710.0,,0.0
8,734.790059,0.20994,3500.0
9,46.0,,0.0
10,14359.393699,0.606291,23684.0


## Creating a balanced Income feature

In [72]:
# Add an empty BalancedIncome column at position 6 of every frame; the
# placeholder is None rather than 0.
for frame in data_sets:
    frame.insert(loc=6, column='BalancedIncome', value=None)

In [73]:
# Fill BalancedIncome in every data set (balanced_income comes from
# dataprocessing). NOTE(review): presumably MonthlyIncome - MonthlyDebt on
# the rows where income exceeds debt -- confirm against the helper.
for frame in data_sets:
    balanced_income(frame)

# Log the step in the project metadata; it takes no arguments.
project_data['processing'].append({'function': 'balanced_income', 'arguments': {}})

# Preview the full training set.
data_sets[0].head(n=20)

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,MonthlyDebt,BalancedIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,SeriousDlqin2yrs
1,0.766127,45,2,0.802982,9120.0,7323.197016,1796.8,13,0,6,0,2.0,1
2,0.957151,40,0,0.121876,2600.0,316.878123,2283.12,4,0,0,0,1.0,0
3,0.65818,38,1,0.085113,3042.0,258.914887,2783.09,2,1,0,0,0.0,0
4,0.23381,30,0,0.03605,3300.0,118.963951,3181.04,5,0,0,0,0.0,0
5,0.907239,49,1,0.024926,63588.0,1584.975094,62003.0,7,0,1,0,0.0,0
6,0.213179,74,0,0.375607,3500.0,1314.624392,2185.38,3,0,1,0,1.0,0
7,0.305682,57,0,,0.0,5710.0,,8,0,3,0,0.0,0
8,0.754464,39,0,0.20994,3500.0,734.790059,2765.21,8,0,0,0,0.0,0
9,0.116951,27,0,,0.0,46.0,,2,0,0,0,,0
10,0.189169,57,0,0.606291,23684.0,14359.393699,9324.61,9,0,4,0,2.0,0


## Cleaning the number of dependents column
Set to 0 the number of dependents when not available.

In [74]:
# Treat a missing number of dependents as 0 in every data set.
args = {'column': 'NumberOfDependents', 'value': 0}
for frame in data_sets:
    process_fillna(frame, **args)

# Log the step (function name + arguments) in the project metadata.
project_data['processing'].append({'function': 'process_fillna', 'arguments': args})

# Preview the cleaned column of the full training set.
data_sets[0].loc[:, ['NumberOfDependents']].head(10)

Unnamed: 0,NumberOfDependents
1,2.0
2,1.0
3,0.0
4,0.0
5,0.0
6,1.0
7,0.0
8,0.0
9,0.0
10,2.0


## Add a balanced income per household members feature

In [75]:
# Add a zero-filled IncomePerHouseholdMember column at position 7 of every frame.
for frame in data_sets:
    frame.insert(loc=7, column='IncomePerHouseholdMember', value=0)

In [76]:
# Fill IncomePerHouseholdMember in every data set (income_per_hm comes from
# dataprocessing). NOTE(review): presumably
# BalancedIncome / (NumberOfDependents + 1) -- confirm against the helper.
for frame in data_sets:
    income_per_hm(frame)

# Log the step in the project metadata; it takes no arguments.
project_data['processing'].append({'function': 'income_per_hm', 'arguments': {}})

# Preview the new column of the full training set.
data_sets[0].loc[:, ['IncomePerHouseholdMember']].head(20)

Unnamed: 0,IncomePerHouseholdMember
1,598.934
2,1141.56
3,2783.09
4,3181.04
5,62003.0
6,1092.69
7,
8,2765.21
9,
10,3108.2


## Adding a "number of times late" categorical feature

In [77]:
# Add a LateCategory column (initially 0) at position 8 of every frame.
for frame in data_sets:
    frame.insert(loc=8, column='LateCategory', value=0)

In [78]:
# Before tagging: rows of the full training set whose 90-days-late count is
# the special value 98.
rows_with_98 = data_sets[0]['NumberOfTimes90DaysLate'] == 98
data_sets[0].loc[rows_with_98, ['LateCategory', 'NumberOfTimes90DaysLate']].head(20)

Unnamed: 0,LateCategory,NumberOfTimes90DaysLate
1734,0,98
2287,0,98
3885,0,98
4418,0,98
4706,0,98
5074,0,98
6281,0,98
7033,0,98
7118,0,98
7688,0,98


In [79]:
# Tag every data set with its late category (late_category comes from
# dataprocessing). NOTE(review): presumably "L0" by default and "L96"/"L98"
# where NumberOfTimes90DaysLate is 96/98, with the three late counters
# blanked on those rows -- confirm against the helper.
for frame in data_sets:
    late_category(frame)

# Log the step in the project metadata; it takes no arguments.
project_data['processing'].append({'function': 'late_category', 'arguments': {}})

# After tagging: rows of the full training set that received a special tag.
is_l96 = data_sets[0]['LateCategory'] == 'L96'
is_l98 = data_sets[0]['LateCategory'] == 'L98'
data_sets[0].loc[is_l96 | is_l98, ['LateCategory', 'NumberOfTimes90DaysLate']].head(20)

Unnamed: 0,LateCategory,NumberOfTimes90DaysLate
1734,L98,
2287,L98,
3885,L98,
4418,L98,
4706,L98,
5074,L98,
6281,L98,
7033,L98,
7118,L98,
7688,L98,


## Add a "Late score" feature

In [80]:
# Add a zero-filled LateScore column as the first column of every frame.
for frame in data_sets:
    frame.insert(loc=0, column='LateScore', value=0)

In [81]:
# Weights for the late score. NOTE(review): presumably ordered as
# [30-59 days, 60-89 days, 90+ days], matching the overview at the top of the
# notebook -- confirm against dataprocessing.late_score.
weight = [1, 2, 3]
args = {'weight': weight}
for frame in data_sets:
    late_score(frame, **args)

# Log the step (function name + arguments) in the project metadata.
project_data['processing'].append({'function': 'late_score', 'arguments': args})

# Preview the new column of the full training set.
data_sets[0].loc[:, ['LateScore']].head(10)

Unnamed: 0,LateScore
1,2.0
2,0.0
3,4.0
4,0.0
5,1.0
6,0.0
7,0.0
8,0.0
9,0.0
10,0.0


## Remove the features we don't need

In [82]:
# Column names of the full training set before dropping anything.
print(list(data_sets[0].columns))

['LateScore', 'RevolvingUtilizationOfUnsecuredLines', 'age', 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome', 'MonthlyDebt', 'BalancedIncome', 'IncomePerHouseholdMember', 'LateCategory', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents', 'SeriousDlqin2yrs']


In [83]:
# Columns removed from every data set before export
# (drop_columns comes from dataprocessing).
columns = [
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
    'NumberOfDependents',
]
args = {'columns': columns}
for frame in data_sets:
    drop_columns(frame, columns)

# Log the step (function name + arguments) in the project metadata.
project_data['processing'].append({'function': 'drop_columns', 'arguments': args})

In [87]:
# Persist the project metadata, including the processing log built above
# (json_save_project comes from tools), then display it for inspection.
json_save_project(project_data, project, version)
project_data

{'mods': ['monthly_income_fillna'],
 'fulltrain_csv': 'cs-training.csv',
 'test_csv': 'cs-test.csv',
 'processing': [{'function': 'process_fillna',
   'arguments': {'column': 'MonthlyIncome', 'value': 0}},
  {'function': 'process_fillna',
   'arguments': {'column': 'DebtRatio', 'value': 0.0}},
  {'function': 'monthly_debt', 'arguments': {'debt_ratio_fill_value': None}},
  {'function': 'balanced_income', 'arguments': {}},
  {'function': 'process_fillna',
   'arguments': {'column': 'NumberOfDependents', 'value': 0}},
  {'function': 'income_per_hm', 'arguments': {}},
  {'function': 'late_category', 'arguments': {}},
  {'function': 'late_score', 'arguments': {'weight': [1, 2, 3]}},
  {'function': 'drop_columns',
   'arguments': {'columns': ['NumberOfTime30-59DaysPastDueNotWorse',
     'NumberOfTime60-89DaysPastDueNotWorse',
     'NumberOfTimes90DaysLate',
     'NumberOfDependents']}}]}

In [85]:
# Column names of the full training set after the drop step.
print(list(data_sets[0].columns))

['LateScore', 'RevolvingUtilizationOfUnsecuredLines', 'age', 'DebtRatio', 'MonthlyIncome', 'MonthlyDebt', 'BalancedIncome', 'IncomePerHouseholdMember', 'LateCategory', 'NumberOfOpenCreditLinesAndLoans', 'NumberRealEstateLoansOrLines', 'SeriousDlqin2yrs']


## Save the data frames as csv files

In [86]:
# Write each processed frame to the file name at the same position in
# `filenames`. Pairing the two lists with zip() removes the hard-coded count
# of 4, so the loop follows the lists if a data set is added or removed.
# pandas infers bz2 compression from a ".bz2" extension (to_csv default
# compression='infer'); the index is written under the column label "Id".
for frame, out_name in zip(data_sets, filenames):
    frame.to_csv(out_name, index_label='Id')