# Cleaning data and creating custom features for Give Me Some Credit Kaggle Challenge

1. Cleaning the monthly income from NA values
2. Clean the debt ratio by replacing NA by the mean (Should ideally be done after splitting and the mean should be calculated on the training set if you want to do an evaluation)
3. Creating a monthly debt feature
    * monthly income multiplied by debt ratio if income is not 0
    * debt ratio if income is 0
4. Create a Balanced Income feature that takes into account income and debt ratio
5. Clean the number of dependents feature
    * set NA to zero
6. Create a Balanced Income per household member feature
7. Cleaning the Number of Times Late feature
    * Create a custom categorical feature that tags each row whose Number of Times Late is either 96 or 98 (one distinct tag per value)
    * Remove the 96 and 98 values (Replacing those values by NA or some other justifiable value)
8. Add a feature that computes the weighted sum of the number of times late per duration
    * weight of 3 for 90 days and more
    * weight of 2 for 60 to 89 days
    * weight of 1 for 30 to 59 days


In [None]:
from pandas import DataFrame, read_csv, to_numeric
from sklearn.model_selection import train_test_split
from bigml.api import BigML


In [None]:
# Switches read by the cells below.
validation = False      # Do we split the data?
compression = True      # Do we compress the csv files?
send_to_BigML = False   # Do we upload the saved csv files to BigML?
build_model = False     # Not read by any active cell visible in this notebook.

# Version tag embedded in the names of the saved csv files.
version = 'v1.1'


## Loading csv files as data frames

Files must be placed in the same directory as this file. Alternatively, modify the relative path to those files.

In [None]:
# IPython shell escapes: show the working directory and its contents so the
# relative paths below can be checked by eye.
!pwd
!ls
# Load both files, using the first csv column as the row index.
fulltrain=read_csv('./cs-training.csv',index_col=0)
test=read_csv('./cs-test.csv',index_col=0)

## Reordering the objective field column to the end

In [None]:
# Move the objective field to the last column. The column list is printed
# before and after so the new order can be checked.
objective = 'SeriousDlqin2yrs'
print(list(fulltrain))
ordered_columns = [name for name in fulltrain if name != objective]
ordered_columns.append(objective)
fulltrain = fulltrain[ordered_columns]
print(list(fulltrain))
if not validation:
    # The test frame is only used when no validation split is made; give it
    # the same column order as the training frame.
    test = test[ordered_columns]

In [None]:
# Build the [training, evaluation] pair that every later cell iterates over.
if validation:
    # Hold out 20% of the labelled data for validation.
    train80, test20 = train_test_split(fulltrain, test_size=0.2)
    # Detach both parts from `fulltrain` so the in-place edits made by the
    # following cells operate on independent frames rather than on slices.
    train80, test20 = train80.copy(), test20.copy()
    data_sets = [train80, test20]
else:
    data_sets = [fulltrain, test]

## Correcting the values in monthly income
Set NaN to 0 in the monthly income column

In [None]:
# Print the pandas documentation for fillna (reference only; no data is touched).
help(DataFrame.fillna)

In [None]:
# Replace missing monthly incomes with 0 in every frame.
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on the result of df.loc[:, ...] modifies an intermediate object and is not
# guaranteed to reach `df` (it never does under pandas Copy-on-Write).
for df in data_sets:
    df['MonthlyIncome'] = df['MonthlyIncome'].fillna(0)

data_sets[0].head(10)

## Cleaning Debt ratio
Set NA values to the mean from the training set

In [None]:
# Mean debt ratio of the training frame only (data_sets[0]); the same value
# is used to fill the evaluation frame so no information leaks from it.
# Note the call: `.mean` without parentheses is a bound method, not a number.
mean = data_sets[0]['DebtRatio'].mean()

# Assign the filled column back instead of relying on a chained
# fillna(inplace=True), which may act on a temporary object.
for df in data_sets:
    df['DebtRatio'] = df['DebtRatio'].fillna(mean)

data_sets[0].head(20)

## Creating a new column for monthly debt.

In [None]:
# Print the pandas documentation for insert (reference only; no data is touched).
help(DataFrame.insert)

In [None]:
# Reserve a zero-filled "MonthlyDebt" column at position 5 of each frame;
# the actual values are computed in a later cell.
for frame in data_sets:
    frame.insert(loc=5, column="MonthlyDebt", value=0)
data_sets[0].head(10)

In [None]:
#help(DataFrame.apply)

(Removed: `apply` is slow!) The function calculates the monthly debt.
If the income is zero, we take the monthly debt from the debt ratio and set the debt ratio to 0.

In [None]:
#def monthlyDebtCalc(row):
#    if row['MonthlyIncome'] == 0:
#        row['MonthlyDebt']=row['DebtRatio']
#        row['DebtRatio']=0
#    else:
#        row['MonthlyDebt']=row['DebtRatio']*row['MonthlyIncome']
#    return row
    

(Removed: `apply` is slow!) We apply the previous formula to each row of the data frame. This process might take some time.

In [None]:
#for df in data_sets:
#    tmp=df[['DebtRatio','MonthlyDebt','MonthlyIncome']].apply(monthlyDebtCalc,axis=1)
#    df.loc[:,['MonthlyDebt','DebtRatio']]=tmp[['MonthlyDebt','DebtRatio']]

#data_sets[0][['MonthlyDebt','DebtRatio','MonthlyIncome']].head(10)

TODO: `mask` should help here

In [None]:
# Derive the monthly debt from the debt ratio.
#   income != 0 : MonthlyDebt = DebtRatio * MonthlyIncome
#   income == 0 : MonthlyDebt = DebtRatio, and DebtRatio is reset to 0
for df in data_sets:
    # Compute the zero-income selector once; it must be taken before
    # DebtRatio is overwritten below.
    no_income = df['MonthlyIncome'] == 0

    # Plain column assignment replaces the placeholder column (keeping its
    # position) rather than writing floats into an integer column via .loc.
    df['MonthlyDebt'] = (df['DebtRatio'] * df['MonthlyIncome']).mask(no_income, df['DebtRatio'])
    df['DebtRatio'] = df['DebtRatio'].mask(no_income, 0)


data_sets[0][['MonthlyDebt','DebtRatio','MonthlyIncome']].head(10)

## Creating a balanced Income feature

In [None]:
# Reserve a zero-filled 'BalancedIncome' column at position 6 of each frame.
for frame in data_sets:
    frame.insert(loc=6, column='BalancedIncome', value=0)

In [None]:
# Balanced income = what is left of the monthly income after the monthly debt.
for df in data_sets:
    # Replace the column outright; writing float results through
    # .loc[:, ...] into the integer placeholder relies on an implicit upcast.
    df['BalancedIncome'] = df['MonthlyIncome'] - df['MonthlyDebt']
data_sets[0].head(20)

## Cleaning the number of dependents column
Set the number of dependents to 0 when it is not available.

In [None]:
# Missing dependents count as 0, then the column is stored with the smallest
# integer dtype that fits.
for df in data_sets:
    # Fill and downcast in one expression and assign the result back:
    #  - a chained fillna(inplace=True) on df.loc[:, ...] may act on a
    #    temporary object and leave `df` unchanged;
    #  - writing the downcast values through .loc[:, ...] sets them into the
    #    existing column, which in recent pandas keeps the old float dtype.
    df['NumberOfDependents'] = to_numeric(df['NumberOfDependents'].fillna(0), downcast='integer')

data_sets[0][['NumberOfDependents']].head(10)

## Add a balanced income per household members feature

In [None]:
# Reserve a zero-filled 'IncomePerHouseholdMember' column at position 7.
for frame in data_sets:
    frame.insert(loc=7, column='IncomePerHouseholdMember', value=0)

In [None]:
# Spread the balanced income over the household: the dependents plus the
# borrower (hence the +1, which also avoids dividing by zero).
for df in data_sets:
    # Replace the column outright instead of writing float results through
    # .loc[:, ...] into the integer placeholder.
    df['IncomePerHouseholdMember'] = df['BalancedIncome'] / (df['NumberOfDependents'] + 1)
data_sets[0][['IncomePerHouseholdMember']].head(20)

## Adding a "number of times late" categorical feature

In [None]:
# Reserve a 'LateCategory' column at position 8; it is filled with tags later.
for frame in data_sets:
    frame.insert(loc=8, column='LateCategory', value=0)

In [None]:
# Peek at the training rows whose 90-days-late count is the special value 98.
_train = data_sets[0]
_is_98 = _train['NumberOfTimes90DaysLate'] == 98
_train.loc[_is_98, ['LateCategory', 'NumberOfTimes90DaysLate']].head(20)

In [None]:
#def monthlyDebtCalc(row):
#    if row['NumberOfTimes90DaysLate'] == 98:
#        row['LateCategory']="L98"
#        row['NumberOfTimes90DaysLate']=0
#        row['NumberOfTime60-89DaysPastDueNotWorse']=0
#        row['NumberOfTime30-59DaysPastDueNotWorse']=0
#    elif row['NumberOfTimes90DaysLate'] == 96:
#        row['LateCategory']="L96"
#        row['NumberOfTimes90DaysLate']=0
#        row['NumberOfTime60-89DaysPastDueNotWorse']=0
#        row['NumberOfTime30-59DaysPastDueNotWorse']=0
#    else:
#       row['LateCategory']="L0"
#   return row

In [None]:
# Tag rows whose late counters hold the special values 96 / 98 and blank out
# those counters.
late_columns = [
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
]

for df in data_sets:
    # Take both selectors before the counters are blanked below.
    is_98 = df['NumberOfTimes90DaysLate'] == 98
    is_96 = df['NumberOfTimes90DaysLate'] == 96

    # Plain assignment replaces the numeric placeholder with a text column.
    df['LateCategory'] = "L0"
    df.loc[is_98, 'LateCategory'] = "L98"
    df.loc[is_96, 'LateCategory'] = "L96"

    # Blank the counters with a real missing value (NaN), not the text "NA":
    # text would turn the columns into object dtype, and later arithmetic on
    # them would repeat/concatenate strings instead of producing numbers.
    flagged = is_98 | is_96
    for column in late_columns:
        df[column] = df[column].mask(flagged)

data_sets[0].loc[(data_sets[0]['LateCategory']=='L96') | (data_sets[0]['LateCategory']=='L98'),['LateCategory','NumberOfTimes90DaysLate']].head(20)

Removed: the row-wise `apply` version below is commented out and kept for reference only.

In [None]:
#for df in data_sets:
#    tmp=df[['LateCategory','NumberOfTimes90DaysLate','NumberOfTime60-89DaysPastDueNotWorse','NumberOfTime30-59DaysPastDueNotWorse']].apply(monthlyDebtCalc,axis=1)
#    df.loc[:,['LateCategory','NumberOfTimes90DaysLate','NumberOfTime60-89DaysPastDueNotWorse','NumberOfTime30-59DaysPastDueNotWorse']]=tmp

#data_sets[0].loc[(data_sets[0]['LateCategory']=='L96') | (data_sets[0]['LateCategory']=='L98'),['LateCategory','NumberOfTimes90DaysLate']].head(20)

## Add a "Late score" feature

In [None]:
# Reserve a zero-filled 'LateScore' column as the first column of each frame.
for frame in data_sets:
    frame.insert(loc=0, column='LateScore', value=0)

In [None]:
# Weighted sum of the late counters: 3 x (90+ days) + 2 x (60-89 days)
# + 1 x (30-59 days).
for df in data_sets:
    # Coerce each counter to a numeric series first. Any non-numeric
    # placeholder becomes NaN, so the score is NaN for that row instead of
    # text being repeated and concatenated by `*` and `+`.
    late_90 = to_numeric(df['NumberOfTimes90DaysLate'], errors='coerce')
    late_60 = to_numeric(df['NumberOfTime60-89DaysPastDueNotWorse'], errors='coerce')
    late_30 = to_numeric(df['NumberOfTime30-59DaysPastDueNotWorse'], errors='coerce')
    df['LateScore'] = 3 * late_90 + 2 * late_60 + late_30

data_sets[0][['LateScore']].head(10)

## Remove the features we don't need

In [None]:
# Show the training frame's column names before the raw counters are dropped.
print(data_sets[0].columns.tolist())

In [None]:
# The three raw late counters are dropped from every frame, one column at a
# time, in the order listed.
raw_late_columns = (
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
)
for frame in data_sets:
    for column in raw_late_columns:
        frame.drop(column, axis=1, inplace=True)

print(list(data_sets[0]))

## Save the data frames as csv files

In [None]:
# Write both frames to csv files named gmsc-<label>-<version>.csv[.bz2].
# The '.bz2' suffix is what selects compression: to_csv infers the codec
# from the file extension by default.
compress = '.bz2' if compression else ''

if validation:
    labels = ['train80', 'valid20']
else:
    # The test file previously carried a stray '.csv' inside its name
    # ('gmsc-test.csv-<version>.csv'); it now follows the same pattern as
    # the other three files.
    labels = ['fulltrain', 'test']

# filenames[0] is the training file, filenames[1] the evaluation file,
# matching the order of data_sets.
filenames = ['gmsc-' + label + '-' + version + '.csv' + compress for label in labels]

for frame, filename in zip(data_sets, filenames):
    frame.to_csv(filename, index_label='Id')

print("Done")

## Send the data to BigML and create an ensemble model

In [None]:
# Optionally upload the two saved csv files to BigML as sources.
if send_to_BigML:
    api = BigML(project='project/5d94a428eba31d460c00023f')

    # Upload exactly the files written by the save cell. `filenames` already
    # reflects the validation / full-train choice and includes the version
    # tag; the previously hard-coded 'gmsc-*.csv' names had no version tag
    # and therefore did not match any file that cell writes.
    train_src = api.create_source(filenames[0])
    api.ok(train_src)   # presumably waits until the source is ready; confirm in BigML docs
    test_src = api.create_source(filenames[1])
    api.ok(test_src)

    print("Sources created")

In [None]:
#train_ds = api.create_dataset(train_src)
#api.ok(train_ds)
#test_ds = api.create_dataset(test_src)
#api.ok(test_ds)

#print("Data sets created")

#model = api.create_ensemble(train_ds)
#api.ok(model)
#print("Model created")

## Validation or test batch prediction

In [None]:
#if (validation == True):
#    evaluation = api.get_evaluation(model, test_ds)
#    api.ok(evaluation)
#    api.pprint(evaluation['object']['result'])
#else:
#    batch_prediction = api.create_batch_prediction(model, test_ds, {
#        "name": "my batch prediction",
##        "all_fields": True,
##        "header": True,
#        "confidence": True})