# Cleaning data and creating custom features for Give Me Some Credit Kaggle Challenge

1. Cleaning the monthly income from NA values
2. Clean the debt ratio by replacing NA by the mean (Should ideally be done after splitting and the mean should be calculated on the training set if you want to do an evaluation)
3. Creating a monthly debt feature
    * monthly income multiplied by debt ratio if income is not 0
    * debt ratio if income is 0
4. Create a Balanced Income feature that takes into account the income and the debt ratio
5. Clean the number of dependents feature (TODO)
    * set NA to zero
6. Create a Balanced Income per household member feature (TODO)
7. Cleaning the Number of Times Late feature (TODO)
    * Remove the 96 and 98 values (Replacing those values by NA or some other justifiable value)
    * Create a custom categorical feature with two distinct tags, one for rows whose number of times late is 96 and one for rows where it is 98
8. Add a feature that computes the weighted sum of the number of times late, weighted by duration (TODO)
    * weight of 3 for 90 days and more
    * weight of 2 for 60 to 89 days
    * weight of 1 for 30 to 59 days


In [115]:
from pandas import DataFrame, read_csv
from sklearn.model_selection import train_test_split
#import bigml.api
from bigml.api import BigML


## Loading csv files as data frames

Files must be placed in the same directory as this file. Alternatively, modify the relative path to those files.

In [116]:
!pwd
!ls
fulltrain=read_csv('./cs-training.csv',index_col=0)
test=read_csv('./cs-test.csv',index_col=0)

/home/devel/handson-ml2/ML-notebooks/GiveMeSomeCredit
README.md    cs-training.csv	    test-eval.csv
cs-test.csv  custom_features.ipynb  train.csv


## Reordering the objective field column at the end

In [117]:
# Move the objective field to the last column. The column list is printed
# before and after so the reordering can be checked by eye.
print(fulltrain.columns.tolist())
feature_columns = [name for name in fulltrain.columns if name != 'SeriousDlqin2yrs']
fulltrain = fulltrain[feature_columns + ['SeriousDlqin2yrs']]
print(fulltrain.columns.tolist())

['SeriousDlqin2yrs', 'RevolvingUtilizationOfUnsecuredLines', 'age', 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents']
['RevolvingUtilizationOfUnsecuredLines', 'age', 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents', 'SeriousDlqin2yrs']


In [118]:
# validation=True  -> hold out 20% of the labelled data so a model can be evaluated.
# validation=False -> use all labelled data plus the separate test file.
validation = True

if validation:
    # random_state pins the shuffle so Restart & Run All reproduces the same split.
    # .copy() gives independent frames, which avoids pandas' SettingWithCopyWarning
    # when columns are added or overwritten in the cells below.
    train80, test20 = (
        part.copy()
        for part in train_test_split(fulltrain, test_size=0.2, random_state=42)
    )
    data_sets = [train80, test20]
else:
    data_sets = [fulltrain, test]

## Correcting the values in monthly income
Set NaN to 0 in the monthly income column

In [119]:
# Reference only: print the fillna docstring (fillna is used in the next cell).
help(DataFrame.fillna)

Help on function fillna in module pandas.core.frame:

fillna(self, value=None, method=None, axis=None, inplace=False, limit=None, downcast=None, **kwargs)
    Fill NA/NaN values using the specified method.
    
    Parameters
    ----------
    value : scalar, dict, Series, or DataFrame
        Value to use to fill holes (e.g. 0), alternately a
        dict/Series/DataFrame of values specifying which value to use for
        each index (for a Series) or column (for a DataFrame). (values not
        in the dict/Series/DataFrame will not be filled). This value cannot
        be a list.
    method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
        Method to use for filling holes in reindexed Series
        pad / ffill: propagate last valid observation forward to next valid
        backfill / bfill: use NEXT valid observation to fill gap
    axis : {0 or 'index', 1 or 'columns'}
    inplace : boolean, default False
        If True, fill in place. Note: this will modify any

In [120]:
# Treat a missing monthly income as "no income": replace NaN with 0 in every data set.
# The filled column is assigned back instead of calling fillna(inplace=True) on a
# .loc selection: that selection may be a temporary copy, in which case the
# in-place fill never reaches the frame.
for df in data_sets:
    df['MonthlyIncome'] = df['MonthlyIncome'].fillna(0)

data_sets[0].head(10)

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,SeriousDlqin2yrs
145326,0.0,62,1,0.159174,8763.0,7,0,1,0,2.0,0
10843,0.029656,77,0,0.19685,2920.0,13,0,0,0,0.0,0
130905,1.0,56,1,288.0,0.0,1,3,1,0,0.0,1
107083,0.335396,42,2,0.40629,5500.0,9,0,1,0,4.0,0
41539,0.13814,36,0,4.767116,2000.0,16,0,8,0,2.0,0
41088,0.316938,34,0,0.421629,5843.0,19,0,2,0,0.0,0
119784,0.843559,59,0,0.36682,6967.0,10,0,1,1,1.0,0
4481,0.21502,61,0,0.418145,4000.0,16,0,1,0,0.0,0
29207,0.0,67,0,6.0,0.0,10,0,0,0,0.0,0
11776,0.385747,62,0,0.606186,7500.0,16,0,1,0,3.0,0


## Cleaning Debt ratio
Set NA values to the mean from the training set

In [121]:
# Mean debt ratio of the first data set only (the training part), so the fill
# value is not computed from the rows used for evaluation.
# Note the call: `.mean` without parentheses is the bound method, not the number.
mean = data_sets[0]['DebtRatio'].mean()

# Assign the filled column back rather than filling a .loc selection in place,
# which may operate on a temporary copy and leave the frame unchanged.
for df in data_sets:
    df['DebtRatio'] = df['DebtRatio'].fillna(mean)

data_sets[0].head(20)

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,SeriousDlqin2yrs
145326,0.0,62,1,0.159174,8763.0,7,0,1,0,2.0,0
10843,0.029656,77,0,0.19685,2920.0,13,0,0,0,0.0,0
130905,1.0,56,1,288.0,0.0,1,3,1,0,0.0,1
107083,0.335396,42,2,0.40629,5500.0,9,0,1,0,4.0,0
41539,0.13814,36,0,4.767116,2000.0,16,0,8,0,2.0,0
41088,0.316938,34,0,0.421629,5843.0,19,0,2,0,0.0,0
119784,0.843559,59,0,0.36682,6967.0,10,0,1,1,1.0,0
4481,0.21502,61,0,0.418145,4000.0,16,0,1,0,0.0,0
29207,0.0,67,0,6.0,0.0,10,0,0,0,0.0,0
11776,0.385747,62,0,0.606186,7500.0,16,0,1,0,3.0,0


## Creating a new column for monthly debt.

In [122]:
# Reference only: print the insert docstring (insert is used in the next cell to add a column).
help(DataFrame.insert)

Help on function insert in module pandas.core.frame:

insert(self, loc, column, value, allow_duplicates=False)
    Insert column into DataFrame at specified location.
    
    Raises a ValueError if `column` is already contained in the DataFrame,
    unless `allow_duplicates` is set to True.
    
    Parameters
    ----------
    loc : int
        Insertion index. Must verify 0 <= loc <= len(columns)
    column : string, number, or hashable object
        label of the inserted column
    value : int, Series, or array-like
    allow_duplicates : bool, optional



In [123]:
# Add a placeholder MonthlyDebt column at position 5 of every data set; the
# real values are computed further down.
for df in data_sets:
    # insert() raises ValueError when the column already exists, so the guard
    # keeps this cell re-runnable. The placeholder is a float (0.0) because the
    # computed debts are floats.
    if 'MonthlyDebt' not in df.columns:
        df.insert(5, "MonthlyDebt", 0.0)
data_sets[0].head(10)

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,MonthlyDebt,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,SeriousDlqin2yrs
145326,0.0,62,1,0.159174,8763.0,0,7,0,1,0,2.0,0
10843,0.029656,77,0,0.19685,2920.0,0,13,0,0,0,0.0,0
130905,1.0,56,1,288.0,0.0,0,1,3,1,0,0.0,1
107083,0.335396,42,2,0.40629,5500.0,0,9,0,1,0,4.0,0
41539,0.13814,36,0,4.767116,2000.0,0,16,0,8,0,2.0,0
41088,0.316938,34,0,0.421629,5843.0,0,19,0,2,0,0.0,0
119784,0.843559,59,0,0.36682,6967.0,0,10,0,1,1,1.0,0
4481,0.21502,61,0,0.418145,4000.0,0,16,0,1,0,0.0,0
29207,0.0,67,0,6.0,0.0,0,10,0,0,0,0.0,0
11776,0.385747,62,0,0.606186,7500.0,0,16,0,1,0,3.0,0


In [124]:
# Reference only: print the apply docstring (apply with axis=1 is used below to process each row).
help(DataFrame.apply)

Help on function apply in module pandas.core.frame:

apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, result_type=None, args=(), **kwds)
    Apply a function along an axis of the DataFrame.
    
    Objects passed to the function are Series objects whose index is
    either the DataFrame's index (``axis=0``) or the DataFrame's columns
    (``axis=1``). By default (``result_type=None``), the final return type
    is inferred from the return type of the applied function. Otherwise,
    it depends on the `result_type` argument.
    
    Parameters
    ----------
    func : function
        Function to apply to each column or row.
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Axis along which the function is applied:
    
        * 0 or 'index': apply function to each column.
        * 1 or 'columns': apply function to each row.
    broadcast : bool, optional
        Only relevant for aggregation functions:
    
        * ``False`` or ``None`` : returns a Se

The function calculates the monthly debt.
If the income is zero, we take the monthly debt from the debt ratio and set the debt ratio to 0.

In [125]:
def monthlyDebtCalc(row):
    """Compute the monthly debt for one row and return the updated row.

    When ``MonthlyIncome`` is 0, the value stored in ``DebtRatio`` is used as
    the monthly debt and ``DebtRatio`` is reset to 0. Otherwise the monthly
    debt is ``DebtRatio * MonthlyIncome`` and ``DebtRatio`` is kept.

    Parameters
    ----------
    row : Series or dict
        Must provide the keys 'MonthlyIncome' and 'DebtRatio'.

    Returns
    -------
    Series or dict
        A copy of ``row`` with 'MonthlyDebt' set (and 'DebtRatio' zeroed in the
        zero-income case). The argument itself is not modified, because
        mutating the object handed over by ``DataFrame.apply`` is not supported
        by pandas.
    """
    result = row.copy()
    if result['MonthlyIncome'] == 0:
        # No income: the debt ratio column holds the debt itself.
        result['MonthlyDebt'] = result['DebtRatio']
        result['DebtRatio'] = 0
    else:
        result['MonthlyDebt'] = result['DebtRatio'] * result['MonthlyIncome']
    return result
    

We apply the previous formula for each row of the data frame. This process might take some time

In [126]:
# Whole-column equivalent of applying monthlyDebtCalc to every row. It gives the
# same values without the slow Python-level loop of DataFrame.apply(axis=1).
for df in data_sets:
    no_income = df['MonthlyIncome'] == 0
    # Zero income: DebtRatio holds the debt itself; otherwise debt = ratio * income.
    # MonthlyDebt must be computed before DebtRatio is zeroed on the next line.
    df['MonthlyDebt'] = df['DebtRatio'].where(no_income, df['DebtRatio'] * df['MonthlyIncome'])
    df['DebtRatio'] = df['DebtRatio'].mask(no_income, 0)

data_sets[0][['MonthlyDebt','DebtRatio']].head(10)

Unnamed: 0,MonthlyDebt,DebtRatio
145326,1394.840824,0.159174
10843,574.80315,0.19685
130905,288.0,0.0
107083,2234.593708,0.40629
41539,9534.232884,4.767116
41088,2463.57837,0.421629
119784,2555.633177,0.36682
4481,1672.581856,0.418145
29207,6.0,0.0
11776,4546.393815,0.606186


## Creating a balanced Income feature

In [127]:
# Add a placeholder BalancedIncome column at position 6; it is filled in the next cell.
for df in data_sets:
    # insert() raises ValueError when the column already exists, so the guard
    # keeps this cell re-runnable. The placeholder is a float because the
    # computed values are floats.
    if 'BalancedIncome' not in df.columns:
        df.insert(6, 'BalancedIncome', 0.0)

In [128]:
# Balanced income = monthly income minus monthly debt, for every data set.
for frame in data_sets:
    income_after_debt = frame['MonthlyIncome'] - frame['MonthlyDebt']
    frame.loc[:, 'BalancedIncome'] = income_after_debt
data_sets[0].head(20)

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,MonthlyDebt,BalancedIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,SeriousDlqin2yrs
145326,0.0,62,1,0.159174,8763.0,1394.840824,7368.159176,7,0,1,0,2.0,0
10843,0.029656,77,0,0.19685,2920.0,574.80315,2345.19685,13,0,0,0,0.0,0
130905,1.0,56,1,0.0,0.0,288.0,-288.0,1,3,1,0,0.0,1
107083,0.335396,42,2,0.40629,5500.0,2234.593708,3265.406292,9,0,1,0,4.0,0
41539,0.13814,36,0,4.767116,2000.0,9534.232884,-7534.232884,16,0,8,0,2.0,0
41088,0.316938,34,0,0.421629,5843.0,2463.57837,3379.42163,19,0,2,0,0.0,0
119784,0.843559,59,0,0.36682,6967.0,2555.633177,4411.366823,10,0,1,1,1.0,0
4481,0.21502,61,0,0.418145,4000.0,1672.581856,2327.418144,16,0,1,0,0.0,0
29207,0.0,67,0,0.0,0.0,6.0,-6.0,10,0,0,0,0.0,0
11776,0.385747,62,0,0.606186,7500.0,4546.393815,2953.606185,16,0,1,0,3.0,0


In [129]:
# Write both data sets to CSV, labelling the index column 'Id'.
# NOTE(review): this export runs before the NumberOfDependents cleaning and the
# IncomePerHouseholdMember feature below, so the files do not contain those
# changes. Confirm that this is intended.
for frame, csv_path in zip(data_sets, ('train.csv', 'test-eval.csv')):
    frame.to_csv(csv_path, index_label='Id')
print("Done")

Done


## Cleaning the number of dependents column
Set the number of dependents to 0 when it is not available.

In [130]:
# A missing number of dependents is treated as 0.
# The filled column is assigned back instead of calling fillna(inplace=True) on a
# .loc selection: that selection may be a temporary copy, in which case the
# in-place fill never reaches the frame.
for df in data_sets:
    df['NumberOfDependents'] = df['NumberOfDependents'].fillna(0)

data_sets[0].head(10)

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,MonthlyDebt,BalancedIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,SeriousDlqin2yrs
145326,0.0,62,1,0.159174,8763.0,1394.840824,7368.159176,7,0,1,0,2.0,0
10843,0.029656,77,0,0.19685,2920.0,574.80315,2345.19685,13,0,0,0,0.0,0
130905,1.0,56,1,0.0,0.0,288.0,-288.0,1,3,1,0,0.0,1
107083,0.335396,42,2,0.40629,5500.0,2234.593708,3265.406292,9,0,1,0,4.0,0
41539,0.13814,36,0,4.767116,2000.0,9534.232884,-7534.232884,16,0,8,0,2.0,0
41088,0.316938,34,0,0.421629,5843.0,2463.57837,3379.42163,19,0,2,0,0.0,0
119784,0.843559,59,0,0.36682,6967.0,2555.633177,4411.366823,10,0,1,1,1.0,0
4481,0.21502,61,0,0.418145,4000.0,1672.581856,2327.418144,16,0,1,0,0.0,0
29207,0.0,67,0,0.0,0.0,6.0,-6.0,10,0,0,0,0.0,0
11776,0.385747,62,0,0.606186,7500.0,4546.393815,2953.606185,16,0,1,0,3.0,0


## Add a balanced income per household member feature

In [131]:
# Add a placeholder IncomePerHouseholdMember column at position 7; it is filled in the next cell.
for df in data_sets:
    # insert() raises ValueError when the column already exists, so the guard
    # keeps this cell re-runnable. The placeholder is a float because the
    # computed values are floats.
    if 'IncomePerHouseholdMember' not in df.columns:
        df.insert(7, 'IncomePerHouseholdMember', 0.0)

In [132]:
# Spread the balanced income over the household: the dependents plus the borrower (+1).
for frame in data_sets:
    household_size = frame['NumberOfDependents'] + 1
    frame.loc[:, 'IncomePerHouseholdMember'] = frame['BalancedIncome'] / household_size
data_sets[0].head(20)

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,MonthlyDebt,BalancedIncome,IncomePerHouseholdMember,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,SeriousDlqin2yrs
145326,0.0,62,1,0.159174,8763.0,1394.840824,7368.159176,2456.053059,7,0,1,0,2.0,0
10843,0.029656,77,0,0.19685,2920.0,574.80315,2345.19685,2345.19685,13,0,0,0,0.0,0
130905,1.0,56,1,0.0,0.0,288.0,-288.0,-288.0,1,3,1,0,0.0,1
107083,0.335396,42,2,0.40629,5500.0,2234.593708,3265.406292,653.081258,9,0,1,0,4.0,0
41539,0.13814,36,0,4.767116,2000.0,9534.232884,-7534.232884,-2511.410961,16,0,8,0,2.0,0
41088,0.316938,34,0,0.421629,5843.0,2463.57837,3379.42163,3379.42163,19,0,2,0,0.0,0
119784,0.843559,59,0,0.36682,6967.0,2555.633177,4411.366823,2205.683411,10,0,1,1,1.0,0
4481,0.21502,61,0,0.418145,4000.0,1672.581856,2327.418144,2327.418144,16,0,1,0,0.0,0
29207,0.0,67,0,0.0,0.0,6.0,-6.0,-6.0,10,0,0,0,0.0,0
11776,0.385747,62,0,0.606186,7500.0,4546.393815,2953.606185,738.401546,16,0,1,0,3.0,0


## Send the data to BigML and create an ensemble model

In [133]:
#api = BigML(project='project/5d94a428eba31d460c00023f')

#train_src = api.create_source('train.csv')
#api.ok(train_src)
#test_src = api.create_source('test-eval.csv')
#api.ok(test_src)

#print("Sources created")

#train_ds = api.create_dataset(train_src)
#api.ok(train_ds)
#test_ds = api.create_dataset(test_src)
#api.ok(test_ds)

#print("Data sets created")

#model = api.create_ensemble(train_ds)
#api.ok(model)
#print("Model created")

## Validation or test batch prediction

In [134]:
#if (validation == True):
#    evaluation = api.get_evaluation(model, test_ds)
#    api.ok(evaluation)
#    api.pprint(evaluation['object']['result'])
#else:
#    batch_prediction = api.create_batch_prediction(model, test_ds, {
#        "name": "my batch prediction",
##        "all_fields": True,
##        "header": True,
#        "confidence": True})