In [1]:
# fire up the packages
import pandas as pd
import numpy as np

In [2]:
# read the loan CSV into a DataFrame; row 0 of the file supplies the column names
df = pd.read_csv(filepath_or_buffer="defaulter-dataset.csv", header=0)
df.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
# use the Loan_ID column as the row index
df = df.set_index(keys="Loan_ID")
df.head(n=5)

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
# show the dtype of every column; later cells use it to separate
# object (string) columns from numeric ones
df.dtypes

Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [5]:
# all column names, in frame order
features = list(df.columns)

# split the columns in one pass: object-dtype columns are treated as
# categorical features, every other dtype as numeric
categorical_features = []
numeric_features = []
for column_name in features:
    if df[column_name].dtype == "object":
        categorical_features.append(column_name)
    else:
        numeric_features.append(column_name)

In [6]:
# count the missing values in each numeric feature
df.loc[:, numeric_features].isna().sum()

ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
dtype: int64

In [7]:
# fill in the missing or null values for numeric features with the column mean
# (Series.mean skips NaN, so the mean is taken over the observed values only).
# The filled column is assigned back to the frame rather than using
# df[x].fillna(..., inplace=True): that chained in-place call operates on an
# intermediate Series, which pandas warns about and which leaves df unchanged
# when Copy-on-Write is enabled.
for x in numeric_features:
    df[x] = df[x].fillna(df[x].mean())

In [8]:
# count the missing values in each categorical feature
df.loc[:, categorical_features].isna().sum()

Gender           13
Married           3
Dependents       15
Education         0
Self_Employed    32
Property_Area     0
Loan_Status       0
dtype: int64

In [9]:
# fill in the missing or null values for categorical features with the most
# frequent label (value_counts ignores NaN, so idxmax is the modal label).
# The filled column is assigned back to the frame rather than using
# df[x].fillna(..., inplace=True): that chained in-place call operates on an
# intermediate Series, which pandas warns about and which leaves df unchanged
# when Copy-on-Write is enabled.
for x in categorical_features:
    most_frequent_label = df[x].value_counts().idxmax()
    df[x] = df[x].fillna(most_frequent_label)

In [10]:
# confirm the imputation worked: every column should now report zero nulls
df.isna().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [11]:
# preview the first rows after imputation
df.head(n=5)

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,Y
LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [12]:
# let us perform min-max normalization on the numeric features:
#     scaled = (value - column_min) / (column_max - column_min)
# The column bounds are computed once per feature and applied with vectorised
# arithmetic, instead of re-scanning the whole column for min and max inside a
# per-element lambda (which made the cell quadratic in the number of rows).
# NOTE: a constant column (max == min) divides by zero and yields NaN, exactly
# as the per-element version did.
for x in numeric_features:
    column_min = df[x].min()
    column_max = df[x].max()
    df[x] = (df[x] - column_min) / (column_max - column_min)

In [13]:
# preview the first rows after min-max normalization
df.head(n=5)

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,Male,No,0,Graduate,No,0.070489,0.0,0.19886,0.74359,1.0,Urban,Y
LP001003,Male,Yes,1,Graduate,No,0.05483,0.036192,0.172214,0.74359,1.0,Rural,N
LP001005,Male,Yes,0,Graduate,Yes,0.03525,0.0,0.082489,0.74359,1.0,Urban,Y
LP001006,Male,Yes,0,Not Graduate,No,0.030093,0.056592,0.160637,0.74359,1.0,Urban,Y
LP001008,Male,No,0,Graduate,No,0.072356,0.0,0.191027,0.74359,1.0,Urban,Y


In [14]:
# let us encode string labels to numeric values
# For every categorical feature this cell:
#   1. converts the column to pandas' category dtype,
#   2. adds a "<feature>Code" column holding the integer category codes,
#   3. writes the "label->code" pairs to "<feature>_<feature>Code_Mapping.txt"
#      so the encoding can be looked up later.
for x in categorical_features:
    # convert the feature to categorical
    df[x] = df[x].astype("category")
    # set new feature name
    new_feature = x + "Code"
    # encode the labels to numeric values
    df[new_feature] = df[x].cat.codes

    # create mapping: one entry per distinct (label, code) pair.
    # drop_duplicates keeps the first occurrence of each pair, so entries stay
    # in order of first appearance in the data without a row-by-row scan.
    label_code_pairs = df[[x, new_feature]].drop_duplicates()
    mappings = [
        str(label) + "->" + str(code)
        for label, code in zip(label_code_pairs[x], label_code_pairs[new_feature])
    ]

    # save the mapping for further use; the context manager closes the file
    # even if a write fails part-way through
    with open(x+"_"+new_feature+"_Mapping.txt", mode="w") as file:
        file.writelines(m + "\n" for m in mappings)

In [15]:
# preview the first rows with the new *Code columns appended
df.head(n=5)

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,GenderCode,MarriedCode,DependentsCode,EducationCode,Self_EmployedCode,Property_AreaCode,Loan_StatusCode
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
LP001002,Male,No,0,Graduate,No,0.070489,0.0,0.19886,0.74359,1.0,Urban,Y,1,0,0,0,0,2,1
LP001003,Male,Yes,1,Graduate,No,0.05483,0.036192,0.172214,0.74359,1.0,Rural,N,1,1,1,0,0,0,0
LP001005,Male,Yes,0,Graduate,Yes,0.03525,0.0,0.082489,0.74359,1.0,Urban,Y,1,1,0,0,1,2,1
LP001006,Male,Yes,0,Not Graduate,No,0.030093,0.056592,0.160637,0.74359,1.0,Urban,Y,1,1,0,1,0,2,1
LP001008,Male,No,0,Graduate,No,0.072356,0.0,0.191027,0.74359,1.0,Urban,Y,1,0,0,0,0,2,1


In [16]:
# the numeric *Code columns now carry the categorical information, so drop
# the original string-labelled columns from the frame in one call
df.drop(columns=categorical_features, inplace=True)
df.head(n=5)

Unnamed: 0_level_0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,GenderCode,MarriedCode,DependentsCode,EducationCode,Self_EmployedCode,Property_AreaCode,Loan_StatusCode
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,0.070489,0.0,0.19886,0.74359,1.0,1,0,0,0,0,2,1
LP001003,0.05483,0.036192,0.172214,0.74359,1.0,1,1,1,0,0,0,0
LP001005,0.03525,0.0,0.082489,0.74359,1.0,1,1,0,0,1,2,1
LP001006,0.030093,0.056592,0.160637,0.74359,1.0,1,1,0,1,0,2,1
LP001008,0.072356,0.0,0.191027,0.74359,1.0,1,0,0,0,0,2,1


In [17]:
# write the cleaned, normalized and encoded frame (Loan_ID index included)
# to disk for the modelling step
df.to_csv(path_or_buf="train.csv", index=True)