In [1]:
# fire up the packages
import pandas as pd
import numpy as np

In [2]:
# Read the raw loan dataset from the repo and key every row by its Loan_ID
raw_csv_path = "../datasets/loan-defaulter-dataset.csv"
df = pd.read_csv(raw_csv_path, header=0).set_index("Loan_ID")
df.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
# List every column (feature) name together with its dtype
features = df.columns

print("Column_Name\tData_Type")
for column_name, column_dtype in df.dtypes.items():
    print(column_name, "------>", column_dtype)

Column_Name	Data_Type
Gender ------> object
Married ------> object
Dependents ------> object
Education ------> object
Self_Employed ------> object
ApplicantIncome ------> int64
CoapplicantIncome ------> float64
LoanAmount ------> float64
Loan_Amount_Term ------> float64
Credit_History ------> float64
Property_Area ------> object
Loan_Status ------> object


In [4]:
# Split the columns by dtype: object columns are treated as categorical,
# every other column as continuous
column_dtypes = df.dtypes
is_object_column = column_dtypes == "object"

# index of the categorical feature names
catagorical_features = column_dtypes[is_object_column].index
print("Catagorical Feature List =", catagorical_features)

# index of the continuous feature names
continuous_features = column_dtypes[~is_object_column].index
print("Continuous Feature List =", continuous_features)

Catagorical Feature List = Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'Property_Area', 'Loan_Status'],
      dtype='object')
Continuous Feature List = Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History'],
      dtype='object')


In [5]:
# Alternative: build the same split as plain Python lists of column names
catagorical_features1 = [col for col in features if df[col].dtype == "object"]
continuous_features1 = [col for col in features if df[col].dtype != "object"]

print("Catagorical Feature List =", catagorical_features1)
print("Continuous Feature List =", continuous_features1)

Catagorical Feature List = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']
Continuous Feature List = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']


In [6]:
# Number of distinct values per categorical feature; dropna=False counts a
# missing value (NaN) as a distinct value of its own
df[catagorical_features].nunique(dropna=False)

Gender           3
Married          3
Dependents       5
Education        2
Self_Employed    3
Property_Area    3
Loan_Status      2
dtype: int64

In [7]:
# Frequency of each distinct Gender value (value_counts skips missing entries)
gender_column = df["Gender"]
gender_column.value_counts()

Male      489
Female    112
Name: Gender, dtype: int64

In [8]:
# Print the frequency table of every categorical feature, one after another
for column_name in catagorical_features:
    value_frequencies = df[column_name].value_counts()
    print(value_frequencies)

Male      489
Female    112
Name: Gender, dtype: int64
Yes    398
No     213
Name: Married, dtype: int64
0     345
1     102
2     101
3+     51
Name: Dependents, dtype: int64
Graduate        480
Not Graduate    134
Name: Education, dtype: int64
No     500
Yes     82
Name: Self_Employed, dtype: int64
Semiurban    233
Urban        202
Rural        179
Name: Property_Area, dtype: int64
Y    422
N    192
Name: Loan_Status, dtype: int64


In [9]:
# Share of each distinct value as a percentage of all rows in the dataset
# (rows where the value is missing still count towards the denominator)
row_count = len(df)

for column_name in ["Gender", "Education"]:
    value_frequencies = df[column_name].value_counts()
    print(100 * value_frequencies / row_count)

Male      79.641694
Female    18.241042
Name: Gender, dtype: float64
Graduate        78.175896
Not Graduate    21.824104
Name: Education, dtype: float64


In [10]:
# Count the missing (null) values in every column of the dataframe
df.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [11]:
# Show the most frequent value of a couple of categorical columns
for column_name in ("Gender", "Education"):
    most_common_value = df[column_name].value_counts().idxmax()
    print(most_common_value)

Male
Graduate


In [12]:
# Fill missing values in the categorical features with each column's most
# frequent value. The filled Series is assigned back to the column instead of
# calling `df[col].fillna(..., inplace=True)`: that form mutates an
# intermediate Series (chained assignment), emits a warning in pandas 2.x and
# leaves `df` unchanged once Copy-on-Write is enabled.
for column_name in ["Gender", "Married", "Self_Employed", "Dependents"]:
    most_frequent_value = df[column_name].value_counts().idxmax()
    df[column_name] = df[column_name].fillna(most_frequent_value)

# Remaining null count per column; the four columns above should now show 0
print(df.isnull().sum())

Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64


In [13]:
# Mean loan amount; Series.mean ignores missing values by default
df["LoanAmount"].mean()

146.41216216216216

In [14]:
# Fill missing values in the continuous features with each column's mean
# (the mean is computed over the non-missing entries). The filled Series is
# assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that form mutates an intermediate
# Series (chained assignment), emits a warning in pandas 2.x and leaves `df`
# unchanged once Copy-on-Write is enabled.
# NOTE(review): the earlier head() output shows only 1.0 for Credit_History;
# if it is a 0/1 flag, mean imputation creates fractional values - confirm
# that this is intended.
for column_name in ["LoanAmount", "Loan_Amount_Term", "Credit_History"]:
    column_mean = df[column_name].mean()
    df[column_name] = df[column_name].fillna(column_mean)

# Remaining null count per column; every column should now show 0
print(df.isnull().sum())

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64


In [15]:
# Preview the first five rows of the dataframe
df.head(n=5)

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,Y
LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [16]:
# label transformation for encoding the categorical values into numeric form
from sklearn.preprocessing import LabelEncoder

In [17]:
# Encode every categorical feature as integer labels. A separate LabelEncoder
# is fitted per column and kept in `label_encoders`, so the integer codes can
# later be mapped back with `label_encoders[col].inverse_transform(...)`.
# Refitting one shared encoder would retain only the last column's mapping.
label_encoders = {}

# apply a fresh encoder to each of the categorical features
for x in catagorical_features:
    # `labelEncode` ends up bound to the last column's fitted encoder, as before
    labelEncode = LabelEncoder()
    df[x] = labelEncode.fit_transform(df[x])
    label_encoders[x] = labelEncode

df.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,1,0,0,0,0,5849,0.0,146.412162,360.0,1.0,2,1
LP001003,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,0
LP001005,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2,1
LP001006,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2,1
LP001008,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2,1


In [18]:
# Normalize the continuous features: divide each column by its maximum so the
# largest value in the column becomes 1.
# Series.max() is used instead of the builtin max(): it is vectorised and
# skips NaN, whereas builtin max() compares element by element in Python and
# gives an order-dependent result when a NaN is present.

for x in continuous_features:
    column_max = df[x].max()
    # A column whose maximum is 0 is left untouched; dividing by 0 would
    # produce NaN/inf values
    if column_max != 0:
        df[x] = df[x] / column_max

df.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,1,0,0,0,0,0.07221,0.0,0.20916,0.75,1.0,2,1
LP001003,1,1,1,0,0,0.05658,0.036192,0.182857,0.75,1.0,0,0
LP001005,1,1,0,0,1,0.037037,0.0,0.094286,0.75,1.0,2,1
LP001006,1,1,0,1,0,0.031889,0.056592,0.171429,0.75,1.0,2,1
LP001008,1,0,0,0,0,0.074074,0.0,0.201429,0.75,1.0,2,1


In [19]:
# Write the processed data to a CSV file for further analysis
cleaned_csv_path = "../datasets/cleaned-loan-defaulter-dataset.csv"
df.to_csv(cleaned_csv_path)