In [1]:
import numpy as np
import pandas as pd
from typing import List
import pickle


In [2]:
### Helper Functions
def load_data(path):
    """
    ====================================================================
    Read a CSV file into a DataFrame.

    Parameters
    ----------
    path : anything ``pd.read_csv`` accepts as its first argument
        (the notebook passes relative file paths).

    Returns
    -------
    The DataFrame produced by ``pd.read_csv`` with its default options.
    """
    return pd.read_csv(path)

def cal_outliers(value: str, df: pd.DataFrame) -> List:
    """
    ====================================================================
    Calculate the range of values that are not outliers.

    The range is the interquartile-range fence: anything below
    ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR`` is treated as an
    outlier by the callers.

    Parameters
    ----------
    value : name of the numeric column of ``df`` to inspect.
    df : DataFrame that contains that column.

    Returns
    -------
    ``[lower, upper]`` -- the two fence values.

    Missing values in the column are ignored when the quartiles are
    computed.
    """
    # nanpercentile skips NaNs. With plain np.percentile a single missing
    # value turns both bounds into NaN, and a caller filtering with
    # `lower < x < upper` would then silently drop every row.
    q1, q3 = np.nanpercentile(df[value], [25, 75])  # 1st and 3rd quartiles
    whisker = 1.5 * (q3 - q1)
    return [q1 - whisker, q3 + whisker]

def load_estimator(path: str = './model/estimator.pkl') -> 'estimator':
    """
    ====================================================================
    Load the trained model

    Parameters
    ----------
    path : location of the pickled object. Defaults to
        './model/estimator.pkl', the file this notebook has always read,
        so existing zero-argument calls behave as before.

    Returns
    -------
    The object stored in the pickle file. Cells in this notebook index
    the result with keys such as 'clf' and 'le_gender'.
    """
    # SECURITY: unpickling can execute arbitrary code. Only load files
    # that were produced by a trusted training run.
    with open(path, 'rb') as f:
        return pickle.load(f)

In [3]:
# Load the training CSV and preview its first rows.
my_data = load_data('./data/training.csv')
my_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [96]:
def clean_test_data(file: 'csv_file_path') -> 'cleaned_data':
    """
    ====================================================================
    1. Load the csv file and drop the 'Loan_ID' column.
    2. Impute missing values (median for the numeric columns listed
       below, most frequent value for the categorical ones).
    3. Drop rows whose income / loan amount fall outside the range
       returned by ``cal_outliers``.
    4. Label-encode every remaining object column.
    5. Return the cleaned_data (a DataFrame; rows keep their original
       index labels, so the index has gaps where rows were dropped).

    NOTE(review): the encoders are fitted on the file being cleaned,
    not on the training data. The pickled estimator appears to carry
    the training encoders ('le_gender', 'le_married', ...); confirm
    whether those should be applied here instead.
    'Education' is label-encoded but never imputed.
    """
    from sklearn.preprocessing import LabelEncoder

    # load the data; the identifier is not a model feature
    data = load_data(file).drop(columns=['Loan_ID'])

    # impute with the median value
    # ('Credit_History' used to be listed in the mode pass as well; after
    # this pass it has no gaps left, so that second pass was a no-op)
    for col in ['Loan_Amount_Term', 'LoanAmount', 'Credit_History']:
        data[col] = data[col].fillna(data[col].median())

    # impute the features with the highest occurring value
    for col in ['Self_Employed', 'Dependents', 'Gender', 'Married', 'Property_Area']:
        data[col] = data[col].fillna(data[col].mode().values[0])

    # filter out outliers, one column at a time (each pass sees the rows
    # that survived the previous one)
    for col in ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']:
        lower, upper = cal_outliers(col, data)
        data = data.loc[(data[col] > lower) & (data[col] < upper)]

    # work on an independent frame so the column assignments below do not
    # write into a slice of the unfiltered data (SettingWithCopyWarning)
    data = data.copy()

    # encode every categorical column with its own freshly fitted encoder;
    # iterating the columns directly means none is skipped, whereas zipping
    # against a fixed list of six encoders silently ignored any extras
    cat_cols = data.select_dtypes(include='object').columns.to_list()
    for col in cat_cols:
        data[col] = LabelEncoder().fit_transform(data[col])

    return data

In [5]:
# Run the cleaning pipeline on the test CSV.
test_data = clean_test_data('./data/test.csv')

# # save as json
# data_json = test_data.to_json('data.json', orient='records', indent=2)

# Preview the cleaned frame.
test_data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,0,0,0,5720,0,110.0,360.0,1.0,2
1,1,1,1,0,0,3076,1500,126.0,360.0,1.0,2
2,1,1,2,0,0,5000,1800,208.0,360.0,1.0,2
3,1,1,2,0,0,2340,2546,100.0,360.0,1.0,2
4,1,0,0,1,0,3276,0,78.0,360.0,1.0,2


In [6]:
# load estimator
estimator = load_estimator()
# 'clf' is the entry used for prediction in the cells below.
# NOTE(review): assuming `estimator` is a dict, `.get` yields None
# instead of raising when the key is missing.
clf = estimator.get('clf')

In [7]:
# convert dataframe to numpy array
# (taken before any 'Predictions' column is added to test_data)
X = test_data.to_numpy()
X

array([[  1.,   1.,   0., ..., 360.,   1.,   2.],
       [  1.,   1.,   1., ..., 360.,   1.,   2.],
       [  1.,   1.,   2., ..., 360.,   1.,   2.],
       ...,
       [  1.,   1.,   0., ..., 360.,   1.,   2.],
       [  1.,   0.,   0., ..., 360.,   1.,   1.],
       [  1.,   1.,   0., ..., 360.,   1.,   0.]])

In [52]:
# make predictions
# (`clf` and `X` come from the earlier cells)
pred = clf.predict(X)
# add predictions to dataframe
test_data['Predictions'] = pred
# reset_index() keeps the previous row labels as a new 'index' column
test_data = test_data.reset_index()
test_data.head(8)

Unnamed: 0,index,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Predictions
0,0,1,1,0,0,0,5720,0,110.0,360.0,1.0,2,1
1,1,1,1,1,0,0,3076,1500,126.0,360.0,1.0,2,1
2,2,1,1,2,0,0,5000,1800,208.0,360.0,1.0,2,1
3,3,1,1,2,0,0,2340,2546,100.0,360.0,1.0,2,1
4,4,1,0,0,1,0,3276,0,78.0,360.0,1.0,2,1
5,5,1,1,0,1,1,2165,3422,152.0,360.0,1.0,2,1
6,6,0,0,1,1,0,2226,0,59.0,360.0,1.0,1,1
7,7,1,1,2,1,0,3881,0,147.0,360.0,0.0,0,0


In [22]:
# (sum of the 'Predictions' column, number of rows); with 0/1 labels the
# sum is the count of rows predicted as 1 -- TODO confirm label coding
(test_data['Predictions'].sum(), len(test_data))

(272, 323)

In [98]:
import json
def save_df_as_json(data: pd.DataFrame, path: 'file path') ->'json_object':
    """
    ====================================================================
    Serialise a dataframe to JSON at ``path``.

    Parameters
    ----------
    data : the DataFrame to serialise.
    path : forwarded to ``DataFrame.to_json`` as the output target.

    Returns
    -------
    Whatever ``DataFrame.to_json`` returns; when it is given a target to
    write to, that is ``None`` (the JSON goes to the file).
    """
    # one list entry per row, pretty-printed with a 2-space indent
    json_options = {'orient': 'records', 'indent': 2}
    outcome = data.to_json(path, **json_options)
    return outcome

def load_json_data(path: 'file path') -> 'list | dict':
    """
    ====================================================================
    Load json object.

    Parameters
    ----------
    path : path of the JSON file to read.

    Returns
    -------
    The parsed content: a list for files written with
    ``orient='records'`` (as this notebook does), a dict when the
    top-level JSON value is an object.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [25]:
# NOTE(review): the only visible code that writes 'data.json' is commented
# out in an earlier cell, so on a fresh run this file must already exist.
json_data = load_json_data('data.json')

In [27]:
# json_data

In [56]:
# Clean data
test_data = clean_test_data('./data/test.csv')
# convert dataframe to numpy array
X = test_data.to_numpy()

# make predictions
# NOTE(review): `clf` is not assigned in this cell; it relies on an earlier cell.
pred = clf.predict(X)
test_data['Predictions'] = pred
# reset_index() keeps the previous row labels as a new 'index' column
test_data = test_data.reset_index()
    
path= 'pred_data.json'   # path
# save data as json
# (this first value of json_data is replaced by the assignment just below)
json_data = save_df_as_json(test_data, path)
# load the json file
json_data = load_json_data(path)
json_data

[{'index': 0,
  'Gender': 1,
  'Married': 1,
  'Dependents': 0,
  'Education': 0,
  'Self_Employed': 0,
  'ApplicantIncome': 5720,
  'CoapplicantIncome': 0,
  'LoanAmount': 110.0,
  'Loan_Amount_Term': 360.0,
  'Credit_History': 1.0,
  'Property_Area': 2,
  'Predictions': 1},
 {'index': 1,
  'Gender': 1,
  'Married': 1,
  'Dependents': 1,
  'Education': 0,
  'Self_Employed': 0,
  'ApplicantIncome': 3076,
  'CoapplicantIncome': 1500,
  'LoanAmount': 126.0,
  'Loan_Amount_Term': 360.0,
  'Credit_History': 1.0,
  'Property_Area': 2,
  'Predictions': 1},
 {'index': 2,
  'Gender': 1,
  'Married': 1,
  'Dependents': 2,
  'Education': 0,
  'Self_Employed': 0,
  'ApplicantIncome': 5000,
  'CoapplicantIncome': 1800,
  'LoanAmount': 208.0,
  'Loan_Amount_Term': 360.0,
  'Credit_History': 1.0,
  'Property_Area': 2,
  'Predictions': 1},
 {'index': 3,
  'Gender': 1,
  'Married': 1,
  'Dependents': 2,
  'Education': 0,
  'Self_Employed': 0,
  'ApplicantIncome': 2340,
  'CoapplicantIncome': 2546,
  '

In [40]:
# Display the current test_data frame (depends on whichever cell last assigned it).
test_data

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Predictions
0,1,1,0,0,0,5720,0,110.0,360.0,1.0,2,1
1,1,1,1,0,0,3076,1500,126.0,360.0,1.0,2,1
2,1,1,2,0,0,5000,1800,208.0,360.0,1.0,2,1
3,1,1,2,0,0,2340,2546,100.0,360.0,1.0,2,1
4,1,0,0,1,0,3276,0,78.0,360.0,1.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
361,1,1,1,0,0,2269,2167,99.0,360.0,1.0,1,1
362,1,1,3,1,1,4009,1777,113.0,360.0,1.0,2,1
363,1,1,0,0,0,4158,709,115.0,360.0,1.0,2,1
364,1,0,0,0,0,3250,1993,126.0,360.0,1.0,1,1


In [73]:
# load data
test_data = clean_test_data('./data/test.csv')
# convert dataframe to numpy array
X = test_data.to_numpy()
# load the estimator
estimator = load_estimator()
clf = estimator['clf']
# make predictions
pred = clf.predict(X)

test_data['Predictions'] = pred
# add the IDs
# NOTE(review): get_IDs is defined in a later cell, so on a fresh
# top-to-bottom run this cell raises NameError until that cell has run.
# The name `loan_ids` is used instead of `id` to avoid shadowing the builtin.
loan_ids = get_IDs('./data/test.csv')
# One row per surviving loan: the ID Series becomes the 'Loan_ID' column
# (keeping its index labels) and the predictions are attached by position.
# This relies on get_IDs and clean_test_data dropping exactly the same rows.
final_df = loan_ids.to_frame()
final_df['Predictions'] = pred

final_df

Unnamed: 0,Loan_ID,Predictions
0,LP001015,1
1,LP001022,1
2,LP001031,1
3,LP001035,1
4,LP001051,1
...,...,...
84,LP001450,0
86,LP001455,1
87,LP001466,1
88,LP001471,1


In [101]:
# load data
test_data = clean_test_data('./data/test.csv')
# convert dataframe to numpy array
X = test_data.to_numpy()
# load the estimator
estimator = load_estimator()
clf = estimator['clf']
# make predictions
pred = clf.predict(X)
test_data['Predictions'] = pred
# add the IDs
# (named `loan_ids` rather than `id` so the builtin is not shadowed)
loan_ids = get_IDs('./data/test.csv')
# The ID Series becomes the 'Loan_ID' column and the predictions are
# attached by position; this relies on get_IDs and clean_test_data
# dropping exactly the same rows.
final_df = loan_ids.to_frame()
final_df['Predictions'] = pred

path= 'pred_data.json'   # path
# save data as json (the helper writes the file; its return value is not needed)
save_df_as_json(final_df, path)
# load the json file
json_data = load_json_data(path)
json_data

[{'Loan_ID': 'LP001015', 'Predictions': 1},
 {'Loan_ID': 'LP001022', 'Predictions': 1},
 {'Loan_ID': 'LP001031', 'Predictions': 1},
 {'Loan_ID': 'LP001035', 'Predictions': 1},
 {'Loan_ID': 'LP001051', 'Predictions': 1},
 {'Loan_ID': 'LP001054', 'Predictions': 1},
 {'Loan_ID': 'LP001055', 'Predictions': 1},
 {'Loan_ID': 'LP001056', 'Predictions': 0},
 {'Loan_ID': 'LP001067', 'Predictions': 1},
 {'Loan_ID': 'LP001078', 'Predictions': 1},
 {'Loan_ID': 'LP001082', 'Predictions': 1},
 {'Loan_ID': 'LP001083', 'Predictions': 1},
 {'Loan_ID': 'LP001096', 'Predictions': 1},
 {'Loan_ID': 'LP001099', 'Predictions': 1},
 {'Loan_ID': 'LP001105', 'Predictions': 1},
 {'Loan_ID': 'LP001107', 'Predictions': 1},
 {'Loan_ID': 'LP001115', 'Predictions': 1},
 {'Loan_ID': 'LP001121', 'Predictions': 1},
 {'Loan_ID': 'LP001124', 'Predictions': 1},
 {'Loan_ID': 'LP001128', 'Predictions': 1},
 {'Loan_ID': 'LP001135', 'Predictions': 1},
 {'Loan_ID': 'LP001163', 'Predictions': 1},
 {'Loan_ID': 'LP001174', 'Predic

In [59]:
# load data
test_data = clean_test_data('./data/test.csv')
# load the estimator
estimator = load_estimator()
clf = estimator['clf']
le_gender = estimator['le_gender']
le_married = estimator['le_married']
le_dep = estimator['le_dep']
le_edu = estimator['le_edu']
le_self_emp = estimator['le_self_emp']
le_pr_ar = estimator['le_pr_ar']

encoders = [le_gender, le_married, le_dep, le_edu, le_self_emp, le_pr_ar]

# Same order as `encoders`. 'Loan_ID' must not be listed: clean_test_data
# drops that column (so selecting it raises KeyError) and its presence
# shifted every encoder onto the wrong column, leaving 'Property_Area'
# untouched.
cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']
# NOTE(review): decoding with the stored encoders is only meaningful if
# they map labels the same way as the encoders fitted inside
# clean_test_data -- confirm.
for col, enc in zip(cols, encoders):
    test_data[col] = enc.inverse_transform(test_data[col])

In [60]:
# Scratch cell: evaluates a literal list of column names; nothing is assigned.
['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']

[]

In [None]:
# NOTE(review): scratch cell with no execution count. It replaces final_df
# with an empty DataFrame and then selects a column that frame does not
# have, which raises KeyError.
final_df = pd.DataFrame()
final_df['Loan_ID']

In [100]:
def get_IDs(file: 'csv_file_path') -> pd.Series:
    """
    ====================================================================
    1. Obtain the loan IDs

    Returns the 'Loan_ID' values of the rows that survive the outlier
    filter, as a Series that keeps the original row labels (so the index
    has gaps where rows were dropped).

    The filter reads only 'ApplicantIncome', 'CoapplicantIncome' and
    'LoanAmount', so 'LoanAmount' is the only column that has to be
    imputed first; imputing other columns cannot change which rows are
    kept.

    NOTE(review): this must stay in step with the imputation and outlier
    filtering done when the features are cleaned, otherwise the IDs stop
    lining up with the predictions.
    """
    # load the data
    data = load_data(file)

    # impute with the median value
    data['LoanAmount'] = data['LoanAmount'].fillna(data['LoanAmount'].median())

    # filter out outliers, one column at a time (each pass sees the rows
    # that survived the previous one)
    for col in ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']:
        lower, upper = cal_outliers(col, data)
        data = data.loc[(data[col] > lower) & (data[col] < upper)]

    return data['Loan_ID']

In [70]:
# Show the loan IDs of the rows kept after outlier filtering.
get_IDs('./data/test.csv')

0      LP001015
1      LP001022
2      LP001031
3      LP001035
4      LP001051
         ...   
361    LP002969
362    LP002971
363    LP002975
364    LP002980
365    LP002986
Name: Loan_ID, Length: 323, dtype: object

In [95]:
# NOTE(review): clean_test_data as defined in this notebook accepts a single
# argument, so this two-argument call raises TypeError on a fresh run; it
# looks like a leftover from an earlier version of the function.
clean_test_data('./data/test.csv', False)

Series([], Name: Loan_ID, dtype: object)