# Credit Risk — Optimizing Data Types & Dealing with Missing Values
### for CEBD1260: Introduction to Machine Learning
*by Pierre-Olivier Bonin*

# Importing Libraries and Loading the Datasets

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import linear_model
import statsmodels.formula.api as smf
import statsmodels.api as sm
import glob

DATA_PATH = 'D:\\Documents\\Pierre-Olivier\\CEBD1260\\Datasets\\'
all_files = glob.glob(DATA_PATH+'*.csv')
print(f"There are {len(all_files)} files to work with.") # print all files in the data directory

desc_df = pd.read_csv("D:/Documents/Pierre-Olivier/CEBD1260/Datasets/HomeCredit.csv", encoding="latin1")
desc_df = desc_df.drop("Unnamed: 0", axis=1)
maindf = pd.read_csv("D:/Documents/Pierre-Olivier/CEBD1260/Datasets/application_train.csv")
df2 = pd.read_csv("D:/Documents/Pierre-Olivier/CEBD1260/Datasets/bureau.csv")
df3 = pd.read_csv("D:/Documents/Pierre-Olivier/CEBD1260/Datasets/bureau_balance.csv")
df4 = pd.read_csv("D:/Documents/Pierre-Olivier/CEBD1260/Datasets/credit_card_balance.csv")
df5 =  pd.read_csv("D:/Documents/Pierre-Olivier/CEBD1260/Datasets/installments_payments.csv")
df6 = pd.read_csv("D:/Documents/Pierre-Olivier/CEBD1260/Datasets/POS_CASH_balance.csv")
df7 = pd.read_csv("D:/Documents/Pierre-Olivier/CEBD1260/Datasets/previous_application.csv")

dflist = [maindf, df2, df3, df4, df5, df6, df7]

desc_df['Table'].unique(), desc_df['Table'].nunique() # but we know that we actually have 7 datasets to work with.
print(f"There are {desc_df['Table'].nunique()} datasets to work with.")

There are 10 files to work with.


# Data Type Optimization

In [2]:
### INSTRUCTIONS ###
# To execute the present script, load the datasets to be cleaned and then put them into a list of dataframes.
# We will call the dataframe list 'dflist', such that you must run first the following:
# dflist = [] <-- here you insert your dataframes and separate with commas. Once you're done, run the script below.


# Print initial memory usage details
mem_sum = 0
for i in dflist:
    print('Memory usage of dataframe is {:.6f} GB'.format(i.memory_usage().sum()/1000000000))
    mem_sum = mem_sum+i.memory_usage().sum()/1000000000
print(f"Total memory used for all dataframes is: {mem_sum:.2f}GB")

# Construct dataframe for reference used below in the optimize_inttypes function.
np_types = [np.int8 ,np.int16 ,np.int32, np.int64,
           np.uint8 ,np.uint16, np.uint32, np.uint64]
np_types = [np_type.__name__ for np_type in np_types]
type_df = pd.DataFrame(data=np_types, columns=['class_type'])
type_df['min_value'] = type_df['class_type'].apply(lambda row: np.iinfo(row).min)
type_df['max_value'] = type_df['class_type'].apply(lambda row: np.iinfo(row).max)
type_df['range'] = type_df['max_value'] - type_df['min_value']
type_df.sort_values(by='range', inplace=True)

# Create function to optimize integer data types
def optimize_inttypes(dataframe):
    for col in dataframe.loc[:, dataframe.dtypes <= np.int64]:
        col_min = dataframe[col].min()
        col_max = dataframe[col].max()
        temp = type_df[(type_df['min_value'] <= col_min) & (type_df['max_value'] >= col_max)]
        optimized_class = temp.loc[temp['range'].idxmin(), 'class_type']
        print("Col name : {} Col min_value : {} Col max_value : {} Optimized Class : {}".format(col, col_min, col_max, optimized_class))
        dataframe[col] = dataframe[col].astype(optimized_class)

# Run integer optimization over all dataframes.
for i in dflist:
    optimize_inttypes(i)

# Print updated memory usage details
mem_sum = 0
for i in dflist:
    print('Memory usage of dataframe is {:.6f} GB'.format(i.memory_usage().sum()/1000000000))
    mem_sum = mem_sum+i.memory_usage().sum()/1000000000
print(f"Total memory used for all dataframes is: {mem_sum:.2f}GB")

####################################################################################################
####################################################################################################

# Create dataframe for reference, used in the float optimization function further below.
np_types = [np.float16 ,np.float32, np.float64]
np_types = [np_type.__name__ for np_type in np_types]
floattype_df = pd.DataFrame(data=np_types, columns=['class_type'])
floattype_df['min_value'] = floattype_df['class_type'].apply(lambda row: np.finfo(row).min)
floattype_df['max_value'] = floattype_df['class_type'].apply(lambda row: np.finfo(row).max)
floattype_df['range'] = floattype_df['max_value'] - floattype_df['min_value']
floattype_df.sort_values(by='range', inplace=True)

# Create float optimization function.
def optimize_floattypes(dataframe):
    for col in dataframe.loc[:, dataframe.dtypes == np.float64]:
        col_min = dataframe[col].min()
        col_max = dataframe[col].max()
        temp = floattype_df[(floattype_df['min_value'] <= col_min) & (floattype_df['max_value'] >= col_max)]
        optimized_class = temp.loc[temp['range'].idxmin(), 'class_type']
        print("Col name : {} Col min_value : {} Col max_value : {} Optimized Class : {}".format(col, col_min, col_max, optimized_class))
        dataframe[col] = dataframe[col].astype(optimized_class)

# Run float optimization over all dataframes.
for i in dflist:
    optimize_floattypes(i)

# Print updated memory usage details
mem_sum = 0
for i in dflist:
    print('Memory usage of dataframe is {:.6f} GB'.format(i.memory_usage().sum()/1000000000))
    mem_sum = mem_sum+i.memory_usage().sum()/1000000000
print(f"Total memory used for all dataframes is: {mem_sum:.2f}GB")

Memory usage of dataframe is 0.300131 GB
Memory usage of dataframe is 0.233434 GB
Memory usage of dataframe is 0.655198 GB
Memory usage of dataframe is 0.706618 GB
Memory usage of dataframe is 0.870746 GB
Memory usage of dataframe is 0.640087 GB
Memory usage of dataframe is 0.494383 GB
Total memory used for all dataframes is: 3.90GB
Col name : SK_ID_CURR Col min_value : 100002 Col max_value : 456255 Optimized Class : int32
Col name : TARGET Col min_value : 0 Col max_value : 1 Optimized Class : int8
Col name : CNT_CHILDREN Col min_value : 0 Col max_value : 19 Optimized Class : int8
Col name : DAYS_BIRTH Col min_value : -25229 Col max_value : -7489 Optimized Class : int16
Col name : DAYS_EMPLOYED Col min_value : -17912 Col max_value : 365243 Optimized Class : int32
Col name : DAYS_ID_PUBLISH Col min_value : -7197 Col max_value : 0 Optimized Class : int16
Col name : FLAG_MOBIL Col min_value : 0 Col max_value : 1 Optimized Class : int8
Col name : FLAG_EMP_PHONE Col min_value : 0 Col max_va

Col name : COMMONAREA_AVG Col min_value : 0.0 Col max_value : 1.0 Optimized Class : float16
Col name : ELEVATORS_AVG Col min_value : 0.0 Col max_value : 1.0 Optimized Class : float16
Col name : ENTRANCES_AVG Col min_value : 0.0 Col max_value : 1.0 Optimized Class : float16
Col name : FLOORSMAX_AVG Col min_value : 0.0 Col max_value : 1.0 Optimized Class : float16
Col name : FLOORSMIN_AVG Col min_value : 0.0 Col max_value : 1.0 Optimized Class : float16
Col name : LANDAREA_AVG Col min_value : 0.0 Col max_value : 1.0 Optimized Class : float16
Col name : LIVINGAPARTMENTS_AVG Col min_value : 0.0 Col max_value : 1.0 Optimized Class : float16
Col name : LIVINGAREA_AVG Col min_value : 0.0 Col max_value : 1.0 Optimized Class : float16
Col name : NONLIVINGAPARTMENTS_AVG Col min_value : 0.0 Col max_value : 1.0 Optimized Class : float16
Col name : NONLIVINGAREA_AVG Col min_value : 0.0 Col max_value : 1.0 Optimized Class : float16
Col name : APARTMENTS_MODE Col min_value : 0.0 Col max_value : 1.0 O

Col name : AMT_DOWN_PAYMENT Col min_value : -0.9 Col max_value : 3060045.0 Optimized Class : float32
Col name : AMT_GOODS_PRICE Col min_value : 0.0 Col max_value : 6905160.0 Optimized Class : float32
Col name : RATE_DOWN_PAYMENT Col min_value : -1.4978763414307848e-05 Col max_value : 1.0 Optimized Class : float16
Col name : RATE_INTEREST_PRIMARY Col min_value : 0.0347812535418791 Col max_value : 1.0 Optimized Class : float16
Col name : RATE_INTEREST_PRIVILEGED Col min_value : 0.3731501057082452 Col max_value : 1.0 Optimized Class : float16
Col name : CNT_PAYMENT Col min_value : 0.0 Col max_value : 84.0 Optimized Class : float16
Col name : DAYS_FIRST_DRAWING Col min_value : -2922.0 Col max_value : 365243.0 Optimized Class : float32
Col name : DAYS_FIRST_DUE Col min_value : -2892.0 Col max_value : 365243.0 Optimized Class : float32
Col name : DAYS_LAST_DUE_1ST_VERSION Col min_value : -2801.0 Col max_value : 365243.0 Optimized Class : float32
Col name : DAYS_LAST_DUE Col min_value : -2889

# Preprocessing Column Identifier

In [8]:
### INSTRUCTIONS ###
# To execute the present script, load the datasets to be cleaned and then put them into a list of dataframes.
# We will call the dataframe list 'dflist', such that you must run first the following:
# dflist = [] <-- here you insert your dataframes and separate with commas. Once you're done, run the script below.

preprocessing_cols = []
for df in dflist:
    non_objectCols = []
    for i,j in zip(df.dtypes.index, df.dtypes.values):
        if j!="object":
            non_objectCols.append(i)
    
    null_cols = []
    for i, j in zip(df.isnull().sum().index, df.isnull().sum().values):
        if j>0:
            null_cols.append(i)
    
    non_BooleanCols = []
    for i, j in zip(df.nunique().index, df.nunique().values):
        if j>2:
            non_BooleanCols.append(i)

    preprocessing_cols.append(list(set(non_objectCols).intersection(null_cols,non_BooleanCols)))



> think about packing this into a function (def)



In [9]:
preprocessing_cols

[['LIVINGAPARTMENTS_AVG',
  'CNT_FAM_MEMBERS',
  'YEARS_BUILD_MEDI',
  'AMT_REQ_CREDIT_BUREAU_DAY',
  'BASEMENTAREA_MODE',
  'FLOORSMAX_MEDI',
  'FLOORSMIN_MEDI',
  'FLOORSMAX_AVG',
  'COMMONAREA_MEDI',
  'YEARS_BEGINEXPLUATATION_AVG',
  'APARTMENTS_MEDI',
  'ELEVATORS_AVG',
  'NONLIVINGAREA_AVG',
  'LANDAREA_MODE',
  'AMT_ANNUITY',
  'FLOORSMAX_MODE',
  'NONLIVINGAPARTMENTS_MODE',
  'AMT_REQ_CREDIT_BUREAU_HOUR',
  'LIVINGAREA_AVG',
  'AMT_GOODS_PRICE',
  'NONLIVINGAREA_MEDI',
  'COMMONAREA_MODE',
  'DEF_60_CNT_SOCIAL_CIRCLE',
  'EXT_SOURCE_2',
  'FLOORSMIN_AVG',
  'AMT_REQ_CREDIT_BUREAU_WEEK',
  'ELEVATORS_MEDI',
  'AMT_REQ_CREDIT_BUREAU_YEAR',
  'APARTMENTS_AVG',
  'OBS_60_CNT_SOCIAL_CIRCLE',
  'LIVINGAREA_MEDI',
  'EXT_SOURCE_3',
  'TOTALAREA_MODE',
  'YEARS_BUILD_MODE',
  'LIVINGAPARTMENTS_MEDI',
  'LIVINGAREA_MODE',
  'EXT_SOURCE_1',
  'ENTRANCES_AVG',
  'ENTRANCES_MODE',
  'APARTMENTS_MODE',
  'AMT_REQ_CREDIT_BUREAU_QRT',
  'NONLIVINGAREA_MODE',
  'YEARS_BEGINEXPLUATATION_MEDI',


feature engineering as another function