In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from collections import Counter

In [37]:
# Source CSV and the number of leading rows to read from it.
# NOTE(review): the path is an absolute, machine-specific location.
DATA_PATH = '/home/student/Machine Learning/previous_application.csv'
ROW_LIMIT = 10000

loan_df = pd.read_csv(DATA_PATH, nrows=ROW_LIMIT)

In [38]:
# Preview the first five rows of the freshly loaded frame.
loan_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,FLAG_LAST_APPL_PER_CONTRACT,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIVILEGED,NAME_CASH_LOAN_PURPOSE,NAME_CONTRACT_STATUS,DAYS_DECISION,NAME_PAYMENT_TYPE,CODE_REJECT_REASON,NAME_TYPE_SUITE,NAME_CLIENT_TYPE,NAME_GOODS_CATEGORY,NAME_PORTFOLIO,NAME_PRODUCT_TYPE,CHANNEL_TYPE,SELLERPLACE_AREA,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,0,2030495,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,SATURDAY,15,Y,1,0.0,0.182832,0.867336,XAP,Approved,-73,Cash through the bank,XAP,,Repeater,Mobile,POS,XNA,Country-wide,35,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,Y,1,,,,XNA,Approved,-164,XNA,XAP,Unaccompanied,Repeater,XNA,Cash,x-sell,Contact center,-1,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,Y,1,,,,XNA,Approved,-301,Cash through the bank,XAP,"Spouse, partner",Repeater,XNA,Cash,x-sell,Credit and cash offices,-1,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,Y,1,,,,XNA,Approved,-512,Cash through the bank,XAP,,Repeater,XNA,Cash,x-sell,Credit and cash offices,-1,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,Y,1,,,,Repairs,Refused,-781,Cash through the bank,HC,,Repeater,XNA,Cash,walk-in,Credit and cash offices,-1,XNA,24.0,high,Cash Street: high,,,,,,


In [39]:
# Mapping from the raw column headers to descriptive snake_case names.
# Columns that do not appear as a key keep their current name.
new_column_names = {
    # --- identifiers ---
    'SK_ID_PREV': 'previous_application_ID',
    'SK_ID_CURR': 'current_application_ID',
    # --- contract and amounts ---
    'NAME_CONTRACT_TYPE': 'contract_type',
    'AMT_ANNUITY': 'annuity_amount',
    'AMT_APPLICATION': 'application_amount',
    'AMT_CREDIT': 'credit_amount',
    'AMT_DOWN_PAYMENT': 'down_payment_amount',
    'AMT_GOODS_PRICE': 'goods_price_amount',
    # --- application timing and flags ---
    'WEEKDAY_APPR_PROCESS_START': 'weekday_application_started',
    'HOUR_APPR_PROCESS_START': 'hour_application_started',
    'FLAG_LAST_APPL_PER_CONTRACT': 'flag_last_application_per_contract',
    'NFLAG_LAST_APPL_IN_DAY': 'flag_last_application_in_day',
    # --- rates ---
    'RATE_DOWN_PAYMENT': 'down_payment_rate',
    'RATE_INTEREST_PRIMARY': 'interest_rate_primary',
    'RATE_INTEREST_PRIVILEGED': 'interest_rate_privileged',
    # --- purpose, status and decision ---
    'NAME_CASH_LOAN_PURPOSE': 'cash_loan_purpose',
    'NAME_CONTRACT_STATUS': 'contract_status',
    'DAYS_DECISION': 'days_decision',
    'NAME_PAYMENT_TYPE': 'payment_type',
    'CODE_REJECT_REASON': 'reject_reason_code',
    # --- client, goods and sales channel ---
    'NAME_TYPE_SUITE': 'type_suite',
    'NAME_CLIENT_TYPE': 'client_type',
    'NAME_GOODS_CATEGORY': 'goods_category',
    'NAME_PORTFOLIO': 'portfolio_name',
    'NAME_PRODUCT_TYPE': 'product_type',
    'CHANNEL_TYPE': 'channel_type',
    'SELLERPLACE_AREA': 'sellerplace_area',
    'NAME_SELLER_INDUSTRY': 'seller_industry',
    # --- payment plan and product ---
    'CNT_PAYMENT': 'payment_count',
    'NAME_YIELD_GROUP': 'yield_group',
    'PRODUCT_COMBINATION': 'product_combination',
    # --- day offsets and insurance flag ---
    'DAYS_FIRST_DRAWING': 'days_first_drawing',
    'DAYS_FIRST_DUE': 'days_first_due',
    'DAYS_LAST_DUE_1ST_VERSION': 'days_last_due_1st_version',
    'DAYS_LAST_DUE': 'days_last_due',
    'DAYS_TERMINATION': 'days_termination',
    'NFLAG_INSURED_ON_APPROVAL': 'insured_on_approval_flag',
}

# Relabel the existing frame in place; unmapped headers fall through unchanged.
loan_df.columns = [new_column_names.get(label, label) for label in loan_df.columns]

# Show the first rows with the new headers.
loan_df.head()

Unnamed: 0.1,Unnamed: 0,previous_application_ID,current_application_ID,contract_type,annuity_amount,application_amount,credit_amount,down_payment_amount,goods_price_amount,weekday_application_started,hour_application_started,flag_last_application_per_contract,flag_last_application_in_day,down_payment_rate,interest_rate_primary,interest_rate_privileged,cash_loan_purpose,contract_status,days_decision,payment_type,reject_reason_code,type_suite,client_type,goods_category,portfolio_name,product_type,channel_type,sellerplace_area,seller_industry,payment_count,yield_group,product_combination,days_first_drawing,days_first_due,days_last_due_1st_version,days_last_due,days_termination,insured_on_approval_flag
0,0,2030495,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,SATURDAY,15,Y,1,0.0,0.182832,0.867336,XAP,Approved,-73,Cash through the bank,XAP,,Repeater,Mobile,POS,XNA,Country-wide,35,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,Y,1,,,,XNA,Approved,-164,XNA,XAP,Unaccompanied,Repeater,XNA,Cash,x-sell,Contact center,-1,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,Y,1,,,,XNA,Approved,-301,Cash through the bank,XAP,"Spouse, partner",Repeater,XNA,Cash,x-sell,Credit and cash offices,-1,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,Y,1,,,,XNA,Approved,-512,Cash through the bank,XAP,,Repeater,XNA,Cash,x-sell,Credit and cash offices,-1,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,Y,1,,,,Repairs,Refused,-781,Cash through the bank,HC,,Repeater,XNA,Cash,walk-in,Credit and cash offices,-1,XNA,24.0,high,Cash Street: high,,,,,,


In [40]:
# Impute missing values in every object-dtype (text) column with that
# column's most frequent value.
for col in loan_df.select_dtypes(include=['object']).columns:
    modes = loan_df[col].mode()  # NaN is ignored by default when counting
    if modes.empty:
        # The column has no non-null values, so there is no mode to fill with
        # and indexing the empty result would raise. Leave it untouched.
        continue
    # When several values tie, the first one returned by mode() is used.
    mode_value = modes.iloc[0]
    loan_df[col] = loan_df[col].fillna(mode_value)

In [41]:
# Impute gaps in numeric columns with each column's 25th percentile.
numerical_cols = loan_df.select_dtypes(include=[np.number]).columns

# Only columns that actually contain a missing value need any work.
columns_with_gaps = [name for name in numerical_cols if loan_df[name].isna().any()]

for name in columns_with_gaps:
    first_quartile = loan_df[name].quantile(0.25)
    loan_df[name] = loan_df[name].fillna(first_quartile)

In [42]:
# Lift pandas' display truncation for both columns and rows.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

# Count nulls per column, then keep only the columns that still have any.
missing_values = loan_df.isna().sum()
missing_columns = missing_values.loc[missing_values.gt(0)]

# An empty Series here means the imputation left no gaps.
print(missing_columns)

Series([], dtype: int64)


In [46]:
# Standardise every float64/int64 column of loan_df with StandardScaler
# (default settings: centre on the mean, scale to unit variance).
# NOTE(review): the selection is purely dtype-based, so it appears to include
# the ID columns and the leftover "Unnamed" index columns, and the scaler is
# fit on the whole frame - confirm this is intended.
numerical_cols = loan_df.select_dtypes(include=['float64', 'int64']).columns

scaler = StandardScaler()
numeric_view = loan_df[numerical_cols]

# Learn the per-column statistics, then write the scaled values back.
loan_df[numerical_cols] = scaler.fit(numeric_view).transform(numeric_view)