# Initialization

In [1]:
import csv
import sklearn
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing, linear_model, model_selection, metrics

def df_summ(df):
    result = pd.DataFrame()
    
    result['Column'] = df.columns
    result['Type'] = df.dtypes.values
    result['Missing'] = df.isna().sum().values
    result['Missing (%)'] = result['Missing']*100/len(df)
    result['Unique'] = df.nunique().values
    result['Unique (%)'] = result['Unique']*100/len(df)
    
    return result

# Data Preparation
All of this section is commented because it was performed only once and no need to rerun it again to avoid loading/writing from the original loan dataset since it is very big

## Dataset filtering and preliminary feature selection

In [None]:
# class LoanDataAnalysis:
#     def __init__(self, file_path):
#         self.file_path = file_path
#         self.df = pd.read_csv(file_path)

#     def display_info(self):
#         print(self.df.head())
#         numeric_cols = self.df.select_dtypes(include=['int64', 'float64']).columns
#         categorical_cols = self.df.select_dtypes(include=['object']).columns
#         print("\nNumber of numeric columns:", len(numeric_cols))
#         print("Number of categorical columns:", len(categorical_cols))

#     def filter_data(self, save_path=None):
#         self.df['issue_d'] = pd.to_datetime(self.df['issue_d'])
#         filtered_df = self.df[(self.df['issue_d'].dt.year >= 2016) &
#                               (self.df['loan_status'].isin(['Fully Paid', 'Charged Off']))]
#         if save_path:
#             filtered_df.to_csv(save_path, index=False)
#             print("Filtered dataset saved as '{}'".format(save_path))
#         return filtered_df

#     def split_data(self, filtered_df):
#         train_set = filtered_df[filtered_df['issue_d'].dt.year.isin([2016, 2017])]
#         test_set = filtered_df[filtered_df['issue_d'].dt.year == 2018]
#         return train_set, test_set

#     def select_columns(self, filtered_df, columns_to_keep):
#         selected_df = filtered_df[columns_to_keep]
#         return selected_df

#     def save_selected_data(self, selected_df, save_path):
#         selected_df.to_csv(save_path, index=False)
#         print("Selected dataset saved as '{}'".format(save_path))

#     def plot_missing_values_heatmap(self, df, title, save_path=None):
#         plt.figure(figsize=(10, 6))
#         sns.heatmap(df.isnull(), cmap='binary', cbar=False)
#         plt.yticks(rotation=0)
#         plt.title(title)
#         plt.xlabel('Columns')
#         plt.ylabel('Rows')
#         plt.tight_layout()
#         if save_path:
#             plt.savefig(save_path)
#         plt.show()


# # Example usage:
# file_path = "/Users/user/PycharmProjects/pythonProject2/loan.csv"
# save_filtered_path = "filtered_loan_data.csv"
# save_selected_path = "/Users/user/PycharmProjects/pythonProject2/selected_loan_data.csv"

# columns_to_keep = ['acc_now_delinq', 'acc_open_past_24mths', 'addr_state', 'all_util', 'annual_inc', 'application_type',
#     'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths', 'collections_12_mths_ex_med',
#     'delinq_2yrs', 'delinq_amnt', 'dti', 'earliest_cr_line', 'emp_length', 'emp_title', 'funded_amnt',
#     'funded_amnt_inv', 'grade', 'home_ownership', 'il_util', 'initial_list_status', 'inq_last_12m',
#     'installment', 'int_rate', 'issue_d', 'last_credit_pull_d', 'loan_amnt', 'loan_status', 'max_bal_bc',
#     'mo_sin_rcnt_tl', 'mort_acc', 'mths_since_last_delinq', 'mths_since_last_major_derog',
#     'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl', 'num_sats', 'num_tl_120dpd_2m',
#     'num_tl_30dpd', 'num_tl_90g_dpd_24m', 'open_act_il', 'pct_tl_nvr_dlq', 'percent_bc_gt_75',
#     'pub_rec', 'pub_rec_bankruptcies', 'revol_bal', 'revol_util', 'sub_grade', 'tax_liens', 'term',
#     'tot_cur_bal', 'tot_hi_cred_lim', 'total_bc_limit', 'total_il_high_credit_limit', 'total_rev_hi_lim',
#     'verification_status', 'debt_settlement_flag']

# loan_analysis = LoanDataAnalysis(file_path)
# loan_analysis.display_info()
# filtered_data = loan_analysis.filter_data(save_path=save_filtered_path)
# train_set, test_set = loan_analysis.split_data(filtered_data)
# print("Number of instances in train_set:", len(train_set))
# print("Number of instances in test_set:", len(test_set))
# selected_data = loan_analysis.select_columns(filtered_data, columns_to_keep)
# loan_analysis.save_selected_data(selected_data, save_selected_path)

## Feature Transformation and Data Splitting

In [None]:
# # Combining related but sparse features
# df = pd.read_csv("C:\\Users\\LENOVO\\Downloads\\selected_loan_data.csv")

# df.loc[df['application_type'] == "Joint App", "annual_inc"] = (df["annual_inc"] + df["annual_inc_joint"])/2
# df.loc[df['application_type'] == "Joint App", "dti"] = (df["dti"] + df["dti_joint"])/2

# df["mths_sin_rcnt_acct"] = np.min([df["mo_sin_old_il_acct"].values,
#                           df["mo_sin_old_rev_tl_op"].values,
#                           df["mo_sin_rcnt_rev_tl_op"].values,
#                           df["mo_sin_rcnt_tl"].values], axis = 0)


# # Converting datetime features into units of months

# df["last_credit_pull_d"] = pd.to_datetime(df["last_credit_pull_d"], format = "%b-%Y")
# df["issue_d"] = pd.to_datetime(df["issue_d"], format = "%Y-%m-%d")
# df["mths_sin_credit_pull"] = df["issue_d"].dt.to_period('M').astype("int64") - df["last_credit_pull_d"].dt.to_period('M').astype("int64")
# df.loc[df['mths_sin_credit_pull'] < 0, "mths_sin_credit_pull"] = 0

# df["earliest_cr_line"] = pd.to_datetime(df["earliest_cr_line"], format = "%b-%Y")
# df["mths_sin_earliest_cr"] = df["issue_d"].dt.to_period('M').astype("int64") - df["earliest_cr_line"].dt.to_period('M').astype("int64")


# # Dropping irrelevant columns

# df.drop(columns = ["last_pymnt_amnt",
#                    "last_pymnt_d",
#                    "total_pymnt",
#                    "total_pymnt_inv",
#                    'total_rec_prncp', 
#                    'total_rec_int',
#                    'total_rec_late_fee',
#                    "debt_settlement_flag",
#                    "collection_recovery_fee",
#                    "annual_inc_joint",
#                    "dti_joint",
#                    "inq_fi", 
#                    "inq_last_6mths",
#                    "mo_sin_old_il_acct", 
#                    "mo_sin_old_rev_tl_op",
#                    "mo_sin_rcnt_rev_tl_op",
#                    "mo_sin_rcnt_tl",
#                    "last_credit_pull_d",
#                    "earliest_cr_line",
#                    "mths_since_last_delinq",
#                    "mths_since_last_major_derog"
#                   ],
#         inplace = True
#        )

# # Split into train and test

# test = df.loc[df["issue_d"].dt.year >= 2018]
# train = df.loc[df["issue_d"].dt.year < 2018]

# print(train.shape)
# print(test.shape)

# #Check missing values in train dataset
# summary_df = df_summ(train)
# display(summary_df[summary_df["Missing (%)"] > 0])

## Recategorizing emp_title into 7 broader categories

In [None]:
## Recategorize emp_title into 7 broader categories

# categories = {"Education": 
#               [
#                   "teacher", "lecture", "professor", "trainer", "educa", "training", "postdoctor", "research",
#                             "scien", "bio", "physic", "math", "faculty", "chemist", "dean"
#               ],
#               "Corporate":
#               [
#                   "manag", "director", "president", "CEO", "exec", "general", "supervisor", "head", "principal", "super",
#                   "chief", "directer", "partner", "VP", "CFO", "GM", "team lead","data", "engineer", "programmer",
#                   "technology", "analyst", "IT", "tech", "dev", "web", "account", "consultant", "attorney", "lawyer", "legal",
#                   "sales", "marketing", "law", "human", "HR", "human resource", "tax", "auditor", "recruit", "office", "asso",
#                   "compliance", "advis", "custod", "financ", "estimator", "deal", "planner", "CPA", "CFA", "special", "bank",
#                   "trade", "trading", "mgr", "code"
#               ],
#               "Business Owner":
#               [
#                   "owner", "self", "investor", "founder"
#               ],
#               "Healthcare-Civil":
#               [
#                   "nurs", "doctor", "health", "pharma", "RN", "LPN", "LVN", "CNA", "clinic", "care", "drug", "counsel",
#                   "therap", "medic", "dent", "psyc", "police", "sergeant", "lieutenant", "detective", "deputy", "sheriff",
#                   "fire", "inspect", "investigat", "captain", "EMT", "paraprof", "phlebotom", "soldier", "army", "navy",
#                   "troop", "judge", "minist", "vet", "patrol"
#               ],
#               "Admin-Operator":
#               [
#                   "driver", "operator", "controller", "electrician", "server", "mechanic", "operation", "technician",
#                   "foreman", "forman", "coordinator", "service", "maintenance", "door", "plumber", "labor", "worker",
#                   "agent", "weld", "machini", "lead", "carpenter", "dispatch", "warehouse", "courier", "assembl", "install",
#                   "cler", "produc", "carrier", "handler", "process", "logis", "ship", "post", "port", "util", "contractor",
#                   "keep", "admin", "staff", "support", "administration", "assistant", "administrative", "real", "secre",
#                   "cash", "recept", "teller", "broker", "buyer", "pay", "merchan", "adjus", "purchas", "regist", "sched",
#                   "conduct", "lineman", "instruct", "packer", "receiver", "desk", "bill", "clean", "load", "stocker",
#                   "pressman", "picker", "examiner", "manufactur", "drill"
#               ],
#               "Tourism-Entertainment":
#               [
#                   "art", "design", "content", "music", "cast", "paint", "styl", "libr", "pastor", "relig", "nun", "comm",
#                   "csr", "bartender", "cook", "chef", "pilot", "flight", "bake", "optic", "nanny", "baby", "help",
#                   "collect", "barista", "esthetic", "barber", "dress", "hair", "salon", "coach"
#               ],
#               "Others": []}

# def categorize_job(df):
#     result = []
#     for job in df["emp_title"]:
#         flag = 0
#         if flag == 0:
#             for i in categories.keys():
#                 if flag == 0 and i!="Others":
#                     for j in categories[i]:
#                         if flag == 0:
#                             if (str(j).lower() in str(job).lower()):
#                                 flag = 1
#                                 result.append(i)
#                                 break
#                 elif flag == 0 and i == "Others":
#                     flag = 1
#                     result.append(i)
#                     break
    
#     df["job_category"] = result
#     df.loc[(df["emp_title"] == "CTO") | (df["emp_title"] == "COO")| (df["emp_title"] == "CIO"), "job_category"] = "Corporate"
#     df.loc[(df["emp_title"].isna() == True ), "job_category"] = np.NaN
#     return df

# train = categorize_job(train)
# test = categorize_job(test)

# # Write the imputed DataFrame to a new CSV file
# train.to_csv("train.csv", index=False)
# test.to_csv("test.csv", index=False)

# Exploratory Data Analysis

## Missing values heatmap

In [None]:
# loan_analysis.plot_missing_values_heatmap(loan_analysis.df, 'Missing Values Heatmap For Raw Data',
#                                           'missing_values_heatmap_raw.png')
# loan_analysis.plot_missing_values_heatmap(filtered_data, 'Missing Values Heatmap For Filtered Data',
#                                           'missing_values_heatmap_filtered.png')
# loan_analysis.plot_missing_values_heatmap(selected_data, 'Missing Values Heatmap For Selected Data',
#                                           'missing_values_heatmap_selected.png')

## Features Correlation with Loan Status

In [None]:
# def load_data(file_path):
#     return pd.read_csv(file_path)
# 
# def filter_numeric_columns(df):
#     return df.select_dtypes(include=['int', 'float'])
# 
# def filter_categorical_columns(df):
#     return df.select_dtypes(include=['object'])
# 
# def compute_correlation_matrix(df):
#     return df.corr()
# 
# def plot_heatmap(correlation_matrix, output_filename=None):
#     plt.figure(figsize=(12, 12))
#     sns.heatmap(correlation_matrix, cmap='coolwarm', fmt=".2f")
#     plt.title('Correlation Heatmap')
#     if output_filename:
#         plt.savefig(output_filename)
#     plt.show()
# 
# def find_highly_correlated_variables(correlation_matrix, threshold=0.7):
#     highly_correlated_pairs = (correlation_matrix.abs() > threshold) & (correlation_matrix != 1)
#     return highly_correlated_pairs
# 
# def plot_highly_correlated_heatmap(correlation_matrix, highly_correlated_pairs, output_filename=None):
#     annot_kws = {"fontsize": 10, "color": 'black'}
#     plt.figure(figsize=(12, 12))
#     sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", mask=~highly_correlated_pairs, annot_kws=annot_kws)
#     plt.title('Correlation Heatmap (Highly Correlated Pairs)')
#     if output_filename:
#         plt.savefig(output_filename)
#     plt.show()
# 
# def visualize_relationship_categorical(df, column, loan_status_column='loan_status', output_filename=None):
#     counts = df.groupby([column, loan_status_column]).size().unstack(fill_value=0)
#     plt.figure(figsize=(10, 6))
#     counts.plot(kind='bar', stacked=True, color=['blue', 'red'])
#     plt.title(f'Relationship between {column} and {loan_status_column}')
#     plt.xlabel(column)
#     plt.ylabel('Count')
#     plt.xticks(rotation=90)
#     plt.legend(title=loan_status_column)
#     plt.tight_layout()
#     if output_filename:
#         plt.savefig(output_filename)
#     plt.show()
# 
# def visualize_emp_length_relationship(df, loan_status_column='loan_status', output_filename=None):
#     plt.figure(figsize=(10, 6))
#     counts_emp = df.groupby(['emp_length', loan_status_column]).size().unstack(fill_value=0)
#     emp = counts_emp.reindex(['< 1 year', '1 year', '2 years', '3 years', '4 years', '5 years', '6 years', '7 years', '8 years', '9 years', '10+ years'])
#     emp.plot(kind='bar', stacked=True, color=['blue', 'red'])
#     plt.title(f'Relationship between emp_length and {loan_status_column}')
#     plt.xlabel('emp_length')
#     plt.ylabel('Count')
#     plt.xticks(rotation=90)
#     plt.legend(title=loan_status_column)
#     plt.tight_layout()
#     if output_filename:
#         plt.savefig(output_filename)
#     plt.show()
# 
# def visualize_relationship_numerical(df, column, loan_status_column='loan_status', output_filename=None):
#     plt.figure(figsize=(10, 6))
#     sns.boxplot(x=loan_status_column, y=column, data=df, hue=loan_status_column, palette={'Fully Paid': 'blue', 'Charged Off': 'red'})
#     plt.title(f'Relationship between {column} and {loan_status_column}')
#     plt.xlabel(loan_status_column)
#     plt.ylabel(column)
#     plt.legend(title=loan_status_column)
#     plt.tight_layout()
#     if output_filename:
#         plt.savefig(output_filename)
#     plt.show()
# 
# def categorize_columns(df, loan_status_column='loan_status'):
#     variable_columns = [col for col in df.columns if col != loan_status_column]
#     categorical_columns = [col for col in variable_columns if df[col].dtype == 'object']
#     numeric_columns = [col for col in variable_columns if col not in categorical_columns]
#     return categorical_columns, numeric_columns
# 
# def main():
#     # Load data
#     df = load_data('train.csv')
# 
#     # Categorize columns as categorical or numeric
#     categorical_columns, numeric_columns = categorize_columns(df)
# 
#     # Filter numeric and categorical columns
#     numeric_df = df[numeric_columns]
#     categorical_df = df[categorical_columns]
# 
#     # Compute correlation matrix
#     correlation_matrix = compute_correlation_matrix(numeric_df)
# 
#     # Plot heatmap
#     plot_heatmap(correlation_matrix, 'correlation_heatmap.png')
# 
#     # Find highly correlated variables
#     highly_correlated_pairs = find_highly_correlated_variables(correlation_matrix)
# 
#     # Plot highly correlated heatmap
#     plot_highly_correlated_heatmap(correlation_matrix, highly_correlated_pairs, 'correlation_heatmap_highly_correlated.png')
# 
#     # Visualize relationship between categorical variables and loan status
#     for column in categorical_df.columns:
#         visualize_relationship_categorical(df, column, output_filename=f'{column}_grouped_bar_chart.png')
# 
#     # Visualize relationship between numerical variables and loan status
#     for column in numeric_df.columns:
#         visualize_relationship_numerical(df, column, output_filename=f'{column}_box_plot.png')
# 
#     # Visualize relationship between emp_length and loan status
#     visualize_emp_length_relationship(df, output_filename='emp_length_grouped_bar_chart.png')
# 
# main()

# Missing Values Imputation, Ordinal Encoding, and Feature Selection via Correlation Analysis

This section is done on the R program, please kindly check the ".R" file that returns "train_vs1.csv" and "test_vs1.csv"

# Information
This section is preserved for general information of the train set

In [2]:
train = pd.read_csv("./Data/train_vs1.csv")
test = pd.read_csv("./Data/test_vs1.csv")

In [3]:
num_var = train.select_dtypes(include=[np.number]).columns
cat_var = train.select_dtypes(include=[object, "datetime"]).columns

print("Number of numerical variable: {} variables".format(len(num_var)))
print("Number of categorical variable: {} variables".format(len(cat_var)))
print("Training set: {} variables with {} rows".format(train.shape[1], train.shape[0]))

train[num_var].describe()

Number of numerical variable: 37 variables
Number of categorical variable: 8 variables
Training set: 45 variables with 433599 rows


Unnamed: 0,loan_amnt,int_rate,sub_grade,emp_length,annual_inc,dti,delinq_2yrs,pub_rec,revol_bal,collections_12_mths_ex_med,...,num_tl_90g_dpd_24m,pct_tl_nvr_dlq,pub_rec_bankruptcies,tax_liens,total_bc_limit,total_il_high_credit_limit,mths_sin_rcnt_acct,mths_sin_credit_pull,mths_sin_earliest_cr,issue_month
count,433599.0,433599.0,433599.0,433599.0,433599.0,433599.0,433599.0,433599.0,433599.0,433599.0,...,433599.0,433599.0,433599.0,433599.0,433599.0,433599.0,433599.0,433599.0,433599.0,433599.0
mean,14444.773512,13.473746,11.793254,5.543832,80021.05,18.543216,0.338449,0.257035,15942.66,0.022113,...,0.091986,93.875969,0.158298,0.06441,22518.26,45265.96,7.402268,0.002717,195.809192,5.885768
std,9115.066337,5.292474,6.437473,3.903071,78144.61,10.045419,0.923917,0.6535,23149.6,0.168261,...,0.534835,9.100093,0.408063,0.435635,22296.37,45010.57,7.711629,0.052273,90.807568,3.438549
min,1000.0,5.32,1.0,0.0,2400.0,-1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.0,1.0
25%,7200.0,9.75,7.0,2.0,49000.0,12.01,0.0,0.0,5535.0,0.0,...,0.0,90.9,0.0,0.0,8200.0,16500.0,3.0,0.0,135.0,3.0
50%,12000.0,12.74,11.0,5.0,68000.0,17.93,0.0,0.0,10524.0,0.0,...,0.0,97.5,0.0,0.0,15900.0,34453.0,5.0,0.0,177.0,6.0
75%,20000.0,16.02,15.0,10.0,95000.0,24.53,0.0,0.0,18879.0,0.0,...,0.0,100.0,0.0,0.0,29200.0,60658.5,9.0,0.0,242.0,9.0
max,40000.0,30.99,35.0,10.0,10999200.0,515.49,29.0,61.0,1696796.0,12.0,...,29.0,100.0,8.0,61.0,1105500.0,2000000.0,169.0,2.0,999.0,12.0


In [22]:
display(train[cat_var].describe())

Unnamed: 0,term,home_ownership,verification_status,loan_status,addr_state,initial_list_status,application_type,job_category
count,404291,404291,404291,404291,404291,404291,404291,404291
unique,2,5,3,2,50,2,2,7
top,36 months,MORTGAGE,Source Verified,Fully Paid,CA,w,Individual,Corporate
freq,311244,200515,172860,310783,57591,306433,389680,234276


In [23]:
for i in cat_var:
    print("The {} variable has {} unique category, which are:".format(i, len(train[i].unique())))
    print(train[i].value_counts(dropna = False), "\n")

The term variable has 2 unique category, which are:
term
36 months    311244
60 months     93047
Name: count, dtype: int64 

The home_ownership variable has 5 unique category, which are:
home_ownership
MORTGAGE    200515
RENT        157151
OWN          46400
ANY            223
NONE             2
Name: count, dtype: int64 

The verification_status variable has 3 unique category, which are:
verification_status
Source Verified    172860
Not Verified       121937
Verified           109494
Name: count, dtype: int64 

The loan_status variable has 2 unique category, which are:
loan_status
Fully Paid     310783
Charged Off     93508
Name: count, dtype: int64 

The addr_state variable has 50 unique category, which are:
addr_state
CA    57591
TX    34853
NY    31857
FL    29063
IL    15053
NJ    14038
PA    12990
OH    12937
GA    12877
NC    11487
VA    10779
MI    10433
AZ    10297
CO     9421
MA     9402
MD     9272
WA     8461
MN     7360
IN     7196
TN     6630
NV     6259
MO     6246
CT   

In [24]:
print(num_var)
print(cat_var)

Index(['loan_amnt', 'int_rate', 'sub_grade', 'emp_length', 'annual_inc', 'dti',
       'delinq_2yrs', 'pub_rec', 'revol_bal', 'collections_12_mths_ex_med',
       'acc_now_delinq', 'tot_cur_bal', 'open_act_il', 'il_util', 'max_bal_bc',
       'all_util', 'inq_last_12m', 'acc_open_past_24mths', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mort_acc',
       'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_sats',
       'num_tl_120dpd_2m', 'num_tl_30dpd', 'num_tl_90g_dpd_24m',
       'pct_tl_nvr_dlq', 'pub_rec_bankruptcies', 'tax_liens', 'total_bc_limit',
       'total_il_high_credit_limit', 'mths_sin_rcnt_acct',
       'mths_sin_credit_pull', 'mths_sin_earliest_cr', 'issue_month'],
      dtype='object')
Index(['term', 'home_ownership', 'verification_status', 'loan_status',
       'addr_state', 'initial_list_status', 'application_type',
       'job_category'],
      dtype='object')


# Model Building

## Logistic Regression

In [2]:
train = pd.read_csv("./Data/train_vs1.csv")
test = pd.read_csv("./Data/test_vs1.csv")

In [3]:
def transform_features(df, encoder, scaler, train = True):
    df.dropna(inplace = True)
    
    # Defining X and Y
    X_train = df.drop(columns = "loan_status")
    y_train = pd.DataFrame(df["loan_status"])
    num_var = X_train.select_dtypes(include=[np.number]).columns
    cat_var = X_train.select_dtypes(include=[object, "datetime"]).columns
    
    # Scaling numerical variables and encoding categorical variables
    if train:
        scaler.fit(X_train[num_var])
        encoder.fit(X_train[cat_var])
    X_train[num_var] = scaler.transform(X_train[num_var])
    
    # Concatenating encoded categorical variables and dropping original categorical variables
    temp = encoder.transform(X_train[cat_var])
    temp = pd.DataFrame(temp.toarray(), columns = encoder.get_feature_names_out(cat_var))
    X_train.drop(columns = cat_var, inplace = True)
    X_train = pd.concat([X_train.reset_index(), temp.reset_index()], axis = 1)
    
    # Encoding target
    y_train.loc[y_train['loan_status'] == 'Fully Paid', "loan_status"] = "0"
    y_train.loc[y_train['loan_status'] == 'Charged Off', "loan_status"] = "1"
    
    return X_train, y_train

In [4]:
scaler = sklearn.preprocessing.RobustScaler(unit_variance=True)
encoder = sklearn.preprocessing.OneHotEncoder(drop = "first", handle_unknown = "ignore")

X_train, y_train = transform_features(train, encoder, scaler)
X_test, y_test = transform_features(test, encoder, scaler, train = False)

### Vanilla Logistic Regression

In [8]:
model = sklearn.linear_model.LogisticRegression(penalty= None, random_state= 15, solver = 'saga', max_iter = 1500, tol = 5e-4)
skf = sklearn.model_selection.StratifiedKFold(n_splits = 10, shuffle = True, random_state = 15)

cv_results = sklearn.model_selection.cross_validate(model,
                                                    X_train.values, 
                                                    y_train.values.ravel(), 
                                                    cv = skf, 
                                                    scoring = ["roc_auc", "balanced_accuracy", "f1_weighted"], 
                                                    return_train_score= True, 
                                                    return_estimator= True, 
                                                    return_indices= True)

In [9]:
cv_results

{'fit_time': array([ 86.49791813,  96.60262918,  59.73771167,  70.6770308 ,
         89.94585133,  99.40712762, 101.36551809, 121.54200315,
        118.86333704,  62.44980645]),
 'score_time': array([0.75254321, 0.29639888, 0.24563789, 0.77587652, 0.74672437,
        0.79173088, 0.77711606, 0.80190754, 0.76799417, 0.36577153]),
 'estimator': [LogisticRegression(max_iter=1500, penalty=None, random_state=15, solver='saga',
                     tol=0.0005),
  LogisticRegression(max_iter=1500, penalty=None, random_state=15, solver='saga',
                     tol=0.0005),
  LogisticRegression(max_iter=1500, penalty=None, random_state=15, solver='saga',
                     tol=0.0005),
  LogisticRegression(max_iter=1500, penalty=None, random_state=15, solver='saga',
                     tol=0.0005),
  LogisticRegression(max_iter=1500, penalty=None, random_state=15, solver='saga',
                     tol=0.0005),
  LogisticRegression(max_iter=1500, penalty=None, random_state=15, solver='sa

### Vanilla Logistic Regression with Hyperparameter Tuning

In [11]:
model = sklearn.linear_model.LogisticRegression(penalty= None,
                                                random_state= 15,
                                                solver = 'saga',
                                                max_iter = 1500,
                                                tol = 5e-4)

skf = sklearn.model_selection.StratifiedKFold(n_splits = 10, shuffle = True, random_state = 15)

hyper_params = [{
    # 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'class_weight': ['balanced', None],
    # 'penalty': ['l1', 'l2'],
    }]

search_vanilla_cv_results = sklearn.model_selection.GridSearchCV(estimator=model,
                                                                 param_grid=hyper_params,
                                                                 refit= "roc_auc",
                                                                 cv = skf,
                                                                 scoring = ["roc_auc", "balanced_accuracy", "f1_weighted"],
                                                                 return_train_score= True,
                                                                 n_jobs= -1,
                                                                 verbose=3
                                                                 )

search_vanilla_cv_results.fit(X_train.values, y_train.values.ravel())

Fitting 10 folds for each of 2 candidates, totalling 20 fits




In [13]:
search_vanilla_cv_results.best_params_

{'class_weight': 'balanced'}

In [17]:
pd.DataFrame(search_vanilla_cv_results.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,params,split0_test_roc_auc,split1_test_roc_auc,split2_test_roc_auc,split3_test_roc_auc,...,split2_train_f1_weighted,split3_train_f1_weighted,split4_train_f1_weighted,split5_train_f1_weighted,split6_train_f1_weighted,split7_train_f1_weighted,split8_train_f1_weighted,split9_train_f1_weighted,mean_train_f1_weighted,std_train_f1_weighted
0,1383.412319,11.332025,0.609209,0.174292,balanced,{'class_weight': 'balanced'},0.525797,0.529527,0.530736,0.526377,...,0.541226,0.538487,0.538087,0.541536,0.539114,0.539395,0.538329,0.539427,0.539234,0.001188
1,149.51794,14.12961,0.959448,0.106892,,{'class_weight': None},0.521201,0.519475,0.511899,0.519267,...,0.668197,0.668229,0.668218,0.668203,0.668221,0.668212,0.668206,0.668202,0.668213,1e-05


### Logistic Regression with Shrinkage Methods

In [5]:
model = sklearn.linear_model.LogisticRegression(class_weight = "balanced",
                                                random_state= 15,
                                                solver = 'saga',
                                                max_iter = 1000,
                                                tol = 5e-4)

skf = sklearn.model_selection.StratifiedKFold(n_splits = 10, shuffle = True, random_state = 15)

hyper_params = [{
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2'],
    }]

search_pen_cv_results = sklearn.model_selection.GridSearchCV(estimator=model,
                                                                 param_grid=hyper_params,
                                                                 refit= "roc_auc",
                                                                 cv = skf,
                                                                 scoring = ["roc_auc", "balanced_accuracy", "f1_weighted"],
                                                                 return_train_score= True,
                                                                 n_jobs= -1,
                                                                 verbose=3
                                                                 )

search_pen_cv_results.fit(X_train.values, y_train.values.ravel())

Fitting 10 folds for each of 14 candidates, totalling 140 fits




In [6]:
search_pen_cv_results.best_params_

{'C': 100, 'penalty': 'l1'}

In [7]:
pd.DataFrame(search_pen_cv_results.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_penalty,params,split0_test_roc_auc,split1_test_roc_auc,split2_test_roc_auc,...,split2_train_f1_weighted,split3_train_f1_weighted,split4_train_f1_weighted,split5_train_f1_weighted,split6_train_f1_weighted,split7_train_f1_weighted,split8_train_f1_weighted,split9_train_f1_weighted,mean_train_f1_weighted,std_train_f1_weighted
0,898.439928,4.256667,0.565222,0.028857,0.001,l1,"{'C': 0.001, 'penalty': 'l1'}",0.521857,0.525477,0.526847,...,0.539973,0.536741,0.536271,0.5403,0.537699,0.538014,0.536559,0.537992,0.537687,0.0014
1,756.91239,7.915356,0.533437,0.033921,0.001,l2,"{'C': 0.001, 'penalty': 'l2'}",0.522102,0.525733,0.527095,...,0.539958,0.536766,0.536303,0.540295,0.537684,0.538017,0.536556,0.537931,0.537689,0.001385
2,908.174562,2.366855,0.485171,0.020209,0.01,l1,"{'C': 0.01, 'penalty': 'l1'}",0.522077,0.525707,0.52707,...,0.539964,0.536757,0.5363,0.540296,0.537671,0.538012,0.536558,0.53795,0.53769,0.001387
3,749.765357,13.093512,0.48855,0.03941,0.01,l2,"{'C': 0.01, 'penalty': 'l2'}",0.522102,0.525733,0.527095,...,0.539958,0.536766,0.536303,0.540295,0.537684,0.538017,0.536556,0.537931,0.537689,0.001385
4,932.680524,2.760354,0.4935,0.026269,0.1,l1,"{'C': 0.1, 'penalty': 'l1'}",0.5221,0.525731,0.527093,...,0.539958,0.536766,0.536303,0.540298,0.537687,0.538017,0.536556,0.537934,0.537689,0.001386
5,747.123592,9.721962,0.492705,0.032317,0.1,l2,"{'C': 0.1, 'penalty': 'l2'}",0.522102,0.525733,0.527095,...,0.539958,0.536766,0.536303,0.540295,0.537684,0.538017,0.536556,0.537931,0.537689,0.001385
6,938.995768,1.297224,0.49061,0.030522,1.0,l1,"{'C': 1, 'penalty': 'l1'}",0.522102,0.525733,0.527095,...,0.539958,0.536766,0.536303,0.540295,0.537684,0.538017,0.536556,0.537934,0.537689,0.001385
7,748.597612,9.081752,0.489314,0.03487,1.0,l2,"{'C': 1, 'penalty': 'l2'}",0.522102,0.525733,0.527095,...,0.539958,0.536766,0.536303,0.540295,0.537684,0.538017,0.536556,0.537931,0.537689,0.001385
8,944.166444,9.913266,0.495285,0.033525,10.0,l1,"{'C': 10, 'penalty': 'l1'}",0.522102,0.525733,0.527095,...,0.539958,0.536766,0.536303,0.540295,0.537684,0.538017,0.536556,0.537931,0.537689,0.001385
9,742.83202,2.005947,0.494487,0.02723,10.0,l2,"{'C': 10, 'penalty': 'l2'}",0.522102,0.525733,0.527095,...,0.539958,0.536766,0.536303,0.540295,0.537684,0.538017,0.536556,0.537931,0.537689,0.001385


### Logistic Regression with Elastic Net

In [8]:
model = sklearn.linear_model.LogisticRegression(penalty= "elasticnet",
                                                class_weight = "balanced",
                                                random_state= 15,
                                                solver = 'saga',
                                                max_iter = 1000,
                                                tol = 5e-4)

skf = sklearn.model_selection.StratifiedKFold(n_splits = 10, shuffle = True, random_state = 15)

hyper_params = [{
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'l1_ratio': [0.25, 0.5, 0.75],
    }]

search_ela_cv_results = sklearn.model_selection.GridSearchCV(estimator=model,
                                                                 param_grid=hyper_params,
                                                                 refit= "roc_auc",
                                                                 cv = skf,
                                                                 scoring = ["roc_auc", "balanced_accuracy", "f1_weighted"],
                                                                 return_train_score= True,
                                                                 n_jobs= -1,
                                                                 verbose=3
                                                                 )

search_ela_cv_results.fit(X_train.values, y_train.values.ravel())

Fitting 10 folds for each of 21 candidates, totalling 210 fits




In [11]:
search_ela_cv_results.best_params_

{'C': 1000, 'l1_ratio': 0.75}

In [12]:
pd.DataFrame(search_ela_cv_results.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_l1_ratio,params,split0_test_roc_auc,split1_test_roc_auc,split2_test_roc_auc,...,split2_train_f1_weighted,split3_train_f1_weighted,split4_train_f1_weighted,split5_train_f1_weighted,split6_train_f1_weighted,split7_train_f1_weighted,split8_train_f1_weighted,split9_train_f1_weighted,mean_train_f1_weighted,std_train_f1_weighted
0,871.329647,1.450118,0.557402,0.030331,0.001,0.25,"{'C': 0.001, 'l1_ratio': 0.25}",0.52204,0.525668,0.527033,...,0.539957,0.536765,0.536284,0.540304,0.537687,0.537999,0.536577,0.537948,0.537689,0.001388
1,899.981763,44.579353,0.557948,0.115505,0.001,0.5,"{'C': 0.001, 'l1_ratio': 0.5}",0.521979,0.525604,0.526971,...,0.539966,0.536747,0.536258,0.540322,0.537701,0.538002,0.536601,0.537981,0.537693,0.001396
2,949.671437,3.648895,0.542425,0.136538,0.001,0.75,"{'C': 0.001, 'l1_ratio': 0.75}",0.521918,0.525541,0.526909,...,0.539961,0.53673,0.536258,0.540297,0.537693,0.538005,0.536566,0.537971,0.537682,0.001396
3,997.664731,4.097506,0.539994,0.105226,0.01,0.25,"{'C': 0.01, 'l1_ratio': 0.25}",0.522096,0.525726,0.527089,...,0.539961,0.536763,0.536303,0.540295,0.537687,0.538018,0.536558,0.537937,0.537689,0.001387
4,990.125069,3.82101,0.539357,0.107352,0.01,0.5,"{'C': 0.01, 'l1_ratio': 0.5}",0.52209,0.52572,0.527083,...,0.539963,0.536763,0.536298,0.540295,0.537684,0.538015,0.536556,0.537944,0.53769,0.001388
5,982.775595,2.865725,0.510186,0.114289,0.01,0.75,"{'C': 0.01, 'l1_ratio': 0.75}",0.522083,0.525714,0.527076,...,0.539969,0.53676,0.536295,0.540301,0.537679,0.538015,0.536558,0.537945,0.537691,0.001389
6,1009.803996,6.751737,0.63698,0.147533,0.1,0.25,"{'C': 0.1, 'l1_ratio': 0.25}",0.522102,0.525732,0.527095,...,0.539958,0.536766,0.536303,0.540295,0.537684,0.538017,0.536556,0.537934,0.537689,0.001385
7,1013.186075,4.476146,0.586085,0.165052,0.1,0.5,"{'C': 0.1, 'l1_ratio': 0.5}",0.522101,0.525732,0.527094,...,0.539958,0.536766,0.536303,0.540295,0.537684,0.538017,0.536556,0.537934,0.537689,0.001385
8,1006.63442,3.798242,0.537635,0.134037,0.1,0.75,"{'C': 0.1, 'l1_ratio': 0.75}",0.5221,0.525731,0.527094,...,0.539958,0.536766,0.536303,0.540295,0.537687,0.538017,0.536556,0.537934,0.537689,0.001386
9,1012.084259,1.842138,0.598893,0.127561,1.0,0.25,"{'C': 1, 'l1_ratio': 0.25}",0.522102,0.525733,0.527095,...,0.539958,0.536766,0.536303,0.540295,0.537684,0.538017,0.536556,0.537934,0.537689,0.001385
