In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

In [None]:
# Load the training and testing splits.
# DATA_DIR is the single place to edit when the CSV files live somewhere other
# than the Colab sample-data folder used here.
DATA_DIR = '/content/sample_data'
train_df = pd.read_csv(f'{DATA_DIR}/TrainingData.csv')
test_df = pd.read_csv(f'{DATA_DIR}/TestingData.csv')

In [None]:
# Preview the first rows of the training frame (rendered as the cell output).
train_df.head()

Unnamed: 0,Age,Annual Income,Credit Score,Experience,Loan Amount,Loan Duration,Number of Dependents,Monthly Debt Payment,Creditcard Utilizatio Rate,Number of Open Credit Lines,...,Total Assets,TotalLiabilities,MonthlyIncome,UtilityBillsPaymentHistory,JobTenure,NetWorth,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio,LoanApproved
0,45,39948,617,22,13152,48,2,183,0.354418,1,...,146111,19183,3329.0,0.724972,11,126928,0.22759,419.805992,0.181077,0
1,38,39709,628,15,26045,48,1,496,0.087827,5,...,53204,9595,3309.083333,0.935132,3,43609,0.201077,794.054238,0.389852,0
2,47,40724,570,26,17627,36,2,902,0.137414,2,...,25176,128874,3393.666667,0.872241,6,5205,0.212548,666.406688,0.462157,0
3,58,69084,545,34,37898,96,1,755,0.267587,2,...,104822,5370,5757.0,0.896155,5,99452,0.300911,1047.50698,0.313098,0
4,58,51250,564,39,12741,48,0,337,0.36738,6,...,65624,43894,4270.833333,0.884275,5,21730,0.205271,391.300352,0.170529,0


In [None]:
# Preview the first rows of the testing frame (rendered as the cell output).
test_df.head()

Unnamed: 0,Age,Annual Income,Credit Score,Experience,Loan Amount,Loan Duration,Number of Dependents,Monthly Debt Payment,Creditcard Utilizatio Rate,Number of Open Credit Lines,...,Total Assets,TotalLiabilities,MonthlyIncome,UtilityBillsPaymentHistory,JobTenure,NetWorth,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio,LoanApproved
0,53,42478,552,33,10463,72,2,597,0.383507,5,...,27296,30757,3539.833333,0.972212,7,5996,0.221873,264.059369,0.243249,0
1,54,25911,571,28,10751,60,4,544,0.321581,2,...,15047,65767,2159.25,0.841848,5,5576,0.244892,312.344769,0.396594,0
2,18,27112,432,0,32743,72,0,821,0.531712,1,...,48699,8253,2259.333333,0.79494,4,40446,0.378836,1157.134465,0.875539,0
3,35,54015,612,14,13313,60,3,298,0.196421,1,...,80818,3399,4501.25,0.754604,4,77419,0.202899,354.864257,0.145041,0
4,35,32033,585,10,24194,12,2,302,0.44802,4,...,127782,80487,2669.416667,0.910957,5,47295,0.225888,2271.284384,0.963988,0


In [None]:
# Print the training frame's column names, non-null counts and dtypes.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 28 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Age                          900 non-null    int64  
 1   Annual Income                900 non-null    int64  
 2   Credit Score                 900 non-null    int64  
 3   Experience                   900 non-null    int64  
 4   Loan Amount                  900 non-null    int64  
 5   Loan Duration                900 non-null    int64  
 6   Number of Dependents         900 non-null    int64  
 7   Monthly Debt Payment         900 non-null    int64  
 8   Creditcard Utilizatio Rate   900 non-null    float64
 9   Number of Open Credit Lines  900 non-null    int64  
 10  Number of Credit Inquries    900 non-null    int64  
 11  Debt to Income Ratio         900 non-null    float64
 12  Bankruptcy History           900 non-null    int64  
 13  Previous Loan Defaul

In [None]:
# Print the testing frame's column names, non-null counts and dtypes.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 28 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Age                          400 non-null    int64  
 1   Annual Income                400 non-null    int64  
 2   Credit Score                 400 non-null    int64  
 3   Experience                   400 non-null    int64  
 4   Loan Amount                  400 non-null    int64  
 5   Loan Duration                400 non-null    int64  
 6   Number of Dependents         400 non-null    int64  
 7   Monthly Debt Payment         400 non-null    int64  
 8   Creditcard Utilizatio Rate   400 non-null    float64
 9   Number of Open Credit Lines  400 non-null    int64  
 10  Number of Credit Inquries    400 non-null    int64  
 11  Debt to Income Ratio         400 non-null    float64
 12  Bankruptcy History           400 non-null    int64  
 13  Previous Loan Defaul

In [None]:
# Split each frame into a feature matrix (every column except the last) and a
# label vector (the last column), both as NumPy arrays.
X_train, y_train = train_df.iloc[:, :-1].values, train_df.iloc[:, -1].values
X_test, y_test = test_df.iloc[:, :-1].values, test_df.iloc[:, -1].values

In [None]:
# Standardize the features: the scaling statistics are learned from the
# training set only and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# define the function to compute Type 1 and Type 2 error rates
def calculate_error_rates(y_true, y_pred):
    """Return the (type 1, type 2) error rates of binary 0/1 predictions.

    Type 1 is the share of actual class-1 samples predicted as 0 (false
    negatives); type 2 is the share of actual class-0 samples predicted as 1
    (false positives). This matches the assignment's Denied/Approved wording
    assuming label 1 encodes 'Approved' -- TODO confirm against the data
    dictionary.

    Parameters:
        y_true: array-like of ground-truth labels in {0, 1}.
        y_pred: array-like of predicted labels in {0, 1}, same length.

    Returns:
        Tuple ``(type_1_error_rate, type_2_error_rate)``. A rate is NaN when
        ``y_true`` contains no sample of the corresponding class.
    """
    # labels=[0, 1] forces a 2x2 matrix even when only one class appears in
    # y_true/y_pred; without it the 4-way unpacking below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    actual_positives = fn + tp
    actual_negatives = fp + tn
    # Guard the 0/0 case explicitly instead of relying on a NumPy warning.
    type_1_error_rate = fn / actual_positives if actual_positives else float("nan")
    type_2_error_rate = fp / actual_negatives if actual_negatives else float("nan")
    return type_1_error_rate, type_2_error_rate

### 1. Binary Classifiers: Part 1 - Original Feature
In the first part of the project, you will use the samples in ‘TrainingData.csv’ to train the following
four (4) classifiers. You will evaluate the performance of each classifier by considering two different types
of errors for the testing data: a type 1 error happens when your classifier chooses ‘Denied’ for an application
that was ‘Approved’ in the testing data. A type 2 error occurs when your classifier selects ‘Approved’ for a
‘Denied’ application in the testing data. For each classifier you design, using the testing data, compute the
type 1 error rate (which is the number of type 1 errors divided by 200) and the type 2 error rate (which is the
number of type 2 errors divided by 200).
Some basic instructions are provided below. However, you are allowed to choose the algorithms used for
these classifiers. For example, for a decision tree, you can use any of the following algorithms – ID3, C4.5,
CART (classification and regression tree). Please make sure to set them up correctly when using built-in
algorithms/functions and explain which algorithms are used in your code (this can be embedded in your
code as comments).

**LDA**

In [None]:
# compute class means
# Label 1 is taken as the 'approved' class and 0 as 'denied' (the label column
# is LoanApproved). The original code had these swapped, so the direction
# built from (mean_approved - mean_denied) pointed toward class 0 while the
# thresholding rule `projection >= t` predicts class 1 -- an inverted classifier.
mean_approved = X_train[y_train == 1].mean(axis=0)
mean_denied = X_train[y_train == 0].mean(axis=0)

In [None]:
# compute within-class scatter matrix (S_W) as the sum of the two per-class
# feature covariance matrices (rows are samples, hence rowvar=False)
S_W = np.add(*(np.cov(X_train[y_train == label], rowvar=False) for label in (0, 1)))

In [None]:
# find the optimal projection direction w using the LDA formula:
#   w = S_W^(-1) * (mean_approved - mean_denied)
# The Moore-Penrose pseudo-inverse replaces the plain inverse because the
# feature set contains collinear columns (in the previewed rows MonthlyIncome
# equals Annual Income / 12), which makes S_W singular or badly conditioned;
# np.linalg.inv would then either raise or return a numerically meaningless w.
w = np.linalg.pinv(S_W).dot(mean_approved - mean_denied)

In [None]:
# project the testing data onto direction w (one scalar score per test sample)
y_test_proj = X_test @ w

In [None]:
# Pick the decision threshold that minimizes (Type 1 + Type 2) error on the
# TRAINING projections, then report the error rates it yields on the test set.
# Tuning the threshold against y_test would use the test labels for model
# selection and make the reported test errors optimistically biased.
y_train_proj = X_train.dot(w)
thresholds = np.linspace(y_train_proj.min(), y_train_proj.max(), 100)
# (type 1, type 2) training error pair for every candidate threshold
errors = [calculate_error_rates(y_train, (y_train_proj >= t).astype(int)) for t in thresholds]
best_threshold_idx = np.argmin([e[0] + e[1] for e in errors])
best_threshold = thresholds[best_threshold_idx]
# held-out error rates at the selected threshold
best_type_1_error, best_type_2_error = calculate_error_rates(
    y_test, (y_test_proj >= best_threshold).astype(int)
)

In [None]:
# Report the selected LDA threshold and the error rates obtained with it
print(f"LDA - Best Threshold: {best_threshold!s}")
print(f"LDA - Type 1 Error: {best_type_1_error!s} Type 2 Error: {best_type_2_error!s}")

LDA - Best Threshold: -6.278495330537056
LDA - Type 1 Error: 0.0 Type 2 Error: 1.0


**Decision Tree**

In [None]:
# Decision tree: scikit-learn's DecisionTreeClassifier (a CART implementation)
# with Gini impurity as the split criterion.
# random_state is fixed because the estimator permutes features at each split;
# without it, repeated runs can produce different trees and error rates.
tree_clf = DecisionTreeClassifier(criterion="gini", random_state=42)
tree_clf.fit(X_train, y_train)

In [None]:
# Evaluate on test data. The error rates come from the shared
# calculate_error_rates helper so every classifier is scored the same way.
y_pred_tree = tree_clf.predict(X_test)
type_1_error_tree, type_2_error_tree = calculate_error_rates(y_test, y_pred_tree)
print("Decision Tree Type 1 Error Rate:", type_1_error_tree)
print("Decision Tree Type 2 Error Rate:", type_2_error_tree)

Decision Tree Type 1 Error Rate: 0.17
Decision Tree Type 2 Error Rate: 0.2


**kNN**

In [None]:
# evaluate given values of k
k_values = [1, 3, 5, 10]
for k in k_values:
    # k-nearest-neighbours classifier; only the neighbour count is set here
    knn_clf = KNeighborsClassifier(n_neighbors=k)
    knn_clf.fit(X_train, y_train)
    y_pred_knn = knn_clf.predict(X_test)

    # compute Type 1 and Type 2 error rates with the shared helper instead of
    # repeating the confusion-matrix arithmetic in every cell
    type_1_error_knn, type_2_error_knn = calculate_error_rates(y_test, y_pred_knn)
    print(f"kNN (k={k}) Type 1 Error Rate: {type_1_error_knn}")
    print(f"kNN (k={k}) Type 2 Error Rate: {type_2_error_knn}")

kNN (k=1) Type 1 Error Rate: 0.19
kNN (k=1) Type 2 Error Rate: 0.265
kNN (k=3) Type 1 Error Rate: 0.115
kNN (k=3) Type 2 Error Rate: 0.275
kNN (k=5) Type 1 Error Rate: 0.085
kNN (k=5) Type 2 Error Rate: 0.26
kNN (k=10) Type 1 Error Rate: 0.125
kNN (k=10) Type 2 Error Rate: 0.225


**SVM**

In [None]:
# soft-margin SVM: linear kernel with penalty C=1, fitted on the training
# features and then used to label the test features
svm_clf = SVC(kernel='linear', C=1).fit(X_train, y_train)
y_pred_svm = svm_clf.predict(X_test)

In [None]:
# get rates for Type 1 and Type 2 error via the shared helper, so the SVM is
# scored exactly like the other classifiers
type_1_error_svm, type_2_error_svm = calculate_error_rates(y_test, y_pred_svm)
print("SVM Type 1 Error Rate:", type_1_error_svm)
print("SVM Type 2 Error Rate:", type_2_error_svm)

SVM Type 1 Error Rate: 0.02
SVM Type 2 Error Rate: 0.125


### 2. Binary classifiers: Part 2 - Use the PCA to Design New Feature

In [None]:
# For each target dimensionality (5, 10, 15): fit PCA on the training
# features, project both splits, then score kNN (several k) and a linear SVM
# on the reduced features
for dim in (5, 10, 15):
    # the projection is learned from the training data and reused for the test data
    pca = PCA(n_components=dim)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    print(f"Results with PCA (dim={dim}):")

    # kNN on the reduced features, one model per neighbour count
    for k in (1, 3, 5, 10):
        knn_clf_pca = KNeighborsClassifier(n_neighbors=k).fit(X_train_pca, y_train)
        y_pred_knn_pca = knn_clf_pca.predict(X_test_pca)
        type_1_error_knn_pca, type_2_error_knn_pca = calculate_error_rates(y_test, y_pred_knn_pca)
        print(f"kNN (k={k}) - Type 1 Error: {type_1_error_knn_pca:.3f}, Type 2 Error: {type_2_error_knn_pca:.3f}")

    # linear soft-margin SVM (C=1) on the reduced features
    svm_clf_pca = SVC(kernel='linear', C=1).fit(X_train_pca, y_train)
    y_pred_svm_pca = svm_clf_pca.predict(X_test_pca)
    type_1_error_svm_pca, type_2_error_svm_pca = calculate_error_rates(y_test, y_pred_svm_pca)
    print("SVM - Type 1 Error:", type_1_error_svm_pca, "Type 2 Error:", type_2_error_svm_pca)

Results with PCA (dim=5):
kNN (k=1) - Type 1 Error: 0.155, Type 2 Error: 0.205
kNN (k=3) - Type 1 Error: 0.105, Type 2 Error: 0.160
kNN (k=5) - Type 1 Error: 0.100, Type 2 Error: 0.180
kNN (k=10) - Type 1 Error: 0.100, Type 2 Error: 0.155
SVM - Type 1 Error: 0.075 Type 2 Error: 0.14
Results with PCA (dim=10):
kNN (k=1) - Type 1 Error: 0.210, Type 2 Error: 0.245
kNN (k=3) - Type 1 Error: 0.155, Type 2 Error: 0.220
kNN (k=5) - Type 1 Error: 0.115, Type 2 Error: 0.200
kNN (k=10) - Type 1 Error: 0.140, Type 2 Error: 0.140
SVM - Type 1 Error: 0.045 Type 2 Error: 0.155
Results with PCA (dim=15):
kNN (k=1) - Type 1 Error: 0.180, Type 2 Error: 0.240
kNN (k=3) - Type 1 Error: 0.150, Type 2 Error: 0.235
kNN (k=5) - Type 1 Error: 0.110, Type 2 Error: 0.270
kNN (k=10) - Type 1 Error: 0.140, Type 2 Error: 0.195
SVM - Type 1 Error: 0.025 Type 2 Error: 0.15
