In [4]:
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
#Ensure Python can find the Code directory
sys.path.append(os.path.abspath(".."))

import utils

In [5]:
#Load the dataset through the project's utils module and preview the first five rows
df = utils.load_data()
df.head(n=5)

Unnamed: 0,activity_year,census_tract,derived_race,action_taken,loan_amount,property_value,income,interest_rate,tract_minority_population_percent,tract_to_msa_income_percentage,denial_reason-1,covenant_count,was_approved,covenant_density,county
0,2023,27037060000.0,Race Not Available,6,185000.0,195000,,6.125,40.37,73.48,10,0.0,False,,Hennepin
1,2023,27141030000.0,Race Not Available,6,375000.0,385000,,6.625,10.64,121.26,10,0.0,False,,Hennepin
2,2023,27023950000.0,Race Not Available,6,105000.0,105000,,6.125,8.92,90.13,10,0.0,False,,Hennepin
3,2023,27171100000.0,Race Not Available,6,285000.0,285000,,6.75,13.27,104.37,10,0.0,False,,Hennepin
4,2023,27159480000.0,Race Not Available,6,175000.0,175000,,6.875,5.87,82.22,10,0.0,False,,Hennepin


In [6]:
#Summarize the frame: row count, per-column non-null counts, and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201862 entries, 0 to 201861
Data columns (total 15 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   activity_year                      201862 non-null  int64  
 1   census_tract                       199830 non-null  float64
 2   derived_race                       201862 non-null  object 
 3   action_taken                       201862 non-null  int64  
 4   loan_amount                        201862 non-null  float64
 5   property_value                     177186 non-null  object 
 6   income                             176400 non-null  float64
 7   interest_rate                      166454 non-null  object 
 8   tract_minority_population_percent  201862 non-null  float64
 9   tract_to_msa_income_percentage     201862 non-null  float64
 10  denial_reason-1                    201862 non-null  int64  
 11  covenant_count                     2018

### Explore missing values

In [None]:
#Count the missing values in every column, broken down by census_tract.
#The null mask is built once for the whole frame and then summed per tract. This
#yields the same table as applying a lambda to each group (one row per tract, one
#column per remaining field) without running Python code per group and without
#depending on the include_groups argument of GroupBy.apply.
null_mask = df.drop(columns='census_tract').isnull()
missing = null_mask.groupby(df['census_tract']).sum()
missing.head()

In [None]:
#Fill missing property_value, income, and interest_rate values with the mean of
#the same column within the row's census_tract.
#df.info() reports property_value and interest_rate as object dtype, and a mean
#cannot be computed on an object column, so every column is coerced to numeric
#first. Entries that are not numbers become NaN and are then filled like any
#other gap.
# NOTE(review): presumably the non-numeric entries are placeholders such as
# "Exempt" - confirm against the raw data that treating them as missing is intended.
for column in ['property_value', 'income', 'interest_rate']:
    numeric_values = pd.to_numeric(df[column], errors='coerce')
    tract_means = numeric_values.groupby(df['census_tract']).transform('mean')
    #Rows without a census_tract, or in a tract with no observed value, stay NaN here
    df[column] = numeric_values.fillna(tract_means)

### Data Preparation for Classification

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedKFold, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report,f1_score, precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.pipeline import make_pipeline

In [None]:
#One-hot encode the categorical columns; the first level of each is dropped and
#the indicator columns are stored as int64
# NOTE(review): action_taken and denial_reason-1 are encoded as predictors while
# was_approved is later used as the target. If was_approved is derived from
# action_taken (not visible in this notebook), these columns leak the label - confirm.
categorical_features = ['derived_race', 'action_taken', 'denial_reason-1']
df_clean = pd.get_dummies(
    data=df,
    columns=categorical_features,
    drop_first=True,
    dtype='int64',
)

In [None]:
#Drop the columns that are not used as model features.
#county is removed as well: it is a text column (df.head() shows 'Hennepin') that
#is not part of categorical_features, so leaving it in would make the mean
#imputation that follows fail on non-numeric data.
columns_to_drop = [
    'covenant_density',
    'census_tract',
    'activity_year',
    'tract_minority_population_percent',
    'tract_to_msa_income_percentage',
    'covenant_count',
    'county',
]
#Reassign instead of mutating in place so the cell's result is explicit
df_clean = df_clean.drop(columns=columns_to_drop)

In [None]:
#Check the remaining columns, their dtypes, and non-null counts before imputing
df_clean.info()

In [None]:
#Replace the remaining NaNs with the mean of their column
from sklearn.impute import SimpleImputer
# NOTE(review): the imputer is fitted on the full table before the train/test
# split, and was_approved is still one of the columns at this point - confirm
# that both are intended.
imputer = SimpleImputer(strategy='mean')
df_clean_imputed = pd.DataFrame(
    data=imputer.fit_transform(df_clean),
    columns=df_clean.columns,
)

In [None]:
#Separate the target (was_approved) from the predictor columns
target_column = 'was_approved'
y = df_clean_imputed[target_column]
X = df_clean_imputed.drop(columns=[target_column])

In [None]:
#Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

### Predictive Modeling - Based on Hennepin County dataset

#### Logistic Regression

In [None]:
#Fit a logistic regression (liblinear solver, at most 1000 iterations) on the training split
lr = LogisticRegression(max_iter=1000, solver='liblinear').fit(X_train, y_train)
#Show the fitted estimator as the cell output
lr

In [None]:
#Evaluate the logistic regression accuracy on the training and test sets
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Train accuracy: {train_accuracy:.3f}")
print(f"Test accuracy: {test_accuracy:.3f}")

#Display the confusion matrices for the training and test sets.
#confusion_matrix expects (y_true, y_pred): rows are the actual classes and
#columns the predicted ones. Passing the predictions first transposes the table.
print('Confusion Matrix - Training Set')
print(confusion_matrix(y_train, y_train_pred))
print('Confusion Matrix - Test Set')
print(confusion_matrix(y_test, y_test_pred))

In [None]:
#Plot the prediction errors of the logistic regression on the test set.
#Each point is the actual label minus the predicted label; for 0/1 labels a value
#of 0 is a correct prediction and any other value is a misclassification.
y_pred_lr = lr.predict(X_test).astype(int)
prediction_errors = y_test - y_pred_lr
fig, ax = plt.subplots(figsize=(20, 10))
ax.scatter(range(len(y_pred_lr)), prediction_errors)
ax.axhline(0, color='black', lw=2)
#Label the figure so it can be read on its own
ax.set_title('Logistic Regression - Test Set Prediction Errors')
ax.set_xlabel('Test sample position')
ax.set_ylabel('Actual - predicted was_approved')
#Render the figure without echoing a matplotlib object repr as the cell output
plt.show()

#### KNN Classification

In [None]:
#Cross-validation scheme: 10 folds, repeated 3 times with a fixed seed
cv_method = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)
#Candidate neighbor counts: the odd values from 1 to 19
search_grid = {'n_neighbors': list(range(1, 20, 2))}
#Smallest gain in mean accuracy that counts as an improvement
tolerance = 0.001
#Number of consecutive non-improving candidates tolerated before stopping
patience = 3
#Search state
best_score = 0
best_n = None
best_model = None
wait = 0
# NOTE(review): X_train is not standardized for this distance-based model, unlike
# the SVC pipeline later in the notebook - confirm that is intended.
#Try the candidates in increasing order and stop early once accuracy stalls
for candidate_k in search_grid['n_neighbors']:
    candidate_knn = KNeighborsClassifier(n_neighbors=candidate_k)
    fold_scores = cross_val_score(
        candidate_knn,
        X_train,
        y_train,
        scoring='accuracy',
        cv=cv_method,
        n_jobs=-1,
    )
    mean_accuracy = fold_scores.mean()

    if mean_accuracy > best_score + tolerance:
        #New best: record it and reset the patience counter
        best_score, best_n, wait = mean_accuracy, candidate_k, 0
        continue

    #No meaningful improvement for this candidate
    wait += 1
    if wait >= patience:
        print(f"Early stopping triggered. Best n_neighbors: {best_n} with accuracy {best_score:.4f}")
        break

print(f"Final best n_neighbors: {best_n} with accuracy {best_score:.4f}")


In [None]:
#Refit KNN on the full training split using the neighbor count chosen by the search
k = best_n
knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
#Show the fitted estimator as the cell output
knn


In [None]:
#Evaluate the KNN accuracy on the training and test sets
y_train_pred = knn.predict(X_train)
y_test_pred = knn.predict(X_test)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Train accuracy: {train_accuracy:.3f}")
print(f"Test accuracy: {test_accuracy:.3f}")

#Display the confusion matrices for the training and test sets.
#confusion_matrix expects (y_true, y_pred): rows are the actual classes and
#columns the predicted ones. Passing the predictions first transposes the table.
print('Confusion Matrix - Training Set')
print(confusion_matrix(y_train, y_train_pred))
print('Confusion Matrix - Test Set')
print(confusion_matrix(y_test, y_test_pred))


#### Support Vector Classification

In [None]:
#Standardize the features, then fit an RBF-kernel support vector classifier
#(max_iter=-1 places no cap on solver iterations)
rbf_classifier = SVC(kernel='rbf', max_iter=-1)
SVCrbf = make_pipeline(StandardScaler(), rbf_classifier).fit(X_train, y_train)
#Show the fitted pipeline as the cell output
SVCrbf

In [None]:
#Evaluate the SVC pipeline accuracy on the training and test sets
y_train_pred = SVCrbf.predict(X_train)
y_test_pred = SVCrbf.predict(X_test)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Train accuracy: {train_accuracy:.3f}")
print(f"Test accuracy: {test_accuracy:.3f}")

#Display the confusion matrices for the training and test sets.
#confusion_matrix expects (y_true, y_pred): rows are the actual classes and
#columns the predicted ones. Passing the predictions first transposes the table.
print('Confusion Matrix - Training Set')
print(confusion_matrix(y_train, y_train_pred))
print('Confusion Matrix - Test Set')
print(confusion_matrix(y_test, y_test_pred))

#### Decision Tree Classification

In [None]:
#Initialize and fit the Decision Tree model (fixed seed for reproducibility)
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
#Evaluate the model accuracy on the training and test sets
y_train_pred = dt.predict(X_train)
y_test_pred = dt.predict(X_test)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Train accuracy: {train_accuracy:.3f}")
print(f"Test accuracy: {test_accuracy:.3f}")

#Display the confusion matrices for the training and test sets.
#confusion_matrix expects (y_true, y_pred): rows are the actual classes and
#columns the predicted ones. Passing the predictions first transposes the table.
print('Confusion Matrix - Training Set')
print(confusion_matrix(y_train, y_train_pred))
print('Confusion Matrix - Test Set')
print(confusion_matrix(y_test, y_test_pred))

#### Analysis and Comparison of Model Performance

### Predictive Modeling - Applying the models to Ramsey County dataset

### Limitations