In [10]:
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from scipy.optimize import minimize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score
from sklearn.preprocessing import OneHotEncoder


In [11]:
# Folder that holds the midterm CSV files. The default is the original
# author's location; set the MIDTERM_DATA_DIR environment variable to run the
# notebook from another machine without editing this cell.
DATA_DIR = Path(os.environ.get(
    'MIDTERM_DATA_DIR',
    '/Users/youziya/OneDrive - York University/MBAN 5110 Predictive Modelling/Midterm',
))

# Part one data set, used by the GMM estimation cell.
df = pd.read_csv(DATA_DIR / 'midterm_partone.csv')

In [9]:
# Inputs of the linear model: response, regressors, and instruments
Y = df['Stock Change']
X = df[[
    'Constant',
    'Inventory Turnover',
    'Operating Profit',
    'Interaction Effect',
    'Current Ratio',
    'Quick Ratio',
    'Debt Asset Ratio',
]]
# The regressors double as their own instruments (Z is the same object as X)
Z = X

# Moment vector of the linear model
def moment_conditions(params, endog, exog, instruments):
    """Return the instrument-weighted residual sums ``Z'(y - X b)``.

    Parameters
    ----------
    params : array-like
        Coefficient vector ``b``.
    endog : array-like
        Dependent variable ``y``.
    exog : array-like
        Regressor matrix ``X``.
    instruments : array-like exposing ``.T``
        Instrument matrix ``Z``.

    Returns
    -------
    The product ``np.dot(Z.T, y - X b)``: one entry per instrument column.
    The entries are sums over observations; they are not divided by the
    sample size.
    """
    fitted = np.dot(exog, params)
    resid = endog - fitted
    return np.dot(instruments.T, resid)

# Quadratic-form GMM criterion
def gmm_objective(params, endog, exog, instruments, W):
    """Evaluate the GMM criterion ``g' W g`` at ``params``.

    ``g`` is the vector returned by ``moment_conditions`` for the same
    ``params``, ``endog``, ``exog`` and ``instruments``; ``W`` is the
    weighting matrix applied to it.
    """
    g = moment_conditions(params, endog, exog, instruments)
    weighted = np.dot(g, W)
    return np.dot(weighted, g)

# First-step weighting matrix: identity with one row/column per instrument
initial_W = np.eye(Z.shape[1])

# Start the search from an all-zero coefficient vector
initial_params = np.zeros(X.shape[1])

# Minimise the GMM objective over the coefficients
res = minimize(
    fun=gmm_objective,
    x0=initial_params,
    args=(Y, X, Z, initial_W),
    method='BFGS'
)

# minimize() reports failure through res.success rather than by raising, so
# surface a non-converged fit instead of silently using res.x downstream.
if not res.success:
    warnings.warn(f"GMM optimisation (without bias term) did not converge: {res.message}")

# Estimate the bias term delta as the mean, across instruments, of the
# instrument-weighted residual sums evaluated at the fitted coefficients
estimated_residuals = Y - np.dot(X, res.x)
delta_estimate = np.dot(Z.T, estimated_residuals).mean()

# Moment vector shifted by a common bias term
def biased_moment_conditions(params, endog, exog, instruments, delta):
    """Return ``Z'(y - X b)`` with ``delta`` subtracted from every entry.

    Parameters
    ----------
    params : array-like
        Coefficient vector ``b``.
    endog : array-like
        Dependent variable ``y``.
    exog : array-like
        Regressor matrix ``X``.
    instruments : array-like exposing ``.T`` and ``.shape``
        Instrument matrix ``Z``.
    delta : scalar
        Bias term removed from each moment.

    Returns
    -------
    ``np.dot(Z.T, y - X b) - delta * 1``, one entry per instrument column.
    """
    fitted = np.dot(exog, params)
    resid = endog - fitted
    # Same shift for every instrument: a vector of ones scaled by delta
    shift = delta * np.ones(instruments.shape[1])
    return np.dot(instruments.T, resid) - shift

# Quadratic-form GMM criterion built on the bias-shifted moments
def gmm_objective_with_bias(params, endog, exog, instruments, W, delta):
    """Evaluate ``g' W g`` where ``g`` comes from ``biased_moment_conditions``.

    ``delta`` is passed through unchanged to ``biased_moment_conditions``;
    ``W`` is the weighting matrix applied to the resulting moment vector.
    """
    g = biased_moment_conditions(params, endog, exog, instruments, delta)
    weighted = np.dot(W, g)
    return np.dot(g.T, weighted)

# Minimize the GMM objective with the bias term included
res_with_bias = minimize(
    fun=gmm_objective_with_bias,
    x0=res.x,  # Use the parameters estimated from the unbiased model as starting values
    args=(Y, X, Z, initial_W, delta_estimate),
    method='BFGS'
)

# minimize() reports failure through the success flag rather than by raising,
# so flag a non-converged fit before its coefficients are reported below.
if not res_with_bias.success:
    warnings.warn(f"GMM optimisation (with bias term) did not converge: {res_with_bias.message}")

# Output the estimated parameters and the bias term
print("Estimated parameters without bias term:", res.x)
print("Estimated bias term delta:", delta_estimate)
print("Estimated parameters with bias term:", res_with_bias.x)


Estimated parameters without bias term: [ 5.04816747e-02 -9.41202869e-05 -1.16354477e-01  1.36601795e-03
 -3.73644421e-02  3.23013255e-02 -1.54543871e-04]
Estimated bias term delta: -0.6499832798045937
Estimated parameters with bias term: [ 5.04865181e-02 -9.43949281e-05 -1.16353086e-01  1.36617673e-03
 -3.73482580e-02  3.23108198e-02 -1.53977719e-04]


In [14]:
# Folder that holds the midterm CSV files. The default is the original
# author's location; set the MIDTERM_DATA_DIR environment variable to run the
# notebook from another machine without editing this cell.
DATA_DIR = Path(os.environ.get(
    'MIDTERM_DATA_DIR',
    '/Users/youziya/OneDrive - York University/MBAN 5110 Predictive Modelling/Midterm',
))

# Part two data set, used by the credit-rating classification cells.
df2 = pd.read_csv(DATA_DIR / 'midterm_parttwo.csv')

### Original Matrix

In [15]:

# One-hot encode the categorical variables.
# The encoder is left at its default sparse output and densified with
# .toarray(): the `sparse=False` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so this form runs on old and new
# releases alike and still yields a dense array.
encoder = OneHotEncoder(drop='first')  # drop='first' to avoid dummy variable trap
categorical_columns = ['Requested Credit Amount', 'Number of Dependents', 'Monthly Income', 'Monthly Expense', 'Marital Status']
X_categorical = encoder.fit_transform(df2[categorical_columns]).toarray()

# Combine the one-hot encoded columns with the continuous ones
X_continuous = df2[['Years of Education after High School']].values
X = np.hstack((X_continuous, X_categorical))

# Convert the target variable 'Credit Rating' into binary (1 = 'Positive', 0 = anything else)
Y = (df2['Credit Rating'] == 'Positive').astype(int).values

# Split the data into training (50%) and test (50%) sets; fixed seed for reproducibility
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5, random_state=0)

# Fit the logistic regression model on the training set
log_reg = LogisticRegression(max_iter=1000)  # Increased max_iter for convergence
log_reg.fit(X_train, Y_train)

# Predict the Credit Rating on the test set (default decision rule of the classifier)
Y_pred = log_reg.predict(X_test)

# Calculate the confusion matrix, recall, precision, and F1 score for the default predictions
conf_matrix = confusion_matrix(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)

# Get the predicted probabilities for the positive class (1)
Y_probs = log_reg.predict_proba(X_test)[:, 1]

# Find the threshold that targets 15% of applications being approved:
# the 85th percentile of the predicted probabilities (100% - 15% = 85%)
threshold_15_percent = np.percentile(Y_probs, 85)

# Apply the new threshold to determine credit approvals.
# `>=` keeps every application whose probability equals the cut-off, so the
# approved share can end up above 15% when many probabilities are tied.
Y_pred_adjusted = (Y_probs >= threshold_15_percent).astype(int)

# Calculate the new confusion matrix, recall, precision, and F1 score with the adjusted threshold
conf_matrix_adjusted = confusion_matrix(Y_test, Y_pred_adjusted)
recall_adjusted = recall_score(Y_test, Y_pred_adjusted)
precision_adjusted = precision_score(Y_test, Y_pred_adjusted)
f1_adjusted = f1_score(Y_test, Y_pred_adjusted)

# Output the new evaluation metrics with the adjusted threshold
(conf_matrix_adjusted, recall_adjusted, precision_adjusted, f1_adjusted, threshold_15_percent)


(array([[ 485,   86],
        [2846,  624]]),
 0.17982708933717578,
 0.8788732394366198,
 0.2985645933014354,
 0.8785019080284596)

### New Bank Metric

In [16]:

# One-hot encode the categorical variables.
# The encoder is left at its default sparse output and densified with
# .toarray(): the `sparse=False` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so this form runs on old and new
# releases alike and still yields a dense array.
encoder = OneHotEncoder(drop='first')  # Avoid dummy variable trap
categorical_columns = ['Requested Credit Amount', 'Number of Dependents', 
                       'Monthly Income', 'Monthly Expense', 'Marital Status']
X_categorical = encoder.fit_transform(df2[categorical_columns]).toarray()

# Combine the one-hot encoded columns with the continuous ones
X_continuous = df2[['Years of Education after High School']].values
X = np.hstack((X_continuous, X_categorical))

# Convert the target variable 'Credit Rating' to binary (1 = 'Positive', 0 = anything else)
Y = (df2['Credit Rating'] == 'Positive').astype(int).values

# Split the data into training (50%) and test (50%) sets; fixed seed for reproducibility
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5, random_state=0)

# Fit the logistic regression model on the training set
log_reg = LogisticRegression(max_iter=1000)  # Increased max_iter for convergence
log_reg.fit(X_train, Y_train)

# Predict the Credit Rating on the test set using the default threshold
Y_pred = log_reg.predict(X_test)

# Calculate the confusion matrix, recall, precision, and F1 score for the default predictions
conf_matrix = confusion_matrix(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)

# Get the predicted probabilities for the positive class (1)
Y_probs = log_reg.predict_proba(X_test)[:, 1]

# Find the threshold that targets 15% of applications being approved:
# the 85th percentile of the predicted probabilities (100% - 15% = 85%)
threshold_15_percent = np.percentile(Y_probs, 85)

# Apply the new threshold to determine credit approvals.
# `>=` keeps every application whose probability equals the cut-off, so the
# approved share can end up above 15% when many probabilities are tied.
Y_pred_adjusted = (Y_probs >= threshold_15_percent).astype(int)

# Calculate the new confusion matrix, recall, precision, and F1 score with the adjusted threshold
conf_matrix_adjusted = confusion_matrix(Y_test, Y_pred_adjusted)
recall_adjusted = recall_score(Y_test, Y_pred_adjusted)
precision_adjusted = precision_score(Y_test, Y_pred_adjusted)
f1_adjusted = f1_score(Y_test, Y_pred_adjusted)

# Metrics under the classifier's default decision rule
original_metrics = {
    "Confusion Matrix": conf_matrix.tolist(),
    "Recall": recall,
    "Precision": precision,
    "F1 Score": f1
}

# Metrics under the percentile-based threshold, plus the threshold itself
adjusted_metrics = {
    "Confusion Matrix": conf_matrix_adjusted.tolist(),
    "Recall": recall_adjusted,
    "Precision": precision_adjusted,
    "F1 Score": f1_adjusted,
    "Threshold": threshold_15_percent
}

# Display both sets side by side as the cell output
original_metrics, adjusted_metrics


({'Confusion Matrix': [[0, 571], [0, 3470]],
  'Recall': 1.0,
  'Precision': 0.8586983419945559,
  'F1 Score': 0.9239781653574758},
 {'Confusion Matrix': [[485, 86], [2846, 624]],
  'Recall': 0.17982708933717578,
  'Precision': 0.8788732394366198,
  'F1 Score': 0.2985645933014354,
  'Threshold': 0.8785019080284596})