In [42]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.optimize import minimize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score
from sklearn.preprocessing import OneHotEncoder
from statsmodels.sandbox.regression.gmm import IV2SLS 
from statsmodels.sandbox.regression.gmm import GMM
import statsmodels.api as sm
import matplotlib as mp
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


In [19]:
df = pd.read_csv('/Users/youziya/OneDrive - York University/MBAN 5110 Predictive Modelling/Midterm/midterm_partone.csv')

In [20]:
df.head()

Unnamed: 0,Constant,Stock Change,Inventory Turnover,Operating Profit,Interaction Effect,Current Ratio,Quick Ratio,Debt Asset Ratio
0,1,0.870332,1.795946,0.115846,0.208053,1.672527,0.255171,0.473317
1,1,-0.047347,1.395501,0.436967,0.609788,1.637261,0.221763,0.489967
2,1,0.001176,1.664563,0.541016,0.900555,1.640619,0.189141,0.374269
3,1,-0.9012,1.605738,0.539399,0.866133,1.436221,0.131944,0.224399
4,1,-0.176353,1.591451,0.539938,0.859285,1.43314,0.183095,0.213446


In [21]:
#run the ols regression
model_iv = sm.OLS(df["Inventory Turnover"],df[["Constant","Current Ratio","Quick Ratio",\
                                                                 "Debt Asset Ratio"]]).fit()
#making predictions, note that only the independent variables are in predictions 
endog_predict = model_iv.predict(df[["Constant","Current Ratio","Quick Ratio","Debt Asset Ratio"]])
#adding predictions to data table
df["Endogenous Param"] = endog_predict

In [22]:
model_2sls = sm.OLS(df["Stock Change"], df[["Constant","Endogenous Param",\
                                                              "Operating Profit","Interaction Effect",\
                                                             ]]).fit()
model_2sls.summary()

0,1,2,3
Dep. Variable:,Stock Change,R-squared:,0.015
Model:,OLS,Adj. R-squared:,0.013
Method:,Least Squares,F-statistic:,8.53
Date:,"Sat, 11 Nov 2023",Prob (F-statistic):,1.27e-05
Time:,18:19:16,Log-Likelihood:,-1186.5
No. Observations:,1696,AIC:,2381.0
Df Residuals:,1692,BIC:,2403.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Constant,-0.0176,0.020,-0.896,0.370,-0.056,0.021
Endogenous Param,0.0011,0.001,1.827,0.068,-7.76e-05,0.002
Operating Profit,-0.1201,0.028,-4.319,0.000,-0.175,-0.066
Interaction Effect,0.0014,0.000,3.621,0.000,0.001,0.002

0,1,2,3
Omnibus:,368.832,Durbin-Watson:,2.243
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3433.92
Skew:,0.742,Prob(JB):,0.0
Kurtosis:,9.811,Cond. No.,109.0


### GMM

In [23]:
y_vals  = np.array(df["Stock Change"])
x_vals  = np.array(df[["Inventory Turnover","Operating Profit","Interaction Effect"]])
iv_vals = np.array(df[["Current Ratio","Quick Ratio","Debt Asset Ratio"]])

In [24]:
class gmm(GMM):
    def momcond(self, params):
        # Now includes delta as the last parameter
        p0, p1, p2, p3, delta = params
        endog = self.endog
        exog = self.exog
        inst = self.instrument

        # Calculate errors
        errors = endog - p0 - p1 * exog[:, 0] - p2 * exog[:, 1] - p3 * exog[:, 2]
        
        # Adjust moment conditions for delta
        g = np.column_stack((
            errors - delta,
            errors * exog[:, 0] - delta,
            errors * exog[:, 1] - delta,
            errors * inst[:, 0] - delta,
            errors * inst[:, 1] - delta,
            errors * inst[:, 2] - delta
        ))
        return g

# Update the initial values for the parameters, including delta
beta0 = np.array([0.1, 0.1, 0.1, 0.1, 0.1])  # Added an initial guess for delta

# Fit the model
res = gmm(endog=y_vals, exog=x_vals, instrument=iv_vals, k_moms=6, k_params=5).fit(beta0)

res.summary()

Optimization terminated successfully.
         Current function value: 0.000007
         Iterations: 9
         Function evaluations: 17
         Gradient evaluations: 17
Optimization terminated successfully.
         Current function value: 0.000167
         Iterations: 7
         Function evaluations: 15
         Gradient evaluations: 15
Optimization terminated successfully.
         Current function value: 0.000142
         Iterations: 19
         Function evaluations: 22
         Gradient evaluations: 22
Optimization terminated successfully.
         Current function value: 0.000143
         Iterations: 6
         Function evaluations: 9
         Gradient evaluations: 9
Optimization terminated successfully.
         Current function value: 0.000143
         Iterations: 2
         Function evaluations: 4
         Gradient evaluations: 4


0,1,2,3
Dep. Variable:,y,Hansen J:,0.2426
Model:,gmm,Prob (Hansen J):,0.622
Method:,GMM,,
Date:,"Sat, 11 Nov 2023",,
Time:,18:19:16,,
No. Observations:,1696,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
p 0,-0.2038,0.193,-1.056,0.291,-0.582,0.174
p 1,0.0046,0.004,1.091,0.275,-0.004,0.013
p 2,0.6549,0.769,0.852,0.394,-0.852,2.161
p 3,-0.0143,0.017,-0.844,0.399,-0.047,0.019
p 4,0.0071,0.005,1.423,0.155,-0.003,0.017


In [28]:
class gmm(GMM):
    def momcond(self, params):
        # Include delta as the last parameter
        p0, p1, p2, p3, delta = params
        endog = self.endog
        exog = self.exog
        inst = self.instrument   

        # Calculate errors
        errors = endog - p0 - p1 * exog[:, 0] - p2 * exog[:, 1] - p3 * exog[:, 2]

        # Adjust moment conditions with the delta term
        g = np.column_stack((
            errors - delta,
            (errors - delta) * exog[:, 0],
            (errors - delta) * exog[:, 1],
            (errors - delta) * exog[:, 2],
            (errors - delta) * inst[:, 0],
            (errors - delta) * inst[:, 1],
            (errors - delta) * inst[:, 2]
        ))
        return g

# Update the initial values for the parameters, including delta
beta0 = np.array([0.1, 0.1, 0.1, 0.1, 0.1])  # Added an initial guess for delta

# Fit the model
res = gmm(endog=y_vals, exog=x_vals, instrument=iv_vals, k_moms=7, k_params=5).fit(beta0)

res.summary()


Optimization terminated successfully.
         Current function value: 0.000101
         Iterations: 5
         Function evaluations: 10
         Gradient evaluations: 10
Optimization terminated successfully.
         Current function value: 0.001766
         Iterations: 8
         Function evaluations: 13
         Gradient evaluations: 13
Optimization terminated successfully.
         Current function value: 0.001744
         Iterations: 6
         Function evaluations: 12
         Gradient evaluations: 12
Optimization terminated successfully.
         Current function value: 0.001744
         Iterations: 1
         Function evaluations: 3
         Gradient evaluations: 3


0,1,2,3
Dep. Variable:,y,Hansen J:,2.958
Model:,gmm,Prob (Hansen J):,0.228
Method:,GMM,,
Date:,"Sat, 11 Nov 2023",,
Time:,18:21:11,,
No. Observations:,1696,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
p 0,-0.0006,8.15e+05,-7.02e-10,1.000,-1.6e+06,1.6e+06
p 1,0.0004,0.000,1.024,0.306,-0.000,0.001
p 2,-0.1188,0.031,-3.861,0.000,-0.179,-0.059
p 3,0.0014,0.000,3.645,0.000,0.001,0.002
p 4,-0.0006,8.15e+05,-7.02e-10,1.000,-1.6e+06,1.6e+06


Hansen J Statistic and its Probability:

The Hansen J statistic value is 0.2426, and the probability (p-value) associated with it is 0.622.
This high p-value (greater than the conventional threshold of 0.05) suggests that we cannot reject the null hypothesis of the validity of the instrumental variables. In simpler terms, the instruments appear to be appropriate for the model.
Coefficient Estimates:

The coefficients (p0, p1, p2, p3, p4) have been estimated with their respective standard errors and z-scores.
None of the coefficients appear to be statistically significant at conventional levels (e.g., 0.05) since all the p-values associated with them are greater than this threshold.
Interpretation of the Coefficients:

The coefficients represent the estimated impact of each variable (and potentially the bias 
δ) on the dependent variable 
The lack of statistical significance suggests that the data does not provide strong evidence to confirm the relationships modeled between these variables and the dependent variable.
Industry Expert's Claim:

The fact that the Hansen J test does not reject the null hypothesis and that the coefficients, including the term for 
δ, are not statistically significant, suggests that the data does not strongly support the expert's claim of a bias.
Considerations:

The results should be interpreted with caution, as the lack of statistical significance could also be due to other factors such as insufficient sample size, poor model specification, or weak instruments.
It's also important to consider the context and theoretical foundation of the model. If the expert's claim about the bias has a strong theoretical basis, it might still be worth considering despite the statistical results.
In summary, based on the GMM results, the data does not provide strong evidence to support the industry expert's claim of a bias in the moment conditions. However, careful consideration of the model specification, the validity of the instruments, and the theoretical underpinnings of the claim is essential before drawing a firm conclusion.

### Part2

In [30]:
df2 = pd.read_csv('/Users/youziya/OneDrive - York University/MBAN 5110 Predictive Modelling/Midterm/midterm_parttwo.csv')

In [31]:
df2.head()

Unnamed: 0,Years of Education after High School,Requested Credit Amount,Number of Dependents,Monthly Income,Monthly Expense,Marital Status,Credit Rating
0,1,Low,No dependent,Very low,Very low,Married,Positive
1,2,Low,No dependent,Very low,Very low,Single,Positive
2,1,Low,No dependent,Very low,Very low,Single,Positive
3,3,Low,No dependent,Very low,Very low,Married,Positive
4,3,Low,No dependent,Very low,Very low,Single,Negative


In [36]:
categorical_cols = ['Requested Credit Amount', 'Number of Dependents', 'Monthly Income', 
                    'Monthly Expense', 'Marital Status']

column_transformer = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(), categorical_cols)
], remainder='passthrough')

# Applying the transformation to the data
X = column_transformer.fit_transform(df2.drop('Credit Rating', axis=1))

# Encoding the target variable (Credit Rating)
y = df2['Credit Rating'].apply(lambda x: 1 if x == 'Positive' else 0)

# Splitting the data into training and test sets (50% each)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)


In [38]:
# Create a logistic regression model instance
model = LogisticRegression()

# Fit the model with training data
model.fit(X_train, y_train)

# Make predictions
predictions = model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [41]:
# Predicting on the test set
y_pred = model.predict(X_test)

# Calculating the confusion matrix, recall, precision, and F1 score
conf_matrix = confusion_matrix(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Confusion Matrix:")
print(conf_matrix)
print("\nRecall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)

Confusion Matrix:
[[   0  571]
 [   0 3470]]

Recall: 1.0
Precision: 0.8586983419945559
F1 Score: 0.9239781653574758


In [43]:
# Training a Random Forest Classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predicting probabilities on the test set
y_probs = clf.predict_proba(X_test)[:, 1]

# Finding the threshold that corresponds to the top 15% predictions
threshold = sorted(y_probs, reverse=True)[int(0.15 * len(y_probs))]

# Making predictions based on the new threshold
y_pred_new = (y_probs >= threshold).astype(int)

# Calculating the updated metrics
cm = confusion_matrix(y_test, y_pred_new)
recall = recall_score(y_test, y_pred_new)
precision = precision_score(y_test, y_pred_new)
f1 = f1_score(y_test, y_pred_new)

print("Threshold:", threshold)
print("Confusion Matrix:\n", cm)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)

Threshold: 0.9535677177444714
Confusion Matrix:
 [[ 480   91]
 [2941  529]]
Recall: 0.15244956772334295
Precision: 0.853225806451613
F1 Score: 0.258679706601467
