In [25]:
import pandas as pd
pd.set_option("max_colwidth", None)

import pycaret
import numpy as np
import matplotlib.pyplot as plt
from pycaret.classification import * 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from functions.homebrew import *
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA, QuadraticDiscriminantAnalysis as QDA
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from itertools import combinations
import pickle
import os

# If you're using statsmodels or ISLP for specific tasks, keep these imports
import statsmodels.api as sm
# Assuming ISLP and homebrew are custom modules specific to your project
from ISLP import load_data, confusion_table
from ISLP.models import ModelSpec as MS, summarize, contrast

# LOAD DATA

In [6]:
df = pd.read_csv('./data/df.csv').drop('Unnamed: 0', axis=1)

In [7]:
train = df[df['type'] == 'train'].drop('type',axis =1)
dev = df[df['type'] == 'dev'].drop('type',axis =1)
test = df[df['type'] == 'test'].drop('type',axis =1)

# VIF

In [8]:
dummies = pd.get_dummies(train, drop_first=True)

kept, removed = remove_high_vif_features(X=dummies.drop('target_No Donor', axis=1), y=dummies['target_No Donor'], vif_threshold=10)
print('REMOVED:', removed)


train = dummies.drop(columns=removed)
dev = pd.get_dummies(dev, drop_first=True).drop(columns=removed)
test = pd.get_dummies(test, drop_first=True).drop(columns=removed)

REMOVED: ['zipconvert5_Yes', 'avg_fam_inc', 'months_since_donate', 'med_fam_inc', 'avg_gift']


# Logistic Regression

In [20]:
# Selecting features and target variable for training data
X_train = train.drop(['target_No Donor'], axis =1 )
y_train = train['target_No Donor']
X_test = dev.drop(['target_No Donor'], axis = 1)
y_test = dev['target_No Donor']

# Fitting logistic regression model
glm = sm.GLM(y_train, X_train, family=sm.families.Binomial())
results = glm.fit()

# Summarizing results
print(results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:        target_No Donor   No. Observations:                 2400
Model:                            GLM   Df Residuals:                     2385
Model Family:                Binomial   Df Model:                           14
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -1644.6
Date:                Mon, 29 Apr 2024   Deviance:                       3289.2
Time:                        19:13:36   Pearson chi2:                 2.42e+03
No. Iterations:                     4   Pseudo R-squ. (CS):            0.01555
Covariance Type:            nonrobust                                         
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
num_child           0.2487      0.109     

### Dev Results

In [19]:
preds = (results.predict(X_test) >= 0.5).astype(int)
accuracy_score(preds, y_test)

0.53

### Trying with SkLearn

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
# Create a logistic regression model instance
logreg = LogisticRegression(max_iter=1000)  # Increase max_iter if convergence issues occur

# Fit the model on training data
logreg.fit(X_train, y_train)
# Predict on the test data
y_pred = logreg.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Optionally, print the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
# print("Confusion Matrix:\n", conf_matrix)


Accuracy: 0.54


In [15]:
lda = LDA(store_covariance=True)
lda.fit(X_train, y_train)

lda_preds = lda.predict(X_test)
confusion_table(lda_preds,y_test)

Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9,5
1,34,56


In [16]:
print(f'accuracy_score: {accuracy_score(lda_preds,y_test):.3}')

accuracy_score: 0.625


### (f) Repeat (d) using QDA.


In [27]:
qda = QDA(store_covariance=True)
qda.fit(X_train, y_train)

qda_preds = qda.predict(X_test)
confusion_table(qda_preds,y_test)

Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,26,20
1,286,268


In [28]:
print(f'accuracy_score: {accuracy_score(qda_preds,y_test):.3}')

accuracy_score: 0.49


### (g) Repeat (d) using KNN with K = 1.


In [29]:
knn1 = KNeighborsClassifier(n_neighbors=1)
knn1.fit(X_train, y_train)
knn1_pred = knn1.predict(X_test)
confusion_table(knn1_pred, y_test)

Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,148,135
1,164,153


In [30]:
print(f'accuracy_score: {accuracy_score(knn1_pred,y_test):.3}')

accuracy_score: 0.502


### (h) Repeat (d) using naive Bayes.


In [31]:
NB = GaussianNB()
NB.fit(X_train, y_train)
nb_preds = NB.predict(X_test)
confusion_table(nb_preds, y_test)

Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,38,25
1,274,263


In [32]:
print(f'accuracy_score: {accuracy_score(nb_preds, y_test):.3}')

accuracy_score: 0.502
