# IMPORTS 

In [4]:
import pandas as pd
pd.set_option("max_colwidth", None)

import pycaret
import numpy as np
import matplotlib.pyplot as plt
from pycaret.classification import * 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from functions.homebrew import *

# LOAD DATA

In [5]:
# Read the prepared dataset, then discard the leftover 'Unnamed: 0' column
# that the CSV carries.
df = pd.read_csv('./data/df.csv')
df = df.drop(columns='Unnamed: 0')

In [6]:
# Partition the rows by the value of the 'type' column into the three splits,
# removing that bookkeeping column from each resulting frame.
train, dev, test = (
    df.loc[df['type'] == split_name].drop(columns='type')
    for split_name in ('train', 'dev', 'test')
)

# VIF

In [21]:
# One-hot encode the training split; drop_first removes one level per
# categorical column so the remaining dummies are not perfectly collinear.
dummies = pd.get_dummies(train, drop_first=True)

# remove_high_vif_features comes from functions.homebrew. It is given the
# encoded predictors, the encoded target and a threshold of 10; its second
# return value is used below as the list of column names to discard.
kept, removed = remove_high_vif_features(
    X=dummies.drop('target_No Donor', axis=1),
    y=dummies['target_No Donor'],
    vif_threshold=10,
)
print('REMOVED:', removed)

new_train = dummies.drop(columns=removed)

# Encode dev/test with every level kept, then project onto the training
# schema. Encoding each split independently with drop_first=True can pick a
# different baseline level (or miss a level entirely) when a split does not
# contain every category seen in training, which would either raise KeyError
# on the drop or silently hand the model mis-labelled columns. Reindexing
# guarantees the same columns in the same order as new_train; a column that a
# split does not produce is filled with 0, and columns unknown to training
# are discarded.
new_dev = pd.get_dummies(dev).reindex(columns=new_train.columns, fill_value=0)
new_test = pd.get_dummies(test).reindex(columns=new_train.columns, fill_value=0)

REMOVED: ['zipconvert5_Yes', 'avg_fam_inc', 'months_since_donate', 'med_fam_inc', 'avg_gift']


In [22]:
# Class balance of the encoded training target (count of rows per label).
new_train.loc[:, 'target_No Donor'].value_counts()

1    1213
0    1187
Name: target_No Donor, dtype: int64

# ATTEMPT WITH PYCARET

In [23]:
from pycaret.classification import compare_models, setup

# Configure the PyCaret experiment on the VIF-filtered training frame.
# session_id pins the random seed; fix_imbalance asks PyCaret to resample the
# training portion.
exp = setup(
    data=new_train,
    target='target_No Donor',
    fix_imbalance=True,
    session_id=27,
)

# Cross-validate the candidate estimators and keep the top-ranked one.
best = compare_models()

Unnamed: 0,Description,Value
0,Session id,27
1,Target,target_No Donor
2,Target type,Binary
3,Original data shape,"(2400, 16)"
4,Transformed data shape,"(2418, 16)"
5,Transformed train set shape,"(1698, 16)"
6,Transformed test set shape,"(720, 16)"
7,Numeric features,15
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.5494,0.5596,0.5311,0.5573,0.5426,0.099,0.0995,0.029
ridge,Ridge Classifier,0.544,0.5588,0.5476,0.5501,0.5477,0.0878,0.0882,0.007
lda,Linear Discriminant Analysis,0.544,0.5587,0.5476,0.5501,0.5477,0.0878,0.0882,0.008
ada,Ada Boost Classifier,0.5327,0.5456,0.5205,0.5388,0.5281,0.0656,0.0659,0.021
gbc,Gradient Boosting Classifier,0.5095,0.5134,0.4994,0.5148,0.5061,0.0192,0.0192,0.05
lightgbm,Light Gradient Boosting Machine,0.506,0.5094,0.5088,0.5112,0.5093,0.0118,0.0118,0.069
dt,Decision Tree Classifier,0.5048,0.5046,0.5254,0.5096,0.5165,0.0091,0.009,0.009
nb,Naive Bayes,0.5042,0.5311,0.9058,0.5054,0.6485,-0.0004,-0.0003,0.008
svm,SVM - Linear Kernel,0.503,0.5359,0.4577,0.5282,0.3483,0.006,0.0076,0.007
qda,Quadratic Discriminant Analysis,0.4982,0.523,0.9176,0.502,0.6485,-0.0127,-0.0238,0.007


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

# DEV ACCURACY

In [41]:
# Score the model chosen by compare_models on the dev split. The target column
# is removed before prediction so the model only receives features.
# `preds` is bound once, directly to the predicted-label Series, instead of
# first holding the full prediction frame and then being overwritten.
preds = predict_model(best, data=new_dev.drop(columns='target_No Donor'))['prediction_label']
actual = new_dev['target_No Donor']

# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
# Accuracy itself is symmetric, so the value is unchanged, but keeping the
# documented order avoids wrong results if the metric is later swapped for an
# asymmetric one such as precision or recall.
accuracy_score(actual, preds)

0.5366666666666666

# FINE TUNING

In [50]:
# Hyperparameter search over the model picked by compare_models, selecting
# the candidate by cross-validated accuracy.
tuned_best = tune_model(
    best,
    optimize='Accuracy',
)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5714,0.57,0.4941,0.5915,0.5385,0.1444,0.1465
1,0.5655,0.5449,0.5765,0.5698,0.5731,0.1307,0.1307
2,0.5774,0.5929,0.6353,0.5745,0.6034,0.1536,0.1545
3,0.5357,0.56,0.5059,0.5443,0.5244,0.0721,0.0723
4,0.6131,0.6249,0.5647,0.6316,0.5963,0.2271,0.2284
5,0.5476,0.5402,0.5176,0.557,0.5366,0.0959,0.0961
6,0.5833,0.5926,0.6235,0.5824,0.6023,0.1658,0.1663
7,0.5476,0.5673,0.5412,0.5542,0.5476,0.0954,0.0954
8,0.4821,0.5016,0.5059,0.4886,0.4971,-0.0363,-0.0363
9,0.5119,0.5031,0.4167,0.5147,0.4605,0.0238,0.0243


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [51]:
# Score the tuned model on the dev split, for comparison against the untuned
# model's dev accuracy. The target column is removed before prediction so the
# model only receives features.
# `preds` is bound once, directly to the predicted-label Series, instead of
# first holding the full prediction frame and then being overwritten.
preds = predict_model(tuned_best, data=new_dev.drop(columns='target_No Donor'))['prediction_label']
actual = new_dev['target_No Donor']

# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
# Accuracy itself is symmetric, so the value is unchanged, but keeping the
# documented order avoids wrong results if the metric is later swapped for an
# asymmetric one such as precision or recall.
accuracy_score(actual, preds)

0.5316666666666666