In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline



In [2]:
X = pd.read_csv('data/X_KBest.csv')
y = pd.read_csv('data/Y_res.csv')
y = y.values

In [3]:
print(X.shape)
print(y.shape)

(2466, 20)
(2466, 1)


In [4]:
#import models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neural_network import MLPClassifier

# import model metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=111)

In [6]:
def print_report(Y_test, Y_pred):
    print('Accuracy Score: ', accuracy_score(Y_test, Y_pred))
    print('Confusion Matrix: \n', confusion_matrix(Y_test, Y_pred))
    print('Classification Report: \n', classification_report(Y_test, Y_pred))

In [7]:
#AdaBoost Implementation training
itr = 50
n = len(X_train)
w = np.full(n, (1/n))
models = []
alpha = []

X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
for i in range(itr):
    model = DecisionTreeClassifier(max_depth=2)
    train_data = X_train.sample(n, replace=False, weights=w)
    print(len(np.unique(train_data.index, return_counts=False)))
    train_label = y_train[train_data.index]
    model.fit(train_data, train_label)
    Y_pred = model.predict(X_train)
    error = 0
    for j in range(n):
        if Y_pred[j] != y_train[j]:
            error += w[j]
    alpha.append(0.5 * np.log((1-error)/error))
    for j in range(n):
        if Y_pred[j] == y_train[j]:
            w[j] = w[j] * np.exp(-alpha[i])
        else:
            w[j] = w[j] * np.exp(alpha[i])
    w = w/sum(w)
    models.append(model)
    print('Iteration: ', i+1, ' Error: ', error, ' Alpha: ', alpha[i])


1972
Iteration:  1  Error:  0.2707910750507087  Alpha:  0.49530634966040166
1972
Iteration:  2  Error:  0.5000000000000011  Alpha:  -2.2204460492503182e-15
1972
Iteration:  3  Error:  0.49999999999999417  Alpha:  1.1657341758564008e-14
1972
Iteration:  4  Error:  0.500000000000029  Alpha:  -5.795364188543653e-14
1972
Iteration:  5  Error:  0.4999999999999712  Alpha:  5.773159728050481e-14
1972
Iteration:  6  Error:  0.5000000000000303  Alpha:  -6.061817714453722e-14
1972
Iteration:  7  Error:  0.4999999999999715  Alpha:  5.695444116326729e-14
1972
Iteration:  8  Error:  0.5000000000000303  Alpha:  -6.061817714453722e-14
1972
Iteration:  9  Error:  0.49999999999997147  Alpha:  5.706546346572979e-14
1972
Iteration:  10  Error:  0.5000000000000294  Alpha:  -5.884182030513676e-14
1972
Iteration:  11  Error:  0.4999999999999715  Alpha:  5.695444116326729e-14
1972
Iteration:  12  Error:  0.5000000000000294  Alpha:  -5.884182030513676e-14
1972
Iteration:  13  Error:  0.49999999999997147  Alph

In [8]:
#testing AdaBoost
t_n = len(X_test)
Y_pred = []
for i in range(t_n):
    pred = 0
    for m in range(len(models)):
        pred += alpha[m] * models[m].predict([X_test.iloc[i]])
    if pred > 0:
        Y_pred.append(1)
    else:
        Y_pred.append(0)

print_report(y_test, Y_pred)


Accuracy Score:  0.7307692307692307
Confusion Matrix: 
 [[182  63]
 [ 70 179]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.72      0.74      0.73       245
           1       0.74      0.72      0.73       249

    accuracy                           0.73       494
   macro avg       0.73      0.73      0.73       494
weighted avg       0.73      0.73      0.73       494



In [9]:
model = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2), n_estimators=50)
model.fit(X_train, y_train)
Y_pred = model.predict(X_test)
print_report(y_test, Y_pred)

Accuracy Score:  0.8502024291497976
Confusion Matrix: 
 [[211  34]
 [ 40 209]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.84      0.86      0.85       245
           1       0.86      0.84      0.85       249

    accuracy                           0.85       494
   macro avg       0.85      0.85      0.85       494
weighted avg       0.85      0.85      0.85       494



In [10]:


X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)



In [11]:
def create_model(model, X_train, y_train, w, samples_len):
    train_data = X_train.sample(samples_len, replace=False, weights=w)
    train_label = y_train[train_data.index]
    model.fit(train_data, train_label)
    Y_pred = model.predict(X_train)
    error = 0
    for j in range(len(X_train)):
        if Y_pred[j] != y_train[j]:
            error += w[j]

    w_temp = w.copy()
    if error != 0:
        alpha = 0.5 * np.log((1-error)/error)
        for j in range(n):
            if Y_pred[j] == y_train[j]:
                w_temp[j] = w[j] * np.exp(-alpha)
            else:
                w_temp[j] = w[j] * np.exp(alpha)
    return model, error, w_temp


In [12]:
iter = 11
n = len(X_train)
samples_len = n//2
w = np.full(n, (1/n))
models = []
alpha = []
n1 = 1/n

for i in range(iter):
    model1, error1, w1 = create_model(LogisticRegression(C=0.1, max_iter=10, penalty='l2'), X_train, y_train, w, samples_len)
    model2, error2, w2 = create_model(KNeighborsClassifier(algorithm='auto', n_neighbors=3, weights='uniform'), X_train, y_train, w, samples_len)
    model3, error3, w3 = create_model(DecisionTreeClassifier(criterion='entropy', max_depth=2, splitter='best'), X_train, y_train, w, samples_len)
    model4, error4, w4 = create_model(MLPClassifier(activation='tanh', alpha=0.05, hidden_layer_sizes=(20, 30, 30, 20), learning_rate='adaptive', solver='adam'), X_train, y_train, w, samples_len)
    model5, error5, w5 = create_model(SVC(C=10, gamma=0.1, kernel='linear'), X_train, y_train, w, samples_len)
    model6, error6, w6 = create_model(RandomForestClassifier(criterion='entropy', max_depth=2, n_estimators=10), X_train, y_train, w, samples_len)
    model7, error7, w7 = create_model(AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2), n_estimators=50), X_train, y_train, w, samples_len)

    error = error1 + error2 + error3 + error4 + error5 + error6 + error7
    error = error/7
    alpha_round = 0.5 * np.log((1-error)/error)
    w = w1 + w2 + w3 + w4 + w5 + w6 + w7
    w = w/sum(w)

    models.append([model1, model2, model3, model4, model5, model6, model7])

    print('Login Regression Accuracy: ', accuracy_score(y_train, model1.predict(X_train)))
    print('KNN Accuracy: ', accuracy_score(y_train, model2.predict(X_train)))
    print('Decision Tree Accuracy: ', accuracy_score(y_train, model3.predict(X_train)))
    print('MLP Accuracy: ', accuracy_score(y_train, model4.predict(X_train)))
    print('SVM Accuracy: ', accuracy_score(y_train, model5.predict(X_train)))
    print('Random Forest Accuracy: ', accuracy_score(y_train, model6.predict(X_train)))
    print('AdaBoost Accuracy: ', accuracy_score(y_train, model7.predict(X_train)))
    print('Iteration: ', i+1, 'Weighted Error: ', error)
    print('\n\n')



Login Regression Accuracy:  0.579107505070994
KNN Accuracy:  0.7956389452332657
Decision Tree Accuracy:  0.723630831643002
MLP Accuracy:  0.5831643002028397
SVM Accuracy:  0.800709939148073
Random Forest Accuracy:  0.7601419878296146
AdaBoost Accuracy:  0.9143002028397565
Iteration:  1 Weighted Error:  0.2633294697189225



Login Regression Accuracy:  0.5704868154158215
KNN Accuracy:  0.7834685598377282
Decision Tree Accuracy:  0.72920892494929
MLP Accuracy:  0.5983772819472617
SVM Accuracy:  0.7931034482758621
Random Forest Accuracy:  0.7723123732251521
AdaBoost Accuracy:  0.8985801217038539
Iteration:  2 Weighted Error:  0.3096642011737805



Login Regression Accuracy:  0.5831643002028397
KNN Accuracy:  0.7900608519269777
Decision Tree Accuracy:  0.6926977687626775
MLP Accuracy:  0.5253549695740365
SVM Accuracy:  0.7925963488843814
Random Forest Accuracy:  0.7814401622718052
AdaBoost Accuracy:  0.907707910750507
Iteration:  3 Weighted Error:  0.3374448953443154



Login Regression Ac

In [13]:
# 1. Every model for each iteration accua
# 2. Ave error
# 3. 

In [14]:
#save models and alpha
import pickle
with open('models/models.pkl', 'wb') as f:
    pickle.dump(models, f)
with open('models/alpha.pkl', 'wb') as f:
    pickle.dump(alpha, f)



In [15]:
# #load models and alpha
# import pickle
# with open('models/models.pkl', 'rb') as f:
#     models = pickle.load(f)

# with open('models/alpha.pkl', 'rb') as f:
#     alpha = pickle.load(f)

In [16]:
def get_pred(x, models):
    pred = []
    for i in range(len(models)):
        pred.append(models[i].predict(x))
    pred = np.array(pred)
    u, c = np.unique(pred, return_counts=True)
    f_pred = u[np.argmax(c)]
    return f_pred

In [17]:
test_n = len(X_test)
Y_pred = np.zeros(test_n)
for i in range(test_n):
    x = X_test.iloc[i].values.reshape(1, -1)
    temp = []
    for m in range(len(models)):
        temp.append(get_pred(x, models[m]))

    u, c = np.unique(temp, return_counts=True)
    Y_pred[i] = u[np.argmax(c)]

print_report(y_test, Y_pred)

Accuracy Score:  0.8461538461538461
Confusion Matrix: 
 [[215  30]
 [ 46 203]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.82      0.88      0.85       245
           1       0.87      0.82      0.84       249

    accuracy                           0.85       494
   macro avg       0.85      0.85      0.85       494
weighted avg       0.85      0.85      0.85       494



# Unbiased Data

In [18]:
X = pd.read_csv('data/X_Unbiased.csv')
y = pd.read_csv('data/Y_Unbiased.csv')
y = y.values

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=111)

In [20]:
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)

In [21]:
iter = 11
n = len(X_train)
samples_len = n//2
w = np.full(n, (1/n))
models = []
alpha = []
n1 = 1/n

for i in range(iter):
    model1, error1, w1 = create_model(LogisticRegression(C=0.1, max_iter=10, penalty='l2'), X_train, y_train, w, samples_len)
    model2, error2, w2 = create_model(KNeighborsClassifier(algorithm='auto', n_neighbors=3, weights='uniform'), X_train, y_train, w, samples_len)
    model3, error3, w3 = create_model(DecisionTreeClassifier(criterion='entropy', max_depth=2, splitter='best'), X_train, y_train, w, samples_len)
    model4, error4, w4 = create_model(MLPClassifier(activation='tanh', alpha=0.05, hidden_layer_sizes=(20, 30, 30, 20), learning_rate='adaptive', solver='adam'), X_train, y_train, w, samples_len)
    model5, error5, w5 = create_model(SVC(C=10, gamma=0.1, kernel='linear'), X_train, y_train, w, samples_len)
    model6, error6, w6 = create_model(RandomForestClassifier(criterion='entropy', max_depth=2, n_estimators=10), X_train, y_train, w, samples_len)
    model7, error7, w7 = create_model(AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2), n_estimators=50), X_train, y_train, w, samples_len)

    error = error1 + error2 + error3 + error4 + error5 + error6 + error7
    error = error/7
    alpha_round = 0.5 * np.log((1-error)/error)
    w = w1 + w2 + w3 + w4 + w5 + w6 + w7
    w = w/sum(w)


    models.append([model1, model2, model3, model4, model5, model6, model7])
    print('Login Regression Accuracy: ', accuracy_score(y_train, model1.predict(X_train)))
    print('KNN Accuracy: ', accuracy_score(y_train, model2.predict(X_train)))
    print('Decision Tree Accuracy: ', accuracy_score(y_train, model3.predict(X_train)))
    print('MLP Accuracy: ', accuracy_score(y_train, model4.predict(X_train)))
    print('SVM Accuracy: ', accuracy_score(y_train, model5.predict(X_train)))
    print('Random Forest Accuracy: ', accuracy_score(y_train, model6.predict(X_train)))
    print('AdaBoost Accuracy: ', accuracy_score(y_train, model7.predict(X_train)))
    print('Iteration: ', i+1, 'Weighted Error: ', error)
    print('\n\n')



Login Regression Accuracy:  0.5841784989858012
KNN Accuracy:  0.7941176470588235
Decision Tree Accuracy:  0.7317444219066938
MLP Accuracy:  0.5821501014198783
SVM Accuracy:  0.8017241379310345
Random Forest Accuracy:  0.7616632860040567
AdaBoost Accuracy:  0.8950304259634888
Iteration:  1 Weighted Error:  0.26419878296146093



Login Regression Accuracy:  0.5836713995943205
KNN Accuracy:  0.8002028397565923
Decision Tree Accuracy:  0.72920892494929
MLP Accuracy:  0.5456389452332657
SVM Accuracy:  0.7865111561866126
Random Forest Accuracy:  0.7728194726166329
AdaBoost Accuracy:  0.8975659229208925
Iteration:  2 Weighted Error:  0.31192209346807415



Login Regression Accuracy:  0.5851926977687627
KNN Accuracy:  0.8017241379310345
Decision Tree Accuracy:  0.72920892494929
MLP Accuracy:  0.5938133874239351
SVM Accuracy:  0.8047667342799188
Random Forest Accuracy:  0.7981744421906694
AdaBoost Accuracy:  0.8899594320486816
Iteration:  3 Weighted Error:  0.3330644149875073



Login Regressio

In [22]:
#save models and alpha
import pickle
with open('models/models_unbiased.pkl', 'wb') as f:
    pickle.dump(models, f)
with open('models/alpha_unbiased.pkl', 'wb') as f:
    pickle.dump(alpha, f)



In [23]:
test_n = len(X_test)
Y_pred = np.zeros(test_n)
for i in range(test_n):
    x = X_test.iloc[i].values.reshape(1, -1)
    temp = []
    for m in range(len(models)):
        temp.append(get_pred(x, models[m]))

    u, c = np.unique(temp, return_counts=True)
    Y_pred[i] = u[np.argmax(c)]

print_report(y_test, Y_pred)

Accuracy Score:  0.8340080971659919
Confusion Matrix: 
 [[214  31]
 [ 51 198]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.81      0.87      0.84       245
           1       0.86      0.80      0.83       249

    accuracy                           0.83       494
   macro avg       0.84      0.83      0.83       494
weighted avg       0.84      0.83      0.83       494

