In [131]:
import numpy as np
from tabulate import tabulate
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, roc_auc_score, auc, accuracy_score

import warnings
warnings.filterwarnings('ignore')

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80, random_state=4)

ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [132]:
# Part a
lrgs = GridSearchCV(
    estimator=LogisticRegression(),
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    param_grid={
        'solver': ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'],
        'max_iter': [100, 1000, 2500, 5000],
    }
)

lrgs.fit(X_train, y_train)
lrc = lrgs.best_estimator_
y_pred = lrc.predict(X_test)
acc_a = accuracy_score(y_test, y_pred)

print('Best Parameters for Logistic Regression:', lrgs.best_params_)
print('Best Train Accuracy Score for Logistic Regression:', round(lrgs.best_score_, 4))
print('Best Test Accuracy Score for Logistic Regression:', round(acc_a, 4))
print('Confusion Matrix for Logistic Regression:')
print(confusion_matrix(y_test, y_pred))

Best Parameters for Logistic Regression: {'max_iter': 100, 'solver': 'lbfgs'}
Best Train Accuracy Score for Logistic Regression: 0.9802
Best Test Accuracy Score for Logistic Regression: 0.9649
Confusion Matrix for Logistic Regression:
[[34  0]
 [ 4 76]]


In [133]:
# Part b
nbgs = GridSearchCV(
    estimator=GaussianNB(),
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    param_grid={
        'var_smoothing': np.logspace(0, -9, 100)
    }
)

nbgs.fit(X_train, y_train)
nbc = nbgs.best_estimator_
y_pred = nbc.predict(X_test)
acc_b = accuracy_score(y_test, y_pred)

print('Best Parameters for GaussianNB:', nbgs.best_params_)
print('Best Train Accuracy Score for GaussianNB:', round(nbgs.best_score_, 4))
print('Best Test Accuracy Score for GaussianNB:', round(acc_b, 4))
print('Confusion Matrix for GaussianNB:')
print(confusion_matrix(y_test, y_pred))

Best Parameters for GaussianNB: {'var_smoothing': 0.0533669923120631}
Best Train Accuracy Score for GaussianNB: 0.9362
Best Test Accuracy Score for GaussianNB: 0.9298
Confusion Matrix for GaussianNB:
[[32  2]
 [ 6 74]]


In [134]:
# Part c
scores = []
for i in range(X.shape[1]):
    clf = LogisticRegression(**lrgs.best_params_)
    clf.fit(X_train[:, i:i+1], y_train)
    # y_pred = clf.predict_proba(X_test[:, i:i+1])[:, 1]
    y_pred = clf.predict(X_test[:, i:i+1])
    scores.append({
        'feature': i,
        'score': roc_auc_score(y_test, y_pred),
        'model': clf,
    })
scores.sort(key=lambda x: x['score'], reverse=True)

fn = load_breast_cancer().feature_names
rows = list(map(lambda x: (fn[x['feature']], x['score']), scores))
rows = [['Feature', 'AUC Score']] + rows
table = tabulate(rows, headers='firstrow',
                 tablefmt='fancy_grid', floatfmt='.4f')

with open('P2_c.csv', 'w', encoding='utf-8') as f:
    for r in rows:
        f.write(','.join(map(lambda x: str(x).replace(',', ''), r)))
        f.write('\n')

print(table)


╒═════════════════════════╤═════════════╕
│ Feature                 │   AUC Score │
╞═════════════════════════╪═════════════╡
│ mean concave points     │      0.9415 │
├─────────────────────────┼─────────────┤
│ worst concave points    │      0.9290 │
├─────────────────────────┼─────────────┤
│ mean concavity          │      0.8890 │
├─────────────────────────┼─────────────┤
│ mean area               │      0.8765 │
├─────────────────────────┼─────────────┤
│ mean radius             │      0.8640 │
├─────────────────────────┼─────────────┤
│ worst concavity         │      0.8599 │
├─────────────────────────┼─────────────┤
│ worst radius            │      0.8555 │
├─────────────────────────┼─────────────┤
│ worst area              │      0.8555 │
├─────────────────────────┼─────────────┤
│ worst perimeter         │      0.8515 │
├─────────────────────────┼─────────────┤
│ mean perimeter          │      0.8430 │
├─────────────────────────┼─────────────┤
│ area error              │      0

In [135]:
# Part c

# Select best features
sf = [s['feature'] for s in scores[:20]]

lr = LogisticRegression(**lrgs.best_params_)
lr.fit(X_train[:, sf], y_train)
y_pred = lr.predict(X_test[:, sf])
acc_c_lr = accuracy_score(y_test, y_pred)

print('Accuracy Score for Logistic Regression:', round(acc_c_lr, 4))
print('Improvement for Logistic Regression Compared to Part a:', round(acc_c_lr - acc_a, 4))
print('Confusion Matrix for Logistic Regression:')
print(confusion_matrix(y_test, y_pred))
print()

nb = GaussianNB(**nbgs.best_params_)
nb.fit(X_train[:, sf], y_train)
y_pred = nb.predict(X_test[:, sf])
acc_c_nb = accuracy_score(y_test, y_pred)

print('Accuracy Score for GaussianNB:', round(acc_c_nb, 4))
print('Improvement for GaussianNB Compared to Part b:', round(acc_c_nb - acc_b, 4))
print('Confusion Matrix for GaussianNB:')
print(confusion_matrix(y_test, y_pred))

Accuracy Score for Logistic Regression: 0.9737
Improvement for Logistic Regression Compared to Part a: 0.0088
Confusion Matrix for Logistic Regression:
[[34  0]
 [ 3 77]]

Accuracy Score for GaussianNB: 0.9386
Improvement for GaussianNB Compared to Part b: 0.0088
Confusion Matrix for GaussianNB:
[[32  2]
 [ 5 75]]


In [136]:
# Part d
fm = {s['feature']: s['model'] for s in scores}


def extract_features(X):
    X_ = X.copy()
    for i in range(X.shape[0]):
        for j in range(X.shape[1]):
            X_[i, j] = fm[j].predict_proba(X[i:i+1, j:j+1])[0, 1]
    return X_


X_train_p = extract_features(X_train)
X_test_p = extract_features(X_test)

In [137]:
# Part d
lr = LogisticRegression(**lrgs.best_params_)
lr.fit(X_train_p, y_train)
y_pred = lr.predict(X_test_p)
acc_d_lr = accuracy_score(y_test, y_pred)
auc_d_lr = roc_auc_score(y_test, y_pred)

print('Accuracy Score for Logistic Regression:', round(acc_d_lr, 4))
print('AUC Score for Logistic Regression:', round(auc_d_lr, 4))
print('Improvement for Logistic Regression Compared to Part a:', round(acc_d_lr - acc_a, 4))
print('Improvement for Logistic Regression Compared to Part c:', round(acc_d_lr - acc_c_lr, 4))
print('Confusion Matrix for Logistic Regression:')
print(confusion_matrix(y_test, y_pred))
print()

nb = GaussianNB(**nbgs.best_params_)
nb.fit(X_train_p, y_train)
y_pred = nb.predict(X_test_p)
acc_d_nb = accuracy_score(y_test, y_pred)
auc_d_nb = roc_auc_score(y_test, y_pred)

print('Accuracy Score for GaussianNB:', round(acc_d_nb, 4))
print('AUC Score for GaussianNB:', round(auc_d_nb, 4))
print('Improvement for GaussianNB Compared to Part b:', round(acc_d_nb - acc_b, 4))
print('Improvement for GaussianNB Compared to Part c:', round(acc_d_nb - acc_c_nb, 4))
print('Confusion Matrix for GaussianNB:')
print(confusion_matrix(y_test, y_pred))
print()

mnb = MultinomialNB()
mnb.fit(X_train_p, y_train)
y_pred = mnb.predict(X_test_p)
acc_d_mnb = accuracy_score(y_test, y_pred)
auc_d_mnb = roc_auc_score(y_test, y_pred)

print('Accuracy Score for MultinomialNB:', round(acc_d_mnb, 4))
print('AUC Score for MultinomialNB:', round(auc_d_mnb, 4))
print('Confusion Matrix for MultinomialNB:')
print(confusion_matrix(y_test, y_pred))

Accuracy Score for Logistic Regression: 0.9298
AUC Score for Logistic Regression: 0.95
Improvement for Logistic Regression Compared to Part a: -0.0351
Improvement for Logistic Regression Compared to Part c: -0.0439
Confusion Matrix for Logistic Regression:
[[34  0]
 [ 8 72]]

Accuracy Score for GaussianNB: 0.886
AUC Score for GaussianNB: 0.9103
Improvement for GaussianNB Compared to Part b: -0.0439
Improvement for GaussianNB Compared to Part c: -0.0526
Confusion Matrix for GaussianNB:
[[33  1]
 [12 68]]

Accuracy Score for MultinomialNB: 0.9123
AUC Score for MultinomialNB: 0.8868
Confusion Matrix for MultinomialNB:
[[28  6]
 [ 4 76]]
