In [1]:
import numpy as np
import warnings
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_curve, precision_recall_curve, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, scale
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

np.random.seed(42)

In [2]:
wine = pd.read_csv("~/ucare-summer2020/datasets/winequality-white.csv", sep=";")

In [3]:
X = wine.drop(columns=['quality'])
y = wine['quality']
y = y > 5
y = y.astype(int)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [12]:
def all_feature_ensemble_report_hard():
    
    dt_clf = DecisionTreeClassifier(max_depth=50, min_samples_leaf=1)
    dt_clf.fit(X_train, y_train)
    log_clf = Pipeline([
        ("scaler", StandardScaler()),
        ("log", LogisticRegression(C=10, max_iter=3000, solver='lbfgs', tol=0.001)),
    ])
    log_clf.fit(X_train, y_train)
    knn_clf = Pipeline([
        ("scaler", StandardScaler()),
        ("knn", KNeighborsClassifier(n_neighbors=9, p=1, weights='distance')),
    ])
    knn_clf.fit(X_train, y_train)
    svm_clf = Pipeline([
        ("scaler", StandardScaler()),
        ("svc", SVC(C=3, gamma=0.7, kernel='rbf', max_iter=20000)),
    ])
    svm_clf.fit(X_train, y_train)
    
    voting_clf = VotingClassifier(
        estimators=[('lr', log_clf), ('svc', svm_clf), ('dt', dt_clf), ('knn', knn_clf)],
        voting='hard')
    voting_clf.fit(X_train, y_train)
    
    classifiers = [dt_clf, log_clf, knn_clf, svm_clf, voting_clf]
    train_accs = []
    test_accs = []
    precs = []
    recalls = []
    f1s = []
    results = 0
    for c in classifiers:
        y_train_predicted = c.predict(X_train)
        y_test_predicted = c.predict(X_test)
        
        train_acc = np.mean(y_train_predicted == y_train)
        test_acc = np.mean(y_test_predicted == y_test)
        
        precision_test = precision_score(y_test, y_test_predicted) 
        recall_test = recall_score(y_test, y_test_predicted)
        f1_test = f1_score(y_test, y_test_predicted)
        
        train_accs.append(train_acc)
        test_accs.append(test_acc)
        precs.append(precision_test)
        recalls.append(recall_test)
        f1s.append(f1_test)
    results = pd.DataFrame({'train_acc': train_accs, 'test_acc': test_accs, 'precision': precs, 'recall': recalls, 'f1_score': f1s})
    results.index = ['decision_tree', 'logistic_regression', 'knn', 'svm_rbf', 'ensemble']
    return results

In [18]:
def all_feature_ensemble_report_soft():
    
    dt_clf = DecisionTreeClassifier(max_depth=50, min_samples_leaf=1)
    dt_clf.fit(X_train, y_train)
    log_clf = Pipeline([
        ("scaler", StandardScaler()),
        ("log", LogisticRegression(C=10, max_iter=3000, solver='lbfgs', tol=0.001)),
    ])
    log_clf.fit(X_train, y_train)
    knn_clf = Pipeline([
        ("scaler", StandardScaler()),
        ("knn", KNeighborsClassifier(n_neighbors=9, p=1, weights='distance')),
    ])
    knn_clf.fit(X_train, y_train)
    svm_clf = Pipeline([
        ("scaler", StandardScaler()),
        ("svc", SVC(C=3, gamma=0.7, kernel='rbf', max_iter=20000, probability=True)),
    ])
    svm_clf.fit(X_train, y_train)
    
    voting_clf = VotingClassifier(
        estimators=[('lr', log_clf), ('svc', svm_clf), ('dt', dt_clf), ('knn', knn_clf)],
        voting='soft')
    voting_clf.fit(X_train, y_train)
    
    classifiers = [dt_clf, log_clf, knn_clf, svm_clf, voting_clf]
    train_accs = []
    test_accs = []
    precs = []
    recalls = []
    f1s = []
    results = 0
    for c in classifiers:
        y_train_predicted = c.predict(X_train)
        y_test_predicted = c.predict(X_test)
        
        train_acc = np.mean(y_train_predicted == y_train)
        test_acc = np.mean(y_test_predicted == y_test)
        
        precision_test = precision_score(y_test, y_test_predicted) 
        recall_test = recall_score(y_test, y_test_predicted)
        f1_test = f1_score(y_test, y_test_predicted)
        
        train_accs.append(train_acc)
        test_accs.append(test_acc)
        precs.append(precision_test)
        recalls.append(recall_test)
        f1s.append(f1_test)
    results = pd.DataFrame({'train_acc': train_accs, 'test_acc': test_accs, 'precision': precs, 'recall': recalls, 'f1_score': f1s})
    results.index = ['decision_tree', 'logistic_regression', 'knn', 'svm_rbf', 'ensemble']
    return results

In [20]:
results_hard = all_feature_ensemble_report_hard()
results_hard

Unnamed: 0,train_acc,test_acc,precision,recall,f1_score
decision_tree,1.0,0.788776,0.847692,0.836115,0.841864
logistic_regression,0.757274,0.74898,0.777927,0.877086,0.824536
knn,1.0,0.826531,0.856934,0.890744,0.873512
svm_rbf,0.982899,0.816327,0.841655,0.895296,0.867647
ensemble,0.997448,0.830612,0.874052,0.874052,0.874052


In [21]:
results_soft = all_feature_ensemble_report_soft()
results_soft

Unnamed: 0,train_acc,test_acc,precision,recall,f1_score
decision_tree,1.0,0.788776,0.843465,0.842185,0.842825
logistic_regression,0.757274,0.74898,0.777927,0.877086,0.824536
knn,1.0,0.826531,0.856934,0.890744,0.873512
svm_rbf,0.982899,0.816327,0.841655,0.895296,0.867647
ensemble,1.0,0.834694,0.855508,0.907436,0.880707
