In [14]:
import pandas as pd
import numpy as np
import joblib
from sklearn import model_selection
from sklearn import metrics
df = pd.read_csv("../data/processed/wine_data_combined.csv")
cols_to_adjust = [x for x in df.columns if x not in ["quality", "is_red"]]
model_path = "../saved_models/"
retrain = False  # Set to True and rerun the model gridsearches

TODO:
add: knn

In [15]:
import multiprocessing

cores = multiprocessing.cpu_count()

In [16]:
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn import svm

In [17]:
df_train, df_test = model_selection.train_test_split(
    df, test_size=0.3, random_state=55, stratify=df["quality"]
)

In [18]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

stratified = model_selection.StratifiedKFold()
df_transform = ColumnTransformer(
    [(" ", StandardScaler(), cols_to_adjust)],
    remainder="passthrough",
)
df_train = pd.DataFrame(df_transform.fit_transform(df_train), columns=df.columns)
X_train = df_train.drop("quality", axis=1)
y_train = pd.Categorical(df_train["quality"], ordered=True)

# XGBoost

In [19]:
params_xgb = {
    "max_depth": np.arange(3, 20, 2),
    "colsample_bytree": np.arange(0.5, 1, 0.1),
    "learning_rate": np.logspace(-5, -1, 5),
}

In [20]:
xgboost = xgb.XGBClassifier()
xgb_grid_search = model_selection.GridSearchCV(
    xgboost, params_xgb, n_jobs=cores, cv=stratified
)

In [21]:
if retrain:
    xgb_grid_search.fit(X_train, y_train)
    joblib.dump(xgb_grid_search, f"{model_path}xgb_grid_search.joblib")

xgb_grid_search = joblib.load(f"{model_path}xgb_grid_search.joblib")
xgb_best_params = xgb.XGBClassifier(**xgb_grid_search.best_params_, probability=True)
models = []
models.append(("xgb", xgb_best_params))

np.sum(
    xgb_best_params.fit(X_train, y_train).predict(X_train) == y_train
) / X_train.shape[0]

0.9956014954915329

# Multinomeal Logistic Regression

In [22]:
multi_logistic_reg = LogisticRegression(solver="saga", tol=1e-2, max_iter=500)
params_logistic = {"C": np.logspace(-5, 0, 100), "penalty": ["l1", "l2"]}
logistic_grid_search = model_selection.GridSearchCV(
    multi_logistic_reg, params_logistic, n_jobs=cores, cv=stratified
)

In [23]:
%%capture
# Not all of these converge given the low tolerance I set above
if retrain:
    logistic_grid_search.fit(X_train, y_train)
    joblib.dump(logistic_grid_search, "../saved_models/logistc_grid_search.joblib")

In [70]:
logistic_grid_search = joblib.load(f"{model_path}logistc_grid_search.joblib")
logistic_best_params = LogisticRegression(
    **logistic_grid_search.best_params_, solver="saga", tol=1e-2, max_iter=500

)
models.append(("logistic_reg", logistic_best_params))

In [74]:
yhat = logistic_best_params.fit(X_train, y_train).predict(X_train)
yhat = pd.Categorical(df_train["quality"], ordered=True)
np.sum(
    yhat == y_train
) / X_train.shape[0]
yhat = logistic_best_params.fit(X_train, y_train).predict_proba(X_train)

metrics.roc_auc_score(y_train, yhat, multi_class='ovr')

0.7723537050158155

# Support Vector Machines

In [50]:
support_vector_machine = svm.SVC(gamma="auto", probability=True, kernel="rbf")
params_svm = {"C": np.logspace(-3, 3, 500)}
svm_grid_search = model_selection.GridSearchCV(
    support_vector_machine, params_svm, n_jobs=cores, cv=stratified
)
if retrain:
    svm_grid_search.fit(X_train, y_train)
    joblib.dump(svm_grid_search, f"{model_path}svm_gridsearch.joblib")



In [88]:
svm_grid_search = joblib.load(f"{model_path}svm_gridsearch.joblib")
svm_best_params = svm.SVC(
    gamma="auto", probability=True
)
svm_best_params.fit(X_train, y_train)
models.append(("svm", svm_best_params))
np.sum(svm_best_params.predict(X_train) == y_train) / X_train.shape[0]


metrics.roc_auc_score(y_train, svm_best_params.predict_proba(X_train), multi_class='ovo')

0.8491937710959737

# Ensemble

In [52]:
ensemble = VotingClassifier(models, voting="soft")

In [56]:
from sklearn.ensemble import VotingClassifier

ensemble = VotingClassifier(models, voting="soft")

if retrain:
    ensemble.fit(X_train, y_train)
    joblib.dump(ensemble, f"{model_path}ensemble.joblib")

ensemble = joblib.load(f"{model_path}ensemble.joblib")
ensemble

VotingClassifier(estimators=[('xgb',
                              XGBClassifier(colsample_bytree=0.5, max_depth=13,
                                            missing=nan,
                                            objective='multi:softprob',
                                            probability=True)),
                             ('logistic_reg',
                              LogisticRegression(C=0.03430469286314919,
                                                 max_iter=500, solver='saga',
                                                 tol=0.01)),
                             ('svm',
                              SVC(C=31.622776601683793, gamma='auto',
                                  probability=True))],
                 voting='soft')

In [57]:
from sklearn.pipeline import Pipeline

if retrain:
    pipeline = Pipeline([("scaler", df_transform), ("ensemble", ensemble)])
    pipeline.fit(X_train, y_train)
    joblib.dump(pipeline, f"{model_path}pipeline.joblib")

In [60]:
pipeline = joblib.load(f"{model_path}pipeline.joblib")
pipeline.predict(X_train)

array([7., 5., 5., ..., 5., 6., 6.])