In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn import model_selection

df = pd.read_csv("../data/processed/wine_data_combined.csv")
cols_to_adjust = [x for x in df.columns if x not in ["quality", "is_red"]]
model_path = "../saved_models/"
retrain = True  # Set to True and rerun the model gridsearches

TODO:
add: knn, naieve bayes

In [2]:
import multiprocessing

cores = multiprocessing.cpu_count()

In [3]:
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import GaussianNB

In [4]:
df_train, df_test = model_selection.train_test_split(
    df, test_size=0.3, random_state=55, stratify=df["quality"]
)

In [5]:
from sklearn.preprocessing import StandardScaler 
from sklearn.compose import ColumnTransformer

stratified = model_selection.StratifiedKFold()
df_transform = ColumnTransformer(
    [(" ", StandardScaler(), cols_to_adjust)],
    remainder="passthrough",
)
df_train = pd.DataFrame(df_transform.fit_transform(df_train), columns=df.columns)
X_train = df_train.drop("quality", axis=1)
y_train = pd.Categorical(df_train["quality"], ordered=True)

# XGBoost

In [6]:
params_xgb = {'max_depth': np.arange(3, 20, 2),
             'colsample_bytree' : np.arange(0.5, 1, 0.1),
             'gamma': np.arange(1, 9, 0.1),
             "learning_rate": np.logspace(-5, -1, 20)}

In [7]:
xgboost = xgb.XGBClassifier()
xgb_grid_search = model_selection.GridSearchCV(
    xgboost, params_xgb, n_jobs=cores, cv=stratified
)


In [8]:
if retrain:
    xgb_grid_search.fit(X_train, y_train)
    joblib.dump(xgb_grid_search, f"{model_path}xgb_grid_search.joblib")

xgb_grid_search = joblib.load(f"{model_path}xgb_grid_search.joblib")
xgb_best_params = xgb.XGBClassifier(**xgb_grid_search.best_params_)
models = []
models.append(("xgb", xgb_best_params))

np.sum(xgb_best_params.fit(X_train, y_train).predict(X_train) == y_train) / X_train.shape[0]



# Multinomeal Logistic Regression

In [None]:
multi_logistic_reg = LogisticRegression(solver="saga", tol=1e-2, max_iter=500)
params_logistic = {"C": np.logspace(-5, 0, 100), "penalty": ["l1", "l2"]}
logistic_grid_search = model_selection.GridSearchCV(
    multi_logistic_reg, params_logistic, n_jobs=cores, cv=stratified
)

In [None]:
%%capture
# Not all of these converge given the low tolerance I set above
if retrain:
    logistic_grid_search.fit(X_train, y_train)
    joblib.dump(logistic_grid_search, "../saved_models/logistc_grid_search.joblib")

In [None]:
logistic_grid_search = joblib.load(f"{model_path}logistc_grid_search.joblib")
logistic_best_params = LogisticRegression(
    **logistic_grid_search.best_params_, solver="saga", tol=1e-2, max_iter=500
)
models.append(("logistic_reg", logistic_best_params))

np.sum(logistic_best_params.fit(X_train, y_train).predict(X_train) == y_train) / X_train.shape[0]

0.5396965031889158

# Support Vector Machines

In [None]:
support_vector_machine = svm.SVC()
params_svm = {'C': np.logspace(-3, 3, 50), 'gamma': np.logspace(1, -5, 50), 'kernel': ['rbf',  'sigmoid']}
svm_grid_search = model_selection.GridSearchCV(support_vector_machine, params_svm, n_jobs=cores, cv=stratified)
if retrain:
    svm_grid_search.fit(X_train, y_train)
    joblib.dump(svm_grid_search, f"{model_path}svm_gridsearch.joblib")

In [None]:
svm_grid_search = joblib.load(f"{model_path}svm_gridsearch.joblib")
svm_best_params = svm.SVC(**svm_grid_search.best_params_)
np.sum(svm_grid_search.predict(X_train) == y_train) / X_train.shape[0]
models.append(("svm", svm_best_params))

# Naïve Bayes

In [None]:

params_gnb = {'var_smoothing': np.logspace(0,-9, num=1000)}
gnb = GaussianNB()
nb_search = model_selection.GridSearchCV(gnb, params_gnb, n_jobs=cores, cv=stratified)

if retrain:
    nb_search.fit(X_train, y_train)
    joblib.dump(nb_search, f"{model_path}gnb.joblib")
nb_search = joblib.load(f"{model_path}gnb.joblib")
models.append(("gnb", gnb))
np.sum(gnb.fit(X_train, y_train).predict(X_train) == y_train) / X_train.shape[0]



0.35363976248075657

In [None]:
from sklearn.ensemble import VotingClassifier

ensemble = VotingClassifier(models, voting="soft")

if retrain:
    ensemble.fit(X_train, y_train)
    joblib.dump(ensemble, f"{model_path}ensemble.joblib")

ensemble = joblib.load(f"{model_path}ensemble.joblib")

In [None]:
from sklearn.pipeline import Pipeline

if retrain:
    pipeline = Pipeline([("scaler", df_transform), ("ensemble", ensemble)])
    pipeline.fit(X_train, y_train)
    joblib.dump(pipeline, f"{model_path}pipeline.joblib")

In [None]:
pipeline = joblib.load(f"{model_path}pipeline.joblib")
np.sum(pipeline.fit(X_train, y_train).predict(X_train) == y_train) / X_train.shape[0]

0.34154387508247197