# Feature Engineering Testing
In this part, we take a look at our variables and determine which feature engineering methods prepare the data in a form that is easier for our models to interpret.

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

In [10]:
# Wine-quality data from the UCI archive: one semicolon-delimited CSV per wine colour.
red_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
white_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"

# Tag each frame with its colour while loading (`assign` returns a new frame, so the
# cell stays idempotent when re-run).
red_df = pd.read_csv(red_url, sep=";").assign(is_red=1)
white_df = pd.read_csv(white_url, sep=";").assign(is_red=0)

# ignore_index=True: both CSVs are read with a default 0..n-1 index, so a plain concat
# would leave duplicate row labels and make label-based lookups on df_raw ambiguous.
df_raw = pd.concat([red_df, white_df], ignore_index=True)

# Column names contain spaces (replaced here); underscores make them usable as attributes
# and in `query` strings.
df_raw.columns = [name.replace(" ", "_") for name in df_raw.columns]

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows, stratified on `quality` so both parts keep the same class mix.
# random_state pins the shuffle so the hold-out set is identical after a kernel restart.
# NOTE(review): `df_test_test` looks like a typo for `df_test_raw`; the name is kept so
# any cell that already refers to it keeps working.
df_train_raw, df_test_test = train_test_split(
    df_raw, test_size=0.2, stratify=df_raw["quality"], random_state=0
)

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [None]:
def get_auc(df, seed, target="quality", split=0.4):
    """Fit a logistic regression on a stratified split of ``df`` and score the held-out part.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``target`` column; every other column is used as a feature.
    seed : int
        Seed used for the train/validation split and for the solver's data shuffling.
    target : str, default "quality"
        Name of the label column.
    split : float, default 0.4
        Passed to ``train_test_split`` as ``test_size``.

    Returns
    -------
    float
        One-vs-one multi-class ROC AUC of the predicted class probabilities on the
        validation part.
    """
    X = df.drop(target, axis=1)
    y = pd.Categorical(df[target], ordered=True)

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=split, random_state=seed, stratify=y
    )

    # The saga solver shuffles the data while fitting; seed it as well, otherwise the
    # same `seed` can still yield different AUC values from run to run.
    model = LogisticRegression(max_iter=5000, solver="saga", random_state=seed)
    phat = model.fit(X_train, y_train).predict_proba(X_val)
    return roc_auc_score(y_val, phat, multi_class="ovo")

In [None]:
# Baseline on the un-engineered training split: repeat the evaluation over 100 split
# seeds to see how much the AUC varies with the split alone.
# Uses `df_train_raw` (the frame produced by the hold-out split); the previous name
# `df_train` is not defined by any cell, so the loop failed on a fresh kernel.
auc_list = []
for seed in range(100):
    auc = get_auc(df_train_raw, seed)
    auc_list.append(auc)
    print(auc)


0.5805827034020241




0.6295859960131898




0.6083587118455158




0.6132498477910834
0.6185813744363163
0.6024653784594871




0.6403211325853567
0.6034701716544334




0.641449560652575




0.6059551712908444
0.5726373442767267
0.5978714841340667
0.5782809979988455
0.6038808382406888
0.5811720032816435
0.6025163330883011




0.5992812629011274
0.6457453533585964
0.6313920639820478
0.6129766620202405
0.5622475777960515
0.5845389858458901




0.623075407225785
0.5903846372584854
0.5583281917051734




0.6026871814726787




0.6174868895064017
0.5911815107035805
0.6027571721152636
0.6056703101282661




0.659954409082519
0.6044171138722705
0.5786813041617083




0.6382417245894321


KeyboardInterrupt: 