# Feature Engineering Testing
In this part, we look at our variables and test which feature engineering methods (scaling and transformation) prepare the data in a form that is easier for our models to learn from.

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn import preprocessing

In [2]:
# Source CSVs for the red and white wine-quality tables (semicolon-delimited).
red_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
white_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"

# Load each table and tag it with a wine-colour indicator column.
red_df = pd.read_csv(red_url, sep=";").assign(is_red=1)
white_df = pd.read_csv(white_url, sep=";").assign(is_red=0)

# Stack red on top of white; spaces in column names become underscores.
df_raw = pd.concat([red_df, white_df]).rename(
    columns=lambda name: name.replace(" ", "_")
)

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as a test set. The split is stratified on the
# quality label and uses a fixed seed so it is reproducible.
df_train_raw, df_test_raw = train_test_split(
    df_raw,
    test_size=0.3,
    random_state=55,
    stratify=df_raw["quality"],
)

In [5]:
def get_auc(df, n_iters=250, target="quality", split=0.4):
    """Score a logistic regression by multiclass ROC AUC over repeated splits.

    For every seed in ``range(n_iters)`` the rows of ``df`` are divided into a
    stratified train/validation pair, a ``LogisticRegression`` (saga solver,
    ``max_iter=5000``) is fit on the training part, and the one-vs-one ROC AUC
    of its predicted class probabilities is measured on the validation part.

    Args:
        df: DataFrame containing the feature columns and the ``target`` column.
        n_iters: Number of splits to evaluate; split ``i`` uses seed ``i``.
        target: Name of the label column; all other columns are features.
        split: Passed to ``train_test_split`` as ``test_size``.

    Returns:
        A list with one AUC score per seed, in seed order.
    """
    # Features and labels do not depend on the seed, so build them once
    # instead of on every iteration.
    X = df.drop(target, axis=1)
    y = pd.Categorical(df[target], ordered=True)

    auc_list = []
    for seed in range(n_iters):
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=split, random_state=seed, stratify=y
        )

        model = LogisticRegression(max_iter=5000, solver="saga").fit(X_train, y_train)
        phat = model.predict_proba(X_val)
        auc_list.append(roc_auc_score(y_val, phat, multi_class="ovo"))
    return auc_list

In [6]:
from sklearn.compose import ColumnTransformer

# Every column except the label and the wine-colour flag gets scaled.
cols_to_adjust = [name for name in df_raw.columns if name not in ('quality', 'is_red')]

In [7]:
def _transformed_column_order(df, cols_adj):
    """Return df's column labels in ColumnTransformer output order, or None.

    ``None`` means the order cannot be derived safely (``cols_adj`` is not a
    duplicate-free collection of string labels that all exist in ``df``).
    """
    if isinstance(cols_adj, str) or not hasattr(cols_adj, "__iter__"):
        return None
    adjusted = list(cols_adj)
    if not all(isinstance(label, str) for label in adjusted):
        return None
    if not df.columns.is_unique or len(set(adjusted)) != len(adjusted):
        return None
    if not set(adjusted) <= set(df.columns):
        return None
    chosen = set(adjusted)
    # Transformed columns come first, passthrough columns follow in df order.
    return adjusted + [label for label in df.columns if label not in chosen]


def data_transformer(df, scaler, cols_adj=cols_to_adjust, cols=df_raw.columns):
    """Fit ``scaler`` on selected columns of ``df`` and return a labelled DataFrame.

    The columns in ``cols_adj`` are passed through ``scaler``; all remaining
    columns are passed through unchanged. The scaler is fit on every row of
    ``df``, so any split made afterwards (as ``get_auc`` does) evaluates on
    rows that already influenced the fitted scaling.

    Args:
        df: Input DataFrame.
        scaler: Transformer object handed to ``ColumnTransformer``.
        cols_adj: Columns to transform.
        cols: Column labels (and order) of the returned DataFrame.

    Returns:
        A new DataFrame with a fresh default index and columns ``cols``.
    """
    transform = ColumnTransformer([(' ', scaler, cols_adj)], remainder='passthrough')
    values = transform.fit_transform(df)

    # ColumnTransformer puts the transformed columns first and appends the
    # passthrough columns on the right. Labelling the array positionally with
    # ``cols`` is therefore only correct when ``cols_adj`` is a prefix of
    # ``cols``. Label by the real output order, then reorder to ``cols``.
    labels = list(cols)
    out_cols = _transformed_column_order(df, cols_adj)
    can_label_by_name = (
        out_cols is not None
        and values.shape[1] == len(out_cols)
        and len(set(labels)) == len(labels)
        and set(labels) == set(out_cols)
    )
    if can_label_by_name:
        return pd.DataFrame(values, columns=out_cols)[labels]

    # ``cols`` does not name the same columns as ``df`` (e.g. the caller is
    # renaming, or the transformer changed the column count): keep the
    # original positional labelling.
    return pd.DataFrame(values, columns=cols)

In [8]:
# Reference scores: logistic regression on the untransformed training data.
baseline_results = get_auc(df=df_train_raw)

In [None]:
# Standardise the adjustable columns, then score the result.
df_std_train = data_transformer(df=df_train_raw, scaler=preprocessing.StandardScaler())

std_scale_results = get_auc(df=df_std_train)

In [None]:
# Rescale the adjustable columns with min-max scaling, then score the result.
df_min_max = data_transformer(df=df_train_raw, scaler=preprocessing.MinMaxScaler())

min_max_results = get_auc(df=df_min_max)

In [None]:
# RobustScaler centres on the median and scales by an inter-quantile range, so
# outliers have little influence on the scaling (they are not removed).
# quantile_range is expressed in percentiles (0-100): the 20th-80th percentile
# band is (20.0, 80.0), whereas (.2, .8) would select the 0.2th-0.8th percentile.
df_robust_scaler = data_transformer(df_train_raw, preprocessing.RobustScaler(quantile_range=(20.0, 80.0)))
robscale_results = get_auc(df_robust_scaler)

In [None]:
# Yeo-Johnson power transform (PowerTransformer's default method, spelled out here).
df_pwr_transform = data_transformer(
    df=df_train_raw,
    scaler=preprocessing.PowerTransformer(method="yeo-johnson"),
)
pwr_transform_results = get_auc(df=df_pwr_transform)

# Output of the cell above (appears to be an excerpt of a library warning, not notebook code):
#   loglike = -n_samples / 2 * np.log(x_trans.var())


In [None]:
# Map the adjustable columns through a quantile transform, then score the result.
df_quantile_transfmr = data_transformer(df=df_train_raw, scaler=preprocessing.QuantileTransformer())

quant_transfmr_results = get_auc(df=df_quantile_transfmr)

In [None]:
# NOTE(review): presumably an audible "finished" signal for the long-running
# cells above; beepy is a third-party package — confirm it is installed.
from beepy import beep
beep()

In [None]:
# Pair up the per-seed scores: one row per seed, one column per experiment,
# in the order listed here (the CSV columns are written as 0..5).
res_zip = zip(
    baseline_results,
    std_scale_results,
    min_max_results,
    robscale_results,
    pwr_transform_results,
    quant_transfmr_results,
)

pd.DataFrame(res_zip).to_csv(path_or_buf="../result_logs/transformations.csv")