In [2]:
from preamble import *

In [3]:
from sklearn.datasets import load_wine

wine = load_wine()
X = pd.DataFrame(wine.data, columns=wine.feature_names)
y = wine.target

print(X.head())
print(X.describe())
print(f"Target classes: {set(y)}")

   alcohol  malic_acid   ash  alcalinity_of_ash  ...  color_intensity   hue  \
0    14.23        1.71  2.43               15.6  ...             5.64  1.04   
1    13.20        1.78  2.14               11.2  ...             4.38  1.05   
2    13.16        2.36  2.67               18.6  ...             5.68  1.03   
3    14.37        1.95  2.50               16.8  ...             7.80  0.86   
4    13.24        2.59  2.87               21.0  ...             4.32  1.04   

   od280/od315_of_diluted_wines  proline  
0                          3.92   1065.0  
1                          3.40   1050.0  
2                          3.17   1185.0  
3                          3.45   1480.0  
4                          2.93    735.0  

[5 rows x 13 columns]
       alcohol  malic_acid     ash  alcalinity_of_ash  ...  color_intensity  \
count   178.00      178.00  178.00             178.00  ...           178.00   
mean     13.00        2.34    2.37              19.49  ...             5.06   
std    

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)

In [5]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

mm_scaler = MinMaxScaler()
X_train_minmax = mm_scaler.fit_transform(X_train)
X_test_minmax = mm_scaler.fit_transform(X_test)

poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.fit_transform(X_test)

In [6]:
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif

pca = PCA(n_components=5)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.fit_transform(X_test_scaled)

selector = SelectKBest(score_func=f_classif, k=8)
X_trian_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.fit_transform(X_test, y_test)

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

lr = LogisticRegression(max_iter=5000)
svm = SVC(kernel='linear', probability=True)
rf = RandomForestClassifier(random_state=22)

lr.fit(X_train_scaled, y_train)
svm.fit(X_train_scaled, y_train)
rf.fit(X_train_scaled, y_train)

In [8]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

models = {'LogisticRegression\t' : lr, 'SVM\t\t\t' : svm, 'RandomForestClassifier\t' : rf}
feature_engineerings = {'StandardSclaer':[X_train_scaled, X_test_scaled],
                        'MinMaxScaler':[X_train_minmax, X_test_minmax],
                        'PolynomialFeatures':[X_train_poly, X_test_poly],
                        'PCA':[X_train_pca, X_test_pca],
                        'SelectKBest':[X_trian_selected, X_test_selected]}

for method, train_test_Xset in feature_engineerings.items():
    print(method)
    for name, model in models.items():
        model.fit(train_test_Xset[0], y_train)
        y_pred = model.predict(train_test_Xset[1])
        print(f"{name} - Accuracy: {accuracy_score(y_test, y_pred):.4f}, "
            f"Precision: {precision_score(y_test, y_pred, average='weighted', zero_division=0):.4f}, "
            f"Recall: {recall_score(y_test, y_pred, average='weighted', zero_division=0):.4f}, "
            f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")
    print('\n')

StandardSclaer
LogisticRegression	 - Accuracy: 0.9722, Precision: 0.9750, Recall: 0.9722, F1 Score: 0.9725
SVM			 - Accuracy: 0.9444, Precision: 0.9472, Recall: 0.9444, F1 Score: 0.9445
RandomForestClassifier	 - Accuracy: 0.9722, Precision: 0.9750, Recall: 0.9722, F1 Score: 0.9725


MinMaxScaler
LogisticRegression	 - Accuracy: 0.9722, Precision: 0.9750, Recall: 0.9722, F1 Score: 0.9725
SVM			 - Accuracy: 0.9444, Precision: 0.9495, Recall: 0.9444, F1 Score: 0.9439
RandomForestClassifier	 - Accuracy: 0.9722, Precision: 0.9750, Recall: 0.9722, F1 Score: 0.9725


PolynomialFeatures
LogisticRegression	 - Accuracy: 0.8611, Precision: 0.8716, Recall: 0.8611, F1 Score: 0.8626
SVM			 - Accuracy: 0.9722, Precision: 0.9750, Recall: 0.9722, F1 Score: 0.9725
RandomForestClassifier	 - Accuracy: 0.9722, Precision: 0.9750, Recall: 0.9722, F1 Score: 0.9725


PCA
LogisticRegression	 - Accuracy: 0.9444, Precision: 0.9472, Recall: 0.9444, F1 Score: 0.9445
SVM			 - Accuracy: 0.9167, Precision: 0.9249, Reca

In [10]:
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}
grid = GridSearchCV(RandomForestClassifier(random_state=22), param_grid, cv=5, scoring='accuracy')

In [12]:
for method, train_test_Xset in feature_engineerings.items():
    grid.fit(train_test_Xset[0], y_train)
    print(f"RandomForest using {method}\nBest parameters: {grid.best_params_}, Best score: {grid.best_score_}")

RandomForest using StandardSclaer
Best parameters: {'max_depth': None, 'n_estimators': 50}, Best score: 0.9788177339901478
RandomForest using MinMaxScaler
Best parameters: {'max_depth': None, 'n_estimators': 50}, Best score: 0.9788177339901478
RandomForest using PolynomialFeatures
Best parameters: {'max_depth': None, 'n_estimators': 50}, Best score: 0.9719211822660098
RandomForest using PCA
Best parameters: {'max_depth': None, 'n_estimators': 50}, Best score: 0.9578817733990148
RandomForest using SelectKBest
Best parameters: {'max_depth': None, 'n_estimators': 100}, Best score: 0.9788177339901478
