In [205]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

np.random.seed(42)

In [206]:
wine = datasets.load_wine(as_frame=True)
df_wine_data = wine['data']
labels = wine['target']
df = pd.concat([df_wine_data, labels], axis=1)
print(df.target.value_counts())
print(df.info())
print(df.shape)

1    71
0    59
2    48
Name: target, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alcohol                       178 non-null    float64
 1   malic_acid                    178 non-null    float64
 2   ash                           178 non-null    float64
 3   alcalinity_of_ash             178 non-null    float64
 4   magnesium                     178 non-null    float64
 5   total_phenols                 178 non-null    float64
 6   flavanoids                    178 non-null    float64
 7   nonflavanoid_phenols          178 non-null    float64
 8   proanthocyanins               178 non-null    float64
 9   color_intensity               178 non-null    float64
 10  hue                           178 non-null    float64
 11  od280/od315_of_diluted_wines  178 non-null    float64
 12  proline      

In [207]:
# check if there is any missing value
print(df.isnull().values.any())

False


In [208]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
for train_index, test_index in split.split(df, df['target']):
    train_set = df.loc[train_index]
    test_set = df.loc[test_index]


# data = df.drop("target", axis=1).copy().to_numpy()
# labels = df["target"].copy().to_numpy()

# train_data, test_data, train_labels, test_labels = train_test_split(data, labels, test_size=0.3, random_state=42)

print(train_set.target.value_counts()/len(train_set))
print(test_set.target.value_counts()/len(test_set))

train_data = train_set.drop("target", axis=1) 
train_labels = train_set["target"].copy().to_numpy()
print(train_data.shape)

test_data = test_set.drop("target", axis=1) 
test_labels = test_set["target"].copy().to_numpy()
print(test_data.shape)

1    0.403226
0    0.330645
2    0.266129
Name: target, dtype: float64
1    0.388889
0    0.333333
2    0.277778
Name: target, dtype: float64
(124, 13)
(54, 13)


In [209]:
# scaler = MinMaxScaler()
# # scaler = StandardScaler()
# train_scaled = scaler.fit_transform(train_data)

In [210]:
# NOTE: MinMaxScaler -> values are within the range 0 and 1
# NOTE: StandardScaler -> mean of 0 and standard deviation of 1
scaler = MinMaxScaler()
# scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_data.values)
# print(scaled_features)
# print(scaled_features.shape)
scaled_df = pd.DataFrame(train_scaled, index=train_set.index, columns=train_data.columns)

# NOTE: if Normalized, max -> 1, min -> 0
print(np.max(scaled_df.alcohol.to_numpy()))
print(np.min(scaled_df.alcohol.to_numpy()))

# NOTE: if Standardized, mean -> 0, std -> 1
# print(np.mean(scaled_df.alcohol.to_numpy()))
# print(np.std(scaled_df.alcohol.to_numpy()))

1.0
0.0


In [211]:
model = SVC(random_state=42)
# specify the parameter combinations to be tested
parameters = [
    {'C': [10, 100], 'gamma': [0.001, 0.0001], 'kernel': ['linear', 'rbf']}
]

# NOTE: becuase the metric has to be maximized we use "neg_mean_squared_error"
grid_search = GridSearchCV(model, parameters, cv=10, return_train_score=True, n_jobs=-1)
grid_search.fit(train_scaled, train_labels)

# print the best parameters
print(grid_search.best_params_)
# the best estimator
print(grid_search.best_estimator_)
# can also get the best results for each run with different feature combinations
cross_val_results = grid_search.cv_results_


{'C': 10, 'gamma': 0.001, 'kernel': 'linear'}
SVC(C=10, gamma=0.001, kernel='linear', random_state=42)


In [212]:
train_predictions = grid_search.best_estimator_.predict(train_scaled)
cls_report_train = classification_report(train_labels, train_predictions)
print(cls_report_train)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        41
           1       1.00      1.00      1.00        50
           2       1.00      1.00      1.00        33

    accuracy                           1.00       124
   macro avg       1.00      1.00      1.00       124
weighted avg       1.00      1.00      1.00       124



In [213]:
test_scaled = scaler.transform(test_data.values)
test_predictions = grid_search.best_estimator_.predict(test_scaled)
cls_report_test = classification_report(test_labels, test_predictions)
print(cls_report_test)

              precision    recall  f1-score   support

           0       0.95      1.00      0.97        18
           1       1.00      0.95      0.98        21
           2       1.00      1.00      1.00        15

    accuracy                           0.98        54
   macro avg       0.98      0.98      0.98        54
weighted avg       0.98      0.98      0.98        54



In [214]:
# test_scaled = scaler.transform(test_data)
# test_predictions = grid_search.best_estimator_.predict(test_scaled)
# cls_report_test = classification_report(test_labels, test_predictions)
# print(cls_report_test)