In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split

# import the wine quality dataset (https://www.kaggle.com/datasets/uciml/red-wine-quality-cortez-et-al-2009)
wine = pd.read_csv('datasets/wine_quality.csv')
wine = wine[wine['quality'].isin([5, 6, 7])]

# sample 150 wines of each of the selected classes
wine = wine.groupby('quality').apply(lambda x: x.sample(150, random_state=0).reset_index(drop=True))
wine = wine.droplevel(level=0)

# select columns for training and testing
X = wine[wine.columns[:11]]
y = wine['quality']

# split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

X_train

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
29,15.6,0.685,0.76,3.7,0.100,6.0,43.0,1.00320,2.95,0.68,11.2
20,7.4,0.530,0.12,1.9,0.165,4.0,12.0,0.99702,3.26,0.86,9.2
109,8.7,0.480,0.30,2.8,0.066,10.0,28.0,0.99640,3.33,0.67,11.2
137,10.9,0.320,0.52,1.8,0.132,17.0,44.0,0.99734,3.28,0.77,11.5
14,7.5,0.410,0.15,3.7,0.104,29.0,94.0,0.99786,3.14,0.58,9.1
...,...,...,...,...,...,...,...,...,...,...,...
23,7.4,0.360,0.34,1.8,0.075,18.0,38.0,0.99330,3.38,0.88,13.6
42,8.2,0.640,0.27,2.0,0.095,5.0,77.0,0.99747,3.13,0.62,9.1
117,9.0,0.580,0.25,2.8,0.075,9.0,104.0,0.99779,3.23,0.57,9.7
47,7.9,0.660,0.00,1.4,0.096,6.0,13.0,0.99569,3.43,0.58,9.5


In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# classify the data using logistic regression
logistic_regression = LogisticRegression(max_iter=10000, random_state=0).fit(X_train, y_train)
print('Train accuracy:', '{:.2f}'.format(logistic_regression.score(X_train, y_train)))
print('Test accuracy:', '{:.2f}'.format(logistic_regression.score(X_test, y_test)))

# classify the data with svm
svm_classifier = SVC().fit(X_train, y_train)
print('\nSupport Vector Machine')
print('Train accuracy:', '{:.2f}'.format(svm_classifier.score(X_train, y_train)))
print('Test accuracy:', '{:.2f}'.format(svm_classifier.score(X_test, y_test)))

Train accuracy: 0.63
Test accuracy: 0.62

Support Vector Machine
Train accuracy: 0.48
Test accuracy: 0.46


In [36]:
from sklearn.decomposition import PCA

# reduce dimensions with PCA
pca = PCA(n_components=2).fit(X_train)
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

# train a new classifier for the PCA-reduced data
logistic_regression = LogisticRegression(random_state=0).fit(X_train_pca, y_train)
print('Logistic Regression')
print('Train accuracy:', '{:.2f}'.format(logistic_regression.score(X_train_pca, y_train)))
print('Test accuracy:', '{:.2f}'.format(logistic_regression.score(X_test_pca, y_test)))

# train a new classifier for the PCA-reduced data
svm_classifier = SVC().fit(X_train_pca, y_train)
print('\nSupport Vector Machine')
print('Train accuracy:', '{:.2f}'.format(svm_classifier.score(X_train_pca, y_train)))
print('Test accuracy:', '{:.2f}'.format(svm_classifier.score(X_test_pca, y_test)))

Logistic Regression
Train accuracy: 0.46
Test accuracy: 0.46

Support Vector Machine
Train accuracy: 0.48
Test accuracy: 0.48


In [37]:
from sklearn.manifold import SpectralEmbedding

# reduce dimensions with SE
X_train_se = SpectralEmbedding(n_components=2).fit_transform(X_train)

# train a new classifier for the SE-reduced data
logistic_regression = LogisticRegression(random_state=0).fit(X_train_se, y_train)
print('Logistic Regression')
print('Train accuracy:', '{:.2f}'.format(logistic_regression.score(X_train_se, y_train)))

# train a new classifier for the SE-reduced data
svm_classifier = SVC().fit(X_train_se, y_train)
print('\nSupport Vector Machine')
print('Train accuracy:', '{:.2f}'.format(svm_classifier.score(X_train_se, y_train)))

Logistic Regression
Train accuracy: 0.34

Support Vector Machine
Train accuracy: 0.48
