## RED WINE QUALITY CLASSIFICATION 🍷
Bu notebookta sınıflandırma (classification) yaparak, kırmızı şarapların kalitesini tahmin etmeye çalışacağız. Veri setinde düşük (3-4), orta (5-6) ve yüksek (7-8) olarak farklı kalitelerde şaraplar bulunuyor. Amacımız, veri setindeki özellikleri kullanarak, şarapların kalite puanlarını doğru bir şekilde sınıflandırmak olacaktır.

In [1]:
# gerekli kütüphaneler
import numpy as np  # sayısal işlemler 
import pandas as pd  # veri manipülasyonu 
import seaborn as sns  # görselleştirme
import matplotlib.pyplot as plt  # görselleştirme

# görsel çıktılardaki uyarı mesajları için
import warnings 
warnings.filterwarnings('ignore')

# modellerin basarisini degerlendirmek icin
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# train ve test olarak bölmek için
from sklearn.model_selection import train_test_split

In [2]:
# temizlenmiş verinin wine değişkenine yüklenmesi
wine = pd.read_csv('redwine-clean.csv', index_col=0)
df = wine.copy()

In [3]:
# Modellerin olası bir hata vermesini önlemek için sütun isimlerindeki boşukları _ ile dolduruyoruz.
df.rename(columns = {'fixed acidity': 'fixed_acidity', 'volatile acidity': 'volatile_acidity',
                    'citric acid': 'citric_acid', 'residual sugar': 'residual_sugar',
                    'chlorides': 'chlorides', 'free sulfur dioxide': 'free_sulfur_dioxide',
                    'total sulfur dioxide': 'total_sulfur_dioxide'}, inplace = True)

In [4]:
# xgboost gibi bazı algoritmalar hedef değişkenin değerlerinin 0,1,2... 
# gibi olmasını ister dolayısıyla ona göre düzenleme yapıyoruz
df['quality'] = df['quality'].map({'yuksek': 2, 'orta': 1, 'zayif': 0})

In [5]:
# bağımlı ve bağımsız değişkenlerimizi belirliyoruz
y = df['quality']                    # bağımlı değişken
x = df.drop(['quality'], axis=1)     # bağımsız değişkenler

In [6]:
x.head(2)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8


In [7]:
y.head(2)

0    1
1    1
Name: quality, dtype: int64

In [7]:
# verinin train ve test olarak ikiye bölünmesi 
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

Veri setinin %80'i train seti, kalan % 20'si de test seti olarak ayırdık.

In [22]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(1279, 11)
(1279,)
(320, 11)
(320,)


In [9]:
df.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,1


In [18]:
df['quality'].unique()

array([1, 2, 0], dtype=int64)

In [17]:
df['quality'].value_counts()

1    1319
2     217
0      63
Name: quality, dtype: int64

In [8]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x = sc.fit_transform(x)
# --------------------------------
from sklearn.decomposition import PCA
pca = PCA()
x_pca = pca.fit_transform(x)
pca_new = PCA(n_components=8)
x_new = pca_new.fit_transform(x)
# -----------------------------------
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x_new, y, test_size = 0.2, random_state = 42)

# 📊 MODEL

## Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression
logr = LogisticRegression(solver='liblinear')
logr.fit(X_train,y_train)
y_pred_logr = logr.predict(X_test)
cm_logr = confusion_matrix(y_test,y_pred_logr)
logr_accuracy = accuracy_score(y_test, y_pred_logr)
print(cm_logr, '\n--------------------------\n', 'Logistic Regression Accuracy Score: ', logr_accuracy)

[[  0  11   0]
 [  0 252  10]
 [  0  37  10]] 
--------------------------
 Logistic Regression Accuracy Score:  0.81875


In [9]:
import optuna
from sklearn.metrics import accuracy_score

def objective(trial):
    optuna.logging.disable_default_handler()
    # Hiperparametre aralıklarını belirle
    C = trial.suggest_float("C", 1e-10, 1e10, log=True)
    solver = trial.suggest_categorical("solver", ["newton-cg", "lbfgs", "liblinear", "sag", "saga"])
    max_iter = trial.suggest_int("max_iter", 100, 1000)

    # Modeli oluştur
    logr = LogisticRegression(random_state=0, C=C, solver=solver, max_iter=max_iter)

    # Modeli eğit
    logr.fit(X_train, y_train)

    # Tahmin yap
    y_pred_logr = logr.predict(X_test)

    # Doğruluk skorunu döndür
    return accuracy_score(y_test, y_pred_logr)

# Optuna çalışmasını oluştur
study = optuna.create_study(direction="maximize")

# Optuna çalışmasını çalıştır
study.optimize(objective, n_trials=100)


# En iyi hiperparametreleri ve doğruluk skorunu yazdır
print("Best hyperparameters: ", study.best_params)
print("Best accuracy score: ", study.best_value)

[I 2023-06-10 22:01:26,619] A new study created in memory with name: no-name-8eda0b6d-bdf6-4405-b0f7-9cfc8efa0f2c


Best hyperparameters:  {'C': 760192413.5360472, 'solver': 'lbfgs', 'max_iter': 623}
Best accuracy score:  0.828125


## Support Vector Machine

In [10]:
from sklearn.svm import SVC
svc = SVC(kernel='rbf')
svc.fit(X_train,y_train)
y_pred_svc = svc.predict(X_test)
cm_svc = confusion_matrix(y_test,y_pred_svc)
svc_accuracy = accuracy_score(y_test, y_pred_svc)
print(cm_svc, '\n--------------------------\n', 'Support Vector Classifier Accuracy Score: ', svc_accuracy)

[[  0  11   0]
 [  0 259   3]
 [  0  36  11]] 
--------------------------
 Support Vector Classifier Accuracy Score:  0.84375


## Naive Bayes

In [11]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)
cm_gnb = confusion_matrix(y_test,y_pred_gnb)
gnb_accuracy = accuracy_score(y_test, y_pred_gnb)
print(cm_gnb, '\n--------------------------\n', 'Naive Bayes Accuracy Score: ', gnb_accuracy)

[[  2   9   0]
 [  8 241  13]
 [  1  32  14]] 
--------------------------
 Naive Bayes Accuracy Score:  0.803125


## K Neighbors Classifier

In [17]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1, metric='minkowski')
knn.fit(X_train,y_train)
y_pred_knn = knn.predict(X_test)
cm_knn = confusion_matrix(y_test,y_pred_knn)
knn_accuracy = accuracy_score(y_test, y_pred_knn)
print(cm_knn, '\n--------------------------\n', 'KNN Accuracy Score: ', knn_accuracy)

[[  2   9   0]
 [  4 236  22]
 [  1  24  22]] 
--------------------------
 KNN Accuracy Score:  0.8125


## Decision Tree

In [28]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(criterion = 'entropy')
dtc.fit(X_train,y_train)
y_pred_dtc = dtc.predict(X_test)
cm_dtc = confusion_matrix(y_test,y_pred_dtc)
dtc_accuracy = accuracy_score(y_test, y_pred_dtc)
print(cm_dtc, '\n--------------------------\n', 'Decision Tree Accuracy Score: ', dtc_accuracy)

[[  1  10   0]
 [  5 242  15]
 [  0  20  27]] 
--------------------------
 Decision Tree Accuracy Score:  0.84375


## Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)
y_pred_rfc = rfc.predict(X_test)
cm_rfc = confusion_matrix(y_test,y_pred_rfc)
rfc_accuracy = accuracy_score(y_test, y_pred_rfc)
print(cm_rfc, '\n--------------------------\n', 'Random Forest Accuracy Score: ', rfc_accuracy)

[[  1  10   0]
 [  0 258   4]
 [  0  23  24]] 
--------------------------
 Random Forest Accuracy Score:  0.884375


## XGBoost

In [13]:
import xgboost as xgb
xgb = xgb.XGBClassifier(objective='multiclass:softmax', num_class=3)
xgb.fit(X_train,y_train)
y_pred_xgb = xgb.predict(X_test)
cm_xgb = confusion_matrix(y_test,y_pred_xgb)
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
print(cm_xgb, '\n--------------------------\n', 'XGB Accuracy Score: ', xgb_accuracy)

[[  1  10   0]
 [  0 255   7]
 [  0  22  25]] 
--------------------------
 XGB Accuracy Score:  0.878125


In [37]:
import xgboost as xgb
import optuna
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

# Verilerinizi X_train, y_train, X_test, y_test şeklinde ayarlayın
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

def objective(trial):
    params = {
        'objective': 'multiclass:softmax',
        'num_class': 3,
        'eval_metric': 'mlogloss',
        'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_discrete_uniform('subsample', 0.6, 1, 0.1),
        'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.6, 1, 0.1),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),
        'seed': 42
    }

    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)

best_params = study.best_params
best_model = xgb.XGBClassifier(**best_params)
best_model.fit(X_train, y_train)
y_pred_xgb = best_model.predict(X_test)
cm_xgb = confusion_matrix(y_test, y_pred_xgb)
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)

print("Best params:", best_params)
print("Confusion matrix:", cm_xgb)
print("XGB Accuracy Score:", xgb_accuracy)

Parameters: { "colsample_bytree", "max_depth", "min_child_weight", "subsample" } are not used.

Best params: {'booster': 'dart', 'max_depth': 7, 'min_child_weight': 9, 'subsample': 0.8, 'colsample_bytree': 0.7, 'learning_rate': 0.09077420657550428, 'n_estimators': 800}
Confusion matrix: [[  0  11   0]
 [  2 252   8]
 [  0  17  30]]
XGB Accuracy Score: 0.88125


## LightGBM

In [14]:
import lightgbm as lgb
lgbm = lgb.LGBMClassifier(objective='multiclass', num_class=3)
lgbm.fit(X_train, y_train)
y_pred_lgbm = lgbm.predict(X_test)
cm_lgbm = confusion_matrix(y_test, y_pred_lgbm)
lgbm_accuracy = accuracy_score(y_test, y_pred_lgbm)
print(cm_lgbm, '\n--------------------------\n', 'LightGBM Accuracy Score: ', lgbm_accuracy)

[[  0  11   0]
 [  0 255   7]
 [  0  23  24]] 
--------------------------
 LightGBM Accuracy Score:  0.871875


## CatBoost

In [15]:
import catboost as cb
cb = cb.CatBoostClassifier(loss_function='MultiClass', classes_count=3, verbose=0)
cb.fit(X_train, y_train)
y_pred_cb = cb.predict(X_test)
cm_cb = confusion_matrix(y_test, y_pred_cb)
cb_accuracy = accuracy_score(y_test, y_pred_cb)
print(cm_cb, '\n--------------------------\n', 'CatBoost Accuracy Score: ', cb_accuracy)

[[  1  10   0]
 [  0 256   6]
 [  0  18  29]] 
--------------------------
 CatBoost Accuracy Score:  0.89375


## AdaBoost

In [16]:
from sklearn.ensemble import AdaBoostClassifier 
ab = AdaBoostClassifier()
ab.fit(X_train, y_train)
y_pred_ab = ab.predict(X_test)
cm_ab = confusion_matrix(y_test, y_pred_ab)
ab_accuracy = accuracy_score(y_test, y_pred_ab)
print(cm_ab, '\n--------------------------\n', 'AdaBoost Accuracy Score: ', ab_accuracy)

[[  1  10   0]
 [  6 236  20]
 [  0  24  23]] 
--------------------------
 AdaBoost Accuracy Score:  0.8125
