In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# null hypothesis- one categorical
# from scipy import stats

# F, p = stats.f_oneway(red_wine[red_wine['quality_label'] == 'low']['alcohol'],
# red_wine[red_wine['quality_label'] == 'medium']['alcohol'],
# red_wine[red_wine['quality_label'] == 'high']['alcohol'])
# print('ANOVA test for mean alcohol levels across wine samples with different quality ratings')
# print('F Statistic:', F, '\tp-value:', p)

## Data Preparatrion

In [3]:
red_wine_heatmap = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv', sep=';')


## dropping duplicates and reseting index

In [4]:

red_wine_heatmap.drop_duplicates(inplace=True)
red_wine_heatmap.reset_index(inplace=True)
red_wine_heatmap.drop(columns='index', inplace=True)


## add quality label

In [5]:
# we are creating a new column called "quality_label", we define a range and associate that range with a label
red_wine_heatmap['quality label'] = red_wine_heatmap['quality'].apply(lambda value: 'low'
if value <= 5 else 'medium'
if value <= 7 else 'high')

# here we are transforming these labels into categrical data type (specific to pandas) instead of simple string
red_wine_heatmap['quality label'] = pd.Categorical(red_wine_heatmap['quality label'],
categories=['low', 'medium', 'high'])


# hypothesis check

In [6]:
from scipy import stats

In [7]:
def anova_tests2(df):  
    
    features = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']
    
    for feature in features:
        F, p = stats.f_oneway(
            df[df['quality label'] == 'low'][feature],
            df[df['quality label'] == 'medium'][feature],
            df[df['quality label'] == 'high'][feature]
        )
        if p > 0.05:
         print(f"ANOVA test for mean {feature} across wine samples with different quality ratings")
         print('F Statistic:', F, '\tp-value:', p)
        else:
         print("there is correlation")

In [8]:
anova_tests2(red_wine_heatmap)


there is correlation
there is correlation
there is correlation
ANOVA test for mean residual sugar across wine samples with different quality ratings
F Statistic: 0.018769916621439144 	p-value: 0.9814053962580443
there is correlation
there is correlation
there is correlation
there is correlation
ANOVA test for mean pH across wine samples with different quality ratings
F Statistic: 0.4933049630979343 	p-value: 0.6107145654590622
there is correlation
there is correlation


## encoding and removing unnecessary columns

In [9]:
# changing wine type to 0 and 1
wine1=red_wine_heatmap.copy()

In [10]:
from sklearn.preprocessing import OrdinalEncoder
# Create an instance of OrdinalEncoder

In [11]:
ordinal_encoder = OrdinalEncoder(categories=[['low', 'medium', 'high']])
wine1['NumericalLabels'] = ordinal_encoder.fit_transform(wine1[['quality label']])

In [12]:
wine1.drop(columns='quality', inplace=True)

In [13]:
wine1.tail(100)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality label,NumericalLabels
1259,7.0,0.655,0.16,2.1,0.074,8.0,25.0,0.99606,3.37,0.55,9.7,low,0.0
1260,6.8,0.680,0.21,2.1,0.070,9.0,23.0,0.99546,3.38,0.60,10.3,low,0.0
1261,6.0,0.640,0.05,1.9,0.066,9.0,17.0,0.99496,3.52,0.78,10.6,low,0.0
1262,5.6,0.540,0.04,1.7,0.049,5.0,13.0,0.99420,3.72,0.58,11.4,low,0.0
1263,6.2,0.570,0.10,2.1,0.048,4.0,11.0,0.99448,3.44,0.76,10.8,medium,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1354,6.8,0.620,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5,medium,1.0
1355,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,low,0.0
1356,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,medium,1.0
1357,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,low,0.0


In [14]:
wine1.drop(columns='quality label', inplace=True)

In [15]:
wine1.shape

(1359, 12)

In [16]:
#functions for finding outliers
def find_outliers_IQR(df):

   q1=df.quantile(0.25)

   q3=df.quantile(0.75)

   IQR=q3-q1

   outliers = df[((df<(q1-1.5*IQR)) | (df>(q3+1.5*IQR)))]

   return outliers

In [17]:
#functions for removing outliers by column
def remove_outliers(column):
    mean_value = column.mean()
    std_dev = column.std()
    threshold = 3 * std_dev
    outliers_mask = (column - mean_value).abs() > threshold
    return column[~outliers_mask]



In [18]:
#importm library for dicidint the dataset into two 
from sklearn.model_selection import train_test_split

In [19]:
wine1.head(1)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,NumericalLabels
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0.0


## splitting

In [20]:
wine2=wine1.copy()

In [21]:
wine2.drop(columns='NumericalLabels', inplace=True)

In [22]:
X=wine2

In [23]:

y = wine1['NumericalLabels']  # --> what you're trying to predict
  # --> the features we will keep to build our model





In [24]:
X.shape

(1359, 11)

In [25]:
y.shape

(1359,)

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=100)

In [27]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1087, 11) (272, 11) (1087,) (272,)


# Logistic Regression Mathod

In [28]:
#building model Logistic Regression

from sklearn.linear_model import LogisticRegression

In [29]:
logistics_regression_model=LogisticRegression(random_state=52)

In [30]:
#fit the model
logistics_regression_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [31]:
# prediction
predictions=logistics_regression_model.predict(X_test)

In [32]:
predictions

array([1., 1., 1., 1., 0., 0., 1., 1., 1., 0., 1., 1., 0., 0., 0., 1., 0.,
       0., 1., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0.,
       1., 1., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1.,
       0., 1., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0.,
       0., 1., 1., 0., 0., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1.,
       1., 1., 1., 0., 0., 1., 0., 1., 1., 0., 1., 0., 0., 1., 0., 0., 1.,
       0., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 0., 1., 1., 1.,
       0., 0., 0., 0., 1., 1., 1., 1., 1., 0., 1., 0., 1., 0., 0., 1., 1.,
       1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 0., 1., 1., 1., 1., 1., 1.,
       1., 0., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 0., 0.,
       0., 1., 1., 0., 1., 0., 1., 1., 1., 1., 0., 0., 1., 0., 0., 0., 1.,
       0., 1., 0., 1., 1., 1., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       1., 0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0.,
       0., 0., 1., 0., 0.

In [33]:
from sklearn.metrics import accuracy_score

In [34]:
acc=accuracy_score(y_test, predictions)
acc

0.7463235294117647

In [35]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [36]:
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

print("\nClassification Report:")
print(classification_report(y_test, predictions))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, predictions))

Accuracy: 0.7463235294117647

Classification Report:
              precision    recall  f1-score   support

         0.0       0.77      0.72      0.75       136
         1.0       0.72      0.78      0.75       134
         2.0       0.00      0.00      0.00         2

    accuracy                           0.75       272
   macro avg       0.50      0.50      0.50       272
weighted avg       0.74      0.75      0.74       272


Confusion Matrix:
[[ 98  38   0]
 [ 29 105   0]
 [  0   2   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Random Forest model

In [37]:
from sklearn.ensemble import RandomForestClassifier


In [38]:
RF_model= RandomForestClassifier(n_estimators= 100, criterion='entropy', random_state=445)

In [39]:
RF_model.fit(X_train, y_train)

In [40]:
preds2= RF_model.predict(X_test)

In [41]:
preds2

array([1., 1., 1., 1., 0., 0., 1., 1., 1., 0., 1., 1., 0., 0., 0., 1., 0.,
       0., 1., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0.,
       1., 2., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1.,
       0., 1., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1.,
       0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1.,
       1., 0., 0., 0., 0., 1., 0., 1., 1., 0., 1., 0., 0., 1., 0., 0., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 0., 1., 1., 0.,
       0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 1., 0., 0., 1., 1.,
       1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1.,
       1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 0.,
       0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 0., 0., 1.,
       1., 0., 0., 1., 1., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       1., 0., 1., 1., 0., 1., 1., 1., 0., 0., 0., 0., 1., 0., 1., 0., 1.,
       0., 0., 1., 1., 0.

In [42]:
acc=accuracy_score(y_test, preds2)
acc

0.75

In [43]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# result

In [44]:
accuracy = accuracy_score(y_test, preds2)
print(f"Accuracy: {accuracy}")

print("\nClassification Report:")
print(classification_report(y_test, preds2))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, preds2))

Accuracy: 0.75

Classification Report:
              precision    recall  f1-score   support

         0.0       0.78      0.74      0.75       136
         1.0       0.73      0.78      0.75       134
         2.0       0.00      0.00      0.00         2

    accuracy                           0.75       272
   macro avg       0.50      0.50      0.50       272
weighted avg       0.75      0.75      0.75       272


Confusion Matrix:
[[100  36   0]
 [ 29 104   1]
 [  0   2   0]]


## KNN

In [45]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [46]:
scaler = StandardScaler()
X_train1 = scaler.fit_transform(X_train)
X_test1 = scaler.transform(X_test)

In [47]:
k = 3  # You can adjust this value based on your problem
knn_model = KNeighborsClassifier(n_neighbors=k)
knn_model.fit(X_train1, y_train)

In [48]:
y_pred = knn_model.predict(X_test1)

# result

In [49]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.6764705882352942

Classification Report:
              precision    recall  f1-score   support

         0.0       0.73      0.60      0.66       136
         1.0       0.65      0.76      0.70       134
         2.0       0.00      0.00      0.00         2

    accuracy                           0.68       272
   macro avg       0.46      0.45      0.45       272
weighted avg       0.68      0.68      0.67       272


Confusion Matrix:
[[ 82  54   0]
 [ 31 102   1]
 [  0   2   0]]


## Support Vector Machines (SVM)

In [50]:
# Import necessary libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report


In [51]:
X4=wine2
y4 = wine1['NumericalLabels']  # --> what you're trying to predict
  # --> the features we will keep to build our model

In [52]:
from sklearn.model_selection import train_test_split
X4_train, X4_test, y4_train, y4_test = train_test_split(X4,y4,test_size=0.2, random_state=100)

# Standardize the features (important for SVM)
scaler = StandardScaler()
X4_train = scaler.fit_transform(X4_train)
X4_test = scaler.transform(X4_test)

In [53]:
# Initialize the SVM classifier
svm_classifier = SVC(kernel='linear', C=1.0, random_state=42)

In [54]:


# Train the SVM classifier
svm_classifier.fit(X4_train, y4_train)

# Make predictions on the test set
prediction4 = svm_classifier.predict(X4_test)

## RESULT

In [55]:
# Evaluate the performance of the classifier
accuracy4 = accuracy_score(y4_test, prediction4)
report4 = classification_report(y4_test, prediction4)

# Print the results
print(f"Accuracy: {accuracy4:.2f}")
print("\nClassification Report:\n", report4)
print("\nConfusion Matrix:")
print(confusion_matrix(y4_test, prediction4))

Accuracy: 0.75

Classification Report:
               precision    recall  f1-score   support

         0.0       0.76      0.75      0.75       136
         1.0       0.74      0.75      0.75       134
         2.0       0.00      0.00      0.00         2

    accuracy                           0.75       272
   macro avg       0.50      0.50      0.50       272
weighted avg       0.74      0.75      0.74       272


Confusion Matrix:
[[102  34   0]
 [ 33 101   0]
 [  0   2   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
