In [1]:
#classification using sklearn
import os

import matplotlib.pyplot as plt
import pandas as pd
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC

In [2]:
# Location of the semicolon-separated wine-quality CSV. Set the
# WINE_QUALITY_CSV environment variable to point at your own copy; the
# fallback is the original author's local download path.
# NOTE(review): the file loaded here is the *white*-wine dataset even though
# the frame is named `red_wine`; the name is kept because later cells use it.
WINE_CSV_PATH = os.environ.get(
    "WINE_QUALITY_CSV",
    r"C:\Users\zhang\Downloads\wine+quality\winequality-white.csv",
)
red_wine = pd.read_csv(WINE_CSV_PATH, sep=';')
red_wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [3]:
# Column dtypes / non-null counts, then the distinct raw quality scores.
red_wine.info()
pd.unique(red_wine['quality'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB


array([6, 5, 7, 8, 4, 3, 9], dtype=int64)

In [4]:
# Number of missing values in each column.
red_wine.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [5]:
# Collapse the numeric quality score into two classes using right-closed
# intervals: (2, 6.5] -> 'bad' and (6.5, 9] -> 'good'.
# This overwrites the 'quality' column in place.
group_name = ['bad', 'good']
bins = (2, 6.5, 9)
red_wine['quality'] = pd.cut(
    red_wine['quality'],
    bins=bins,
    labels=group_name,
)

In [6]:
# Turn the 'bad'/'good' categories into integer codes and show how many
# rows fall into each class.
label_quality = LabelEncoder()
quality_codes = label_quality.fit_transform(red_wine['quality'])
red_wine['quality'] = quality_codes
red_wine['quality'].value_counts()

quality
0    3838
1    1060
Name: count, dtype: int64

In [7]:
# Target is the encoded quality label; features are every other column.
y = red_wine['quality']
X = red_wine.drop(columns='quality')

In [8]:
# Hold out 20% of the rows for testing (seeded so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features: the scaler learns its statistics from the
# training rows only and is then applied to both partitions.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)



In [9]:
# Random forest with 200 trees. The forest is stochastic, so the seed is
# pinned (same value as the train/test split) to make the predictions and
# the report below reproducible across kernel restarts.
rfc = RandomForestClassifier(n_estimators=200, random_state=42)
rfc.fit(X_train, y_train)
prd_rfc = rfc.predict(X_test)

In [10]:
# Per-class precision/recall/F1 followed by the confusion matrix for the
# random-forest predictions.
print(
    classification_report(y_test, prd_rfc),
    confusion_matrix(y_test, prd_rfc),
    sep='\n',
)

              precision    recall  f1-score   support

           0       0.90      0.97      0.93       753
           1       0.85      0.65      0.74       227

    accuracy                           0.89       980
   macro avg       0.88      0.81      0.83       980
weighted avg       0.89      0.89      0.89       980

[[728  25]
 [ 80 147]]


In [11]:
# Support-vector classifier with default hyper-parameters, trained on the
# scaled training set and evaluated on the held-out rows.
clf = SVC().fit(X_train, y_train)
prd_clf = clf.predict(X_test)

In [12]:
# Per-class precision/recall/F1 followed by the confusion matrix for the
# SVC predictions.
print(
    classification_report(y_test, prd_clf),
    confusion_matrix(y_test, prd_clf),
    sep='\n',
)

              precision    recall  f1-score   support

           0       0.83      0.97      0.89       753
           1       0.75      0.34      0.47       227

    accuracy                           0.82       980
   macro avg       0.79      0.65      0.68       980
weighted avg       0.81      0.82      0.79       980

[[728  25]
 [150  77]]


In [13]:
# Multi-layer perceptron with three hidden layers of 11 units each.
# Weight initialisation and batch shuffling are random, so the seed is
# pinned (same value as the train/test split) to keep the reported scores
# reproducible across kernel restarts.
mlpc = MLPClassifier(
    hidden_layer_sizes=(11, 11, 11),
    max_iter=10000,
    random_state=42,
)
mlpc.fit(X_train, y_train)
prd_mlpc = mlpc.predict(X_test)

In [14]:
# Per-class precision/recall/F1 followed by the confusion matrix for the
# MLP predictions.
print(
    classification_report(y_test, prd_mlpc),
    confusion_matrix(y_test, prd_mlpc),
    sep='\n',
)

              precision    recall  f1-score   support

           0       0.85      0.92      0.88       753
           1       0.62      0.46      0.53       227

    accuracy                           0.81       980
   macro avg       0.74      0.69      0.70       980
weighted avg       0.80      0.81      0.80       980

[[690  63]
 [123 104]]
