In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's 'darkgrid' style to any figures drawn later in the notebook.
sns.set_style('darkgrid')

In [2]:
# Read the red-wine quality table from the working directory and preview
# its first ten rows.
df = pd.read_csv('winequality-red.csv')
df.head(n=10)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5


In [3]:
# Print each column's dtype and non-null count plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed_acidity         1599 non-null   float64
 1   volatile_acidity      1599 non-null   float64
 2   citric_acid           1599 non-null   float64
 3   residual_sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free_sulfur_dioxide   1599 non-null   float64
 6   total_sulfur_dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [4]:
# Count missing values per column (every count being 0 means nothing to impute).
df.isna().sum()

fixed_acidity           0
volatile_acidity        0
citric_acid             0
residual_sugar          0
chlorides               0
free_sulfur_dioxide     0
total_sulfur_dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
df.describe()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [6]:
# Binarize the target: wines scoring strictly above the 0.65 quantile of
# `quality` are labelled good (quality=1); wines at or below that score are
# labelled bad (quality=0).
QUALITY_QUANTILE = 0.65
border = np.quantile(df['quality'].values, QUALITY_QUANTILE)
# Vectorized comparison instead of a Python-level loop; the cast keeps the
# column as int64 0/1 labels.
df['quality'] = (df['quality'] > border).astype('int64')
df['quality'].head(20).to_frame()

Unnamed: 0,quality
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,1
8,1
9,0


# Train & Test Data

In [7]:
# Separate the binary target from the remaining feature columns, both as
# NumPy arrays.
y_data = df['quality'].to_numpy()
X_data = df.drop(columns=['quality']).to_numpy()

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
standard_to = StandardScaler()
standard_to.fit(X_data)
X_data =standard_to.transform(X_data.astype(float))

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.3, random_state=4)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (1119, 11) (1119,)
Test set: (480, 11) (480,)


# Random Forest

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Fit the forest n times and record the test accuracy of each fit to see how
# much the score varies between runs. Every run uses its own fixed seed, so
# the runs still differ from one another but the whole cell gives the same
# numbers after a kernel restart.
# NOTE(review): per the describe() output, the 75th percentile of `quality`
# equals the threshold, so at most a quarter of the labels are 1 and a
# constant "0" prediction already scores >= 75% accuracy. Consider also
# reporting precision/recall.
n = 20
mean_acc = np.zeros(n)

for i in range(n):
    clm_rf = RandomForestClassifier(random_state=i)
    clm_rf.fit(X_train, y_train)
    predictions = clm_rf.predict(X_test)
    acc = metrics.accuracy_score(y_test, predictions)
    mean_acc[i] = acc
    print(f'{i+1}/{n} done: {round(acc, 4)}')

# `clm_rf` is left bound to the model from the final run.
print(f"Mean Accuracy after {n} runs: {mean_acc.mean()}")

1/20 done: 0.9
2/20 done: 0.9021
3/20 done: 0.9062
4/20 done: 0.9042
5/20 done: 0.9
6/20 done: 0.9
7/20 done: 0.9083
8/20 done: 0.9083
9/20 done: 0.9042
10/20 done: 0.9062
11/20 done: 0.9
12/20 done: 0.9083
13/20 done: 0.9062
14/20 done: 0.9083
15/20 done: 0.9062
16/20 done: 0.9083
17/20 done: 0.8979
18/20 done: 0.9062
19/20 done: 0.9
20/20 done: 0.9
Mean Accuracy after 20 runs: 0.9040624999999999


In [17]:
import pickle

# Serialize the most recently trained forest (`clm_rf`) to disk. The `with`
# block guarantees the file is flushed and closed even if pickling fails.
# NOTE(review): only the classifier is saved. The fitted scaler
# (`standard_to`) is not, so anything loading this model must apply the same
# scaling to its inputs.
with open('random_forest_classifier_model.pkl', 'wb') as file:
    pickle.dump(clm_rf, file)

## Finally, we got an accuracy of about 90%.
#### Next, I'll build an end-to-end machine learning model for wine quality prediction.