In [1]:
# This is practice for becoming familiar with the code
# I used the information found on 
# https://medium.com/themlblog/wine-quality-prediction-using-machine-learning-59c88a826789
# to run through the red wine data set

In [2]:
# Import the libraries
# pandas will be used to work with file formats like csv, xls, etc.
import pandas as pd
# numpy is used for making the mathematical calculations more accurate
import numpy as np
# sklearn (scikit-learn) will be used to import our classifier for prediction
# is used to split our dataset into training and testing data
from sklearn.model_selection import train_test_split
# is used to preprocess the data before fitting into predictor
from sklearn import preprocessing
# is used to import our decision tree classifier
from sklearn import tree

In [3]:
# Read in the csv file
wine_data=pd.read_csv("winequality-red.csv")
wine_data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
# separate the features and labels into two different dataframes
y = wine_data.quality
X = wine_data.drop('quality', axis=1)


In [5]:
# split the dataset into test and train data
# we made the test data 20% of the original data.
# the remaining 80% is used for training
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3)

In [6]:
# Print the first five elements of data we have split
print(X_train.head())

      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
471             9.6             0.540         0.42             2.4      0.081   
548            12.4             0.350         0.49             2.6      0.079   
1360            9.2             0.540         0.31             2.3      0.112   
285             9.9             0.590         0.07             3.4      0.102   
1260            8.6             0.635         0.68             1.8      0.403   

      free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
471                  25.0                  52.0  0.99700  3.20       0.71   
548                  27.0                  69.0  0.99940  3.12       0.75   
1360                 11.0                  38.0  0.99699  3.24       0.56   
285                  32.0                  71.0  1.00015  3.31       0.71   
1260                 19.0                  56.0  0.99632  3.02       1.15   

      alcohol  
471      11.4  
548      10.4  
13

In [7]:
# After obtaining the data we will be using, the next step
# is data normalization. It is part of pre-processing in
# which data is converted to fit in a range of -1 and 1.
X_train_scaled = preprocessing.scale(X_train)
print(X_train_scaled)

[[ 0.72628084  0.06320909  0.77470304 ... -0.72011474  0.2830714
   0.90763711]
 [ 2.32711314 -0.98752472  1.13852875 ... -1.24507135  0.51779307
  -0.04485837]
 [ 0.49759051  0.06320909  0.20297693 ... -0.45763644 -0.59713487
   0.43138937]
 ...
 [ 0.55476309 -0.15799802  0.46285244 ... -0.26077771  0.6351539
   0.33613983]
 [-0.30282564  1.27984825 -1.20035078 ... -0.32639728 -0.06901111
  -0.61635565]
 [ 1.75538731 -1.48524073  0.87865324 ... -1.24507135  0.92855599
  -1.09260339]]


In [8]:
# Now we train our algorithm so that it can predict the wine quality
clf=tree.DecisionTreeClassifier()
clf.fit(X_train, y_train)

DecisionTreeClassifier()

In [9]:
# Check to see how efficiently the algorithm is predicting the wine quality.
confidence = clf.score(X_test, y_test)
print("\nThe confidence score:\n")
print(confidence * 100)


The confidence score:

59.375


In [10]:
# This score can change over time depending on the size of the dataset
# and shuffling of data when we divide the data into test and train,
# but you can always expect a range of +/-5 around the first result.

# Now that we have trained our classifier with features, we obtain
# the labels using predict() function.
y_pred = clf.predict(X_test)

In [11]:
# Our predicted information is stored in y_pred but it has far too many columns
# to compare it with the expected labels we stored in y_test . So we will just
# take first five entries of both, print them and compare them.
#converting the numpy array to list
x=np.array(y_pred).tolist()

#printing first 5 predictions
print("\nThe prediction:\n")
for i in range(0,5):
    print (x[i])
    
#printing first five expectations
print("\nThe expectation:\n")
print (y_test.head())


The prediction:

5
5
6
6
6

The expectation:

366     7
1346    5
1404    6
954     6
10      5
Name: quality, dtype: int64


In [12]:
# Don’t be intimidated, we did nothing magical above. We just converted y_pred from a numpy array to a list,
# so that we can compare with ease. Then we printed the first five elements of that list using for loop. And finally,
# we just printed the first five values that we were expecting, which were stored in y_test using head() function.
# The output looks something like this.

In [13]:
# Notice that almost all of the values in the prediction are similar to the expectations. The predictor was wrong
# twice, predicting 7 instead of 5 and 5 instead of 6. This gives us the accuracy of 60% for 5 examples. Of course, as the
# examples increases the accuracy goes down, precisely to 59.375%, but overall our predictor performs
# quite well, in-fact any accuracy % greater than 50% is considered as great.
# This is not a model that I will be using in my final project.