In [18]:
import pandas as pd
import numpy as np
import math
import random
import scipy
import statistics
import statsmodels.formula.api as smf
import sklearn.linear_model as lm

import keras

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#Global Variables 
train_test_split = .8 #fraction of data to use for training

!ls

'Final Project.ipynb'	  HRDataset_v13.csv   winequality-red.csv
 Final_Project_v2.ipynb   ph_v1_days.csv


In [19]:
pdth = pd.read_csv("winequality-red.csv")
print(pdth.head())
print("Total Availible Keys: ", len(pdth.keys()))

repl_nan = lambda x: list(map(lambda xx: 0 if math.isnan(xx) else xx, x))

qualtities = list(map(lambda x: int(x), repl_nan(pdth['quality'])))

normalizers = {}
for k in pdth.keys():
    if k in ["quality"]:
        continue
    normalizers[k]=statistics.mean(repl_nan(list(map(lambda x: float(x), pdth[k]))))

xs = []
for k in pdth.keys():
    if k in ["quality"]:
        continue
    xs.append(repl_nan(list(map(lambda x: float(x)/normalizers[k], pdth[k]))))
    
# Put the data into numpy arrays
xs=np.transpose(np.array(xs))
ys=np.array(qualtities)

#Xshape = (num_samples, features), Yshape = (num_samples,)
print("\nPre Split Shapes:")
print("X shape = {}, Y shape = {}".format(xs.shape, ys.shape))

#Create a training and testing split

#Zip the xs and ys together
xsys = list(zip(xs, ys))

#shuffle the combined list
random.shuffle(xsys)

#Unzip the shuffled list
xs, ys = zip(*xsys)

splt_ndx = int(len(xs)*train_test_split)

xs_tr = np.array(xs[0:splt_ndx]); xs_ts = np.array(xs[splt_ndx:])
ys_tr = np.array(ys[0:splt_ndx]); ys_ts = np.array(ys[splt_ndx:])

#Print the final shapes
print("\nFinal Shapes:")
print("Shapes: X train = {}, Y train = {}".format(xs_tr.shape, ys_tr.shape))
print("        X test  = {},  Y test  = {}".format(xs_ts.shape, ys_ts.shape))

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2      9.8        5 

In [4]:
model_lr = lm.LogisticRegression(solver='liblinear')
model_lr.fit(xs_tr, ys_tr)

get_acc_lr = lambda x, y: len(np.where(model_lr.predict(x)==y)[0])/len(y)

print("Training Accuracy = ", get_acc_lr(xs_tr, ys_tr))
print("Testing  Accuracy = ", get_acc_lr(xs_ts, ys_ts))

print(model_lr.predict(xs_tr)[0:100])
print(ys_tr[0:100])

Training Accuracy =  0.5949960906958561
Testing  Accuracy =  0.559375
[5 6 5 6 6 6 6 6 5 5 5 6 5 5 6 5 5 6 6 5 6 7 5 5 5 6 6 5 6 5 6 5 6 6 6 6 5
 6 5 5 6 5 5 6 6 5 5 6 5 6 6 5 6 5 6 5 5 6 5 5 6 6 6 5 5 5 6 6 5 6 6 6 6 5
 7 6 5 5 5 6 5 6 5 5 5 6 5 6 6 6 6 5 5 5 5 6 5 5 5 5]
[5 6 6 5 5 6 6 7 6 5 6 6 5 6 5 4 6 6 6 6 7 7 5 5 4 7 6 5 6 4 6 6 7 5 6 6 6
 6 7 5 6 6 5 8 7 5 3 7 4 6 6 5 5 6 6 5 5 6 5 5 6 7 6 4 6 6 7 7 5 6 6 6 6 5
 7 6 5 5 5 6 5 6 5 4 5 4 5 6 7 5 6 5 5 5 6 5 5 5 5 5]


In [33]:
from keras.models import Sequential
from keras.layers import Dense, Dropout


# Make one hot encodings of the ys 
ys_tr_oh = np.zeros((len(ys_tr), 10))
ys_ts_oh = np.zeros((len(ys_ts), 10))

for i, n in enumerate(ys_tr):
    ys_tr_oh[i, n-1]=1
    
for i, n in enumerate(ys_ts):
    ys_ts_oh[i, n-1]=1

hddn_units = 128
model_ml = Sequential([
    Dense(hddn_units, activation='relu'), 
    Dropout(.5),
    Dense(hddn_units, activation='relu'), 
    Dropout(.5),
    Dense(64, activation='relu'), 
    Dropout(.25),
    Dense(10, activation='softmax')
])
model_ml.compile(optimizer='adam', loss='categorical_crossentropy')

model_ml.fit(xs_tr, ys_tr_oh, epochs=500, verbose=0)


def get_acc_ml(x, y, allow_diff=0):
    preds = model_ml.predict(x)
    crr=0
    for xx, yy in zip(preds, y):
        if (np.argmax(xx)-np.argmax(yy))<=allow_diff:
            crr+=1
        
    
    
    return crr/len(y)

print("Training Accuracy = ", get_acc_ml(xs_tr, ys_tr_oh))
print("Testing  Accuracy = ", get_acc_ml(xs_ts, ys_ts_oh))

print("Training Accuracy (allowed 1 diffrence) = ", get_acc_ml(xs_tr, ys_tr_oh, allow_diff=1))
print("Testing  Accuracy (allowed 1 diffrence) = ", get_acc_ml(xs_ts, ys_ts_oh, allow_diff=1))



Training Accuracy =  0.890539483971853
Testing  Accuracy =  0.85
Training Accuracy (allowed 1 diffrence) =  0.9945269741985927
Testing  Accuracy (allowed 1 diffrence) =  0.975


In [39]:
print("Training Accuracy (allowed 1 diffrence) = ", get_acc_ml(xs_tr, ys_tr_oh, allow_diff=3))
print("Testing  Accuracy (allowed 1 diffrence) = ", get_acc_ml(xs_ts, ys_ts_oh, allow_diff=3))

Training Accuracy (allowed 1 diffrence) =  1.0
Testing  Accuracy (allowed 1 diffrence) =  1.0
