# Importing data

In [None]:
import pandas as pd
import numpy as np
import sqlite3

In [None]:
# Load both wine tables from the SQLite database, tagging each table with a
# ``color`` column (0 for red, 1 for white).
db = sqlite3.connect('winequal.db')
try:
    cur = db.cursor()

    # ALTER TABLE ... ADD COLUMN fails with "duplicate column name" when the
    # column is already present, so add it only when it is missing.  This keeps
    # the cell re-runnable against a database it has already modified.
    for table, flag in (('red', 0), ('white', 1)):
        cur.execute("PRAGMA table_info({})".format(table))
        # Each table_info row is (cid, name, type, notnull, dflt_value, pk).
        existing_columns = [info[1] for info in cur.fetchall()]
        if 'color' not in existing_columns:
            cur.execute(
                "ALTER TABLE {} ADD COLUMN color INT DEFAULT {}".format(table, flag)
            )
    db.commit()

    cur.execute('SELECT * from red')
    red_data = cur.fetchall()

    cur.execute('SELECT * from white')
    white_data = cur.fetchall()
finally:
    # Release the database file even if one of the statements above fails.
    db.close()

 

In [None]:
# The first fetched row is used as the header: its values become the column
# labels and the remaining rows are the measurements.
# NOTE(review): the rename below implies the header row's own ``color`` value
# (0) ends up as a column label -- confirm against the database contents.
df = pd.DataFrame(red_data[1:], columns=red_data[0])
red = df.rename(columns={0: 'white?'})

# Convert every column to a numeric dtype in one pass.
red2 = red.apply(pd.to_numeric)

red2.head()

In [None]:
# The first fetched row is used as the header: its values become the column
# labels and the remaining rows are the measurements.
# NOTE(review): the rename below implies the header row's own ``color`` value
# (1) ends up as a column label -- confirm against the database contents.
df2 = pd.DataFrame(white_data[1:], columns=white_data[0])
white = df2.rename(columns={1: 'white?'})

# Convert every column to a numeric dtype in one pass.
white2 = white.apply(pd.to_numeric)


In [None]:
# Stack the white rows first, then the red rows, and renumber the combined
# frame with a fresh 0..n-1 index.
comp_data = pd.concat([white2, red2], ignore_index=True)

# Exploring data

In [None]:
# Plotting imports for the exploratory charts that follow.
# NOTE(review): bokeh.charts is only available in older Bokeh releases (it was
# later split out as the separate bkcharts package) -- confirm the pinned
# Bokeh version still provides it.
from bokeh.charts import Histogram
from bokeh.charts import Bar
from bokeh.io import output_notebook, show
from bokeh.layouts import gridplot
from bokeh.plotting import figure
# NOTE(review): BoxPlot is not used anywhere in the visible part of this file.
from bokeh.charts import BoxPlot

# Presumably makes later show() calls render inline in the notebook -- confirm.
output_notebook()

In [None]:

# Overview of the quality scores, split by the 'white?' colour flag.
p1 = Histogram(comp_data, values='quality', color='white?', tools='pan, reset, box_zoom')
p1.xaxis.minor_tick_line_color = None

# Row counts per 'white?' value.
p2 = Bar(comp_data, values='quality', label='white?', agg='count', tools='pan, reset, box_zoom', legend=None)

# Bar chart on the left, histogram on the right.
p = gridplot([[p2, p1]], plot_width=400, plot_height=400)
show(p)
# No trailing hold(p): `hold` is never imported or defined in this notebook,
# so calling it raised NameError after the figure had already been shown.

# In depth with each parameter

In [None]:
import itertools

In [None]:
TOOLS = 'hover, reset, crosshair'

# Every measured feature: all columns except the target and the colour flag.
key_subset = comp_data.keys().drop(['quality', 'white?'])

# Mean of each feature for every (colour, quality) pair.
# groupby returns its keys in sorted order, so each per-colour frame is already
# ordered by quality and every mean stays attached to the quality score it was
# computed from.  (Calling np.sort on a stacked [x, y] array sorts the two rows
# independently and would break that pairing.)
group_means = comp_data.groupby(['white?', 'quality'])[list(key_subset)].mean()
white_means = group_means.xs(1, level='white?')
red_means = group_means.xs(0, level='white?')

# One line chart per feature: mean feature value against quality, one line
# per wine colour.
plot_list = []
for key_sub in key_subset:
    p = figure(plot_width=300, plot_height=300, tools=TOOLS)

    p.line(white_means.index.values, white_means[key_sub].values,
           color='green', legend='white')
    p.line(red_means.index.values, red_means[key_sub].values,
           color='red', legend='red')
    p.axis[0].axis_label = 'quality'
    p.axis[1].axis_label = key_sub
    p.title.text = 'mean %s vs. quality' % (key_sub)
    p.legend.location = 'top_left'

    plot_list.append(p)

# First six feature plots, three per row.
p3 = gridplot([plot_list[0:3], plot_list[3:6]])
show(p3)

In [None]:
# Remaining feature plots (index 6 onward), again laid out three per row.
second_grid_rows = [plot_list[6:9], plot_list[9:12]]
p4 = gridplot(second_grid_rows)
show(p4)

# Examining correlation

In [None]:
# Per-colour feature matrices (columns listed in key_subset) and the pairwise
# correlation matrix of each.
is_red = comp_data['white?'] == 0
subset_red = comp_data.loc[is_red, key_subset]
corr_red = subset_red.corr()

is_white = comp_data['white?'] == 1
subset_white = comp_data.loc[is_white, key_subset]
corr_white = subset_white.corr()


In [None]:
# Keep only red-wine correlations with 0.5 <= |r| < 1; every other cell is
# shown as NaN.
abs_corr_red = corr_red.abs()
corr_red.where((abs_corr_red >= 0.5) & (abs_corr_red < 1))

In [None]:
# Keep only white-wine correlations with 0.5 <= |r| < 1; every other cell is
# shown as NaN.
abs_corr_white = corr_white.abs()
corr_white.where((abs_corr_white >= 0.5) & (abs_corr_white < 1))

# Building Multiclass Classification Model for Quality

In [None]:
from sklearn import cross_validation
from sklearn import linear_model
from sklearn import metrics

In [None]:
# Split the red-wine features and quality labels, with 75% of the rows going
# to the training set.
xtrain, xtest, ytrain, ytest = cross_validation.train_test_split(
    subset_red,
    red2['quality'],
    train_size=0.75,
)

In [None]:
# Multinomial logistic regression on the red-wine training data, with
# cross-validation scored by accuracy.
rgr = linear_model.LogisticRegressionCV(
    solver='lbfgs',
    multi_class='multinomial',
    scoring='accuracy',
)
rgr.fit(xtrain, ytrain)

In [None]:
# Predicted quality classes for the held-out red wines.
pred = rgr.predict(xtest)

In [None]:
# Fraction of held-out red wines whose quality class was predicted exactly.
# Arguments are named so the true labels and predictions cannot be swapped,
# matching the (ytest, pred) order used by the other accuracy cells.
metrics.accuracy_score(y_true=ytest, y_pred=pred)

In [None]:
from sklearn import svm

In [None]:
# RBF-kernel support vector classifier fitted on the same red-wine training
# split, for comparison with the logistic model.
rgr2 = svm.SVC(kernel='rbf')
rgr2.fit(X=xtrain, y=ytrain)

In [None]:
# Score the SVC on the held-out red wines (rebinds `pred`).
pred = rgr2.predict(xtest)
metrics.accuracy_score(y_true=ytest, y_pred=pred)

In [None]:
# Reduced feature set: the same train/test rows with three columns removed.
# NOTE(review): presumably these were chosen from the correlation tables
# above -- confirm the selection rationale.
dropped_features = ['residual sugar', 'fixed acidity', 'density']
key_sub2 = xtrain.columns.drop(dropped_features)

subset_xtrain, subset_xtest = xtrain[key_sub2], xtest[key_sub2]

In [None]:
# Cross-validated logistic regression (library-default solver) trained on the
# reduced red-wine feature set.
rgr3 = linear_model.LogisticRegressionCV(
    scoring='accuracy',
)
rgr3.fit(subset_xtrain, ytrain)

In [None]:
# Predictions of the reduced-feature model on the held-out red wines.
pred3 = rgr3.predict(
    subset_xtest
)

In [None]:
# Accuracy of the reduced-feature model on the held-out red wines.
metrics.accuracy_score(y_true=ytest, y_pred=pred3)

## White wine modeling

In [None]:
# Same 75% training split, now for the white wines.  This rebinds
# xtrain/xtest/ytrain/ytest, replacing the red-wine split.
xtrain, xtest, ytrain, ytest = cross_validation.train_test_split(
    subset_white,
    white2['quality'],
    train_size=0.75,
)

In [None]:
# Multinomial logistic regression with cross-validation, fitted on the
# white-wine training data.
rgr4 = linear_model.LogisticRegressionCV(
    solver='lbfgs',
    multi_class='multinomial',
)
rgr4.fit(xtrain, ytrain)

In [None]:
# Predicted quality classes for the held-out white wines.
pred = rgr4.predict(xtest)

In [None]:
# Accuracy of the white-wine model on its held-out rows.
metrics.accuracy_score(y_true=ytest, y_pred=pred)