In [55]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings



In [45]:
df = pd.read_csv("winequality-white.csv")
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [46]:
x = df.drop('quality' , 1)

In [47]:
# y = df['quality']

y = df.pop('quality')


In [48]:
for i in df.columns:
    df[i] = df[i].fillna(np.mean(df[i]))
    

In [49]:
df.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
dtype: int64

In [50]:
df[:3]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1


In [51]:
y.nunique()

7

In [52]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

In [57]:
def fit_predict(x_train,x_test,y_train,y_test,scaler,n_neighbors,metric='manhattan',weights='uniform'):
    train_scaled = scaler.fit_transform(x_train)
    test_scaled = scaler.transform(x_test)
    knn = KNeighborsClassifier(n_neighbors=n_neighbors,metric=metric,weights=weights,n_jobs=4)
    knn.fit(train_scaled,y_train)
    y_pred = knn.predict(test_scaled)
    print(accuracy_score(y_test,y_pred))

In [58]:
for k in range(1,11):
    print("Accuracy score on KNN using n_neighbors = {0}: ".format(2**k),end=' ')
    fit_predict(x_train,x_test,y_train,y_test,StandardScaler(),2**k)

Accuracy score on KNN using n_neighbors = 2:  0.5775510204081633
Accuracy score on KNN using n_neighbors = 4:  0.5683673469387756
Accuracy score on KNN using n_neighbors = 8:  0.563265306122449
Accuracy score on KNN using n_neighbors = 16:  0.5530612244897959
Accuracy score on KNN using n_neighbors = 32:  0.5510204081632653
Accuracy score on KNN using n_neighbors = 64:  0.5469387755102041
Accuracy score on KNN using n_neighbors = 128:  0.5377551020408163
Accuracy score on KNN using n_neighbors = 256:  0.5193877551020408
Accuracy score on KNN using n_neighbors = 512:  0.5193877551020408
Accuracy score on KNN using n_neighbors = 1024:  0.47346938775510206


In [60]:
k =2 
for metric in ['euclidean' , 'cosine' , 'manhattan' , 'chebyshev']:
    print('Accuracy score on KNN using {} metric and {} neighbors: '.format(metric,k),end=' ')
    fit_predict(x_train,x_test,y_train,y_test,StandardScaler(),2,metric)

Accuracy score on KNN using euclidean metric and 2 neighbors:  0.5755102040816327
Accuracy score on KNN using cosine metric and 2 neighbors:  0.5683673469387756
Accuracy score on KNN using manhattan metric and 2 neighbors:  0.5775510204081633
Accuracy score on KNN using chebyshev metric and 2 neighbors:  0.573469387755102


In [64]:
for weights in ['uniform','distance']:
    print('Accuracy score on KNN using weights = {0} '.format(weights),end=' ')
    fit_predict(x_train,x_test,y_train,y_test,StandardScaler(),2,"chebyshev")

Accuracy score on KNN using weights = uniform  0.573469387755102
Accuracy score on KNN using weights = distance  0.573469387755102


Feature Engineering

In [66]:
def feat_eng(df):
    df['eng1'] = df['fixed acidity'] * df['pH']
    df['eng2'] = df['total sulfur dioxide'] / df['free sulfur dioxide']
    df['eng3'] = df['sulphates'] / df['chlorides']
    df['eng4'] = df['chlorides'] * df['sulphates']
    return df

x_train = feat_eng(x_train)
x_test = feat_eng(x_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [69]:
print("Accuracy score after engg: ",end=' ')
fit_predict(x_train,x_test,y_train,y_test,StandardScaler(),2,'chebyshev',weights='distance')

Accuracy score after engg:  0.6469387755102041


In [71]:
original_score = 0.57346938775
best_score = 0.646938775510
improvement = np.abs(np.round(100*(original_score-best_score)/original_score,2))
print('overall improvement is {} % '.format(improvement))

overall improvement is 12.81 % 
