In [159]:
import numpy as np
import pandas as pd
from sklearn import cross_validation, neighbors, preprocessing

In [160]:
import requests

In [161]:
# download dataset

#url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
#res = requests.get(url)

#with open('wine.data', 'wt') as data:
#    data.write(res.content)

In [162]:
data = pd.read_csv('wine.data', header=None)
names = ['class', 'alcohol', 'malic_acid', 'ash', 'alcalinity', 'magnesium',
        'total_phenols', 'flavanoids', 'nflavanoid_phenols', 'proanthocyanins',
        'color_intensity', 'hue', 'OD280', 'proline']
data.columns = names

In [163]:
data.head()

Unnamed: 0,class,alcohol,malic_acid,ash,alcalinity,magnesium,total_phenols,flavanoids,nflavanoid_phenols,proanthocyanins,color_intensity,hue,OD280,proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [164]:
train_data = data.drop('class', 1)
target = data['class']

In [165]:
cv = cross_validation.KFold(len(data), n_folds=5, shuffle=True, random_state=42)
scores = pd.DataFrame(columns = ['value'])

In [166]:
for k in range(1, 50):
    clf = neighbors.KNeighborsClassifier(n_neighbors=k)
    for fold, (train, test) in enumerate(cv):
        clf.fit(train_data.loc[train, :], target[train])
        k_scores = cross_validation.cross_val_score(clf, train_data, target)
        scores.loc[k] = np.mean(k_scores)

In [167]:
scores['value'].idxmax()

1

In [168]:
scores['value'].max()

0.71992337164750964

In [169]:
file = open('neighbors.txt', 'wt')
file.write(str(scores['value'].idxmax()))
file.close()

In [170]:
file = open('accuracy.txt', 'wt')
file.write(str(scores['value'].max()))
file.close()

In [171]:
# data scaling
train_processed = pd.DataFrame(preprocessing.scale(train_data), columns = names[1:14])

In [172]:
train_processed

Unnamed: 0,alcohol,malic_acid,ash,alcalinity,magnesium,total_phenols,flavanoids,nflavanoid_phenols,proanthocyanins,color_intensity,hue,OD280,proline
0,1.518613,-0.562250,0.232053,-1.169593,1.913905,0.808997,1.034819,-0.659563,1.224884,0.251717,0.362177,1.847920,1.013009
1,0.246290,-0.499413,-0.827996,-2.490847,0.018145,0.568648,0.733629,-0.820719,-0.544721,-0.293321,0.406051,1.113449,0.965242
2,0.196879,0.021231,1.109334,-0.268738,0.088358,0.808997,1.215533,-0.498407,2.135968,0.269020,0.318304,0.788587,1.395148
3,1.691550,-0.346811,0.487926,-0.809251,0.930918,2.491446,1.466525,-0.981875,1.032155,1.186068,-0.427544,1.184071,2.334574
4,0.295700,0.227694,1.840403,0.451946,1.281985,0.808997,0.663351,0.226796,0.401404,-0.319276,0.362177,0.449601,-0.037874
5,1.481555,-0.517367,0.305159,-1.289707,0.860705,1.562093,1.366128,-0.176095,0.664217,0.731870,0.406051,0.336606,2.239039
6,1.716255,-0.418624,0.305159,-1.469878,-0.262708,0.328298,0.492677,-0.498407,0.681738,0.083015,0.274431,1.367689,1.729520
7,1.308617,-0.167278,0.890014,-0.569023,1.492625,0.488531,0.482637,-0.417829,-0.597284,-0.003499,0.449924,1.367689,1.745442
8,2.259772,-0.625086,-0.718336,-1.650049,-0.192495,0.808997,0.954502,-0.578985,0.681738,0.061386,0.537671,0.336606,0.949319
9,1.061565,-0.885409,-0.352802,-1.049479,-0.122282,1.097417,1.125176,-1.143031,0.453967,0.935177,0.230557,1.325316,0.949319


In [181]:
cv_scaled = cross_validation.KFold(len(train_processed), n_folds=5, shuffle=True, random_state=42)
scores_scaled = pd.DataFrame(columns = ['value'])

In [190]:
for k in range(1, 50):
    clf = neighbors.KNeighborsClassifier(n_neighbors=k)
    for fold, (train, test) in enumerate(cv_scaled):
        clf.fit(train_processed.loc[train,:], target[train])
        k_scores_sc = cross_validation.cross_val_score(clf_scaled, train_data, target)
        scores_scaled.loc[k] = np.mean(k_scores_sc)

In [191]:
scores_scaled['value'].idxmax()

1

In [184]:
scores_scaled['value'].max()

0.71992337164750964

In [185]:
scores_scaled

Unnamed: 0,value
1,0.719923
2,0.646935
3,0.658812
4,0.635441
5,0.663985
6,0.657854
7,0.657854
8,0.657663
9,0.668966
10,0.657663
