# KNN

* 加载数据集

In [1]:
%matplotlib inline
import os
import numpy as np
from scipy import stats
import pandas as pd
import sklearn.model_selection as cross_validation  # Use sklearn.cross_validation in old version
import matplotlib.pyplot as plt

# Optional: uncomment and adjust if the data file is not in the current
# working directory.
# os.chdir('Q:/data')
# None lifts pandas' column limit so wide DataFrames are displayed in full.
pd.set_option('display.max_columns', None)

In [4]:
# Read the raw table and mark income_rank as categorical in a single chain,
# so that describe() reports unique/top/freq for it instead of numeric stats.
orgData = (
    pd.read_csv('date_data2.csv')
    .astype({'income_rank': 'category'})
)
orgData.describe(include='all')

Unnamed: 0,income,attractive,assets,edueduclass,Dated,income_rank,attractive_rank,assets_rank
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
unique,,,,,,4.0,,
top,,,,,,3.0,,
freq,,,,,,28.0,,
mean,9010.0,50.5,96.0063,3.71,0.5,,1.56,1.51
std,5832.675288,28.810948,91.082226,1.225116,0.502519,,1.103896,1.123621
min,3000.0,1.0,3.7284,1.0,0.0,,0.0,0.0
25%,5000.0,28.0,31.665269,3.0,0.0,,1.0,0.75
50%,7500.0,51.0,70.746924,4.0,0.5,,2.0,2.0
75%,11500.0,68.875,131.481061,4.0,1.0,,2.25,2.25


In [5]:
# Inspect the dtype of every column (income_rank should now be 'category').
orgData.dtypes

income                int64
attractive          float64
assets              float64
edueduclass           int64
Dated                 int64
income_rank        category
attractive_rank       int64
assets_rank           int64
dtype: object

* 选取自变量

In [8]:
# Predictors are selected by name rather than by position, so that a change in
# the CSV's column order (or an extra leading column) cannot silently swap in
# a different set of features.
feature_columns = ['income', 'attractive', 'assets', 'edueduclass']
X = orgData[feature_columns]
# Double brackets keep the target as a one-column DataFrame.
Y = orgData[['Dated']]
X.head()

Unnamed: 0,income,attractive,assets,edueduclass
0,3000,9.0,5.145476,1
1,3000,14.5,40.643781,4
2,3000,6.0,5.145476,1
3,3000,1.0,7.067434,1
4,3500,14.5,3.7284,2


* 极值标准化

In [4]:
from sklearn import preprocessing

# Min-max scaling: learn each feature's minimum and maximum from X, then
# rescale every column with them (default target range is [0, 1]).
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(X)
X_scaled = min_max_scaler.transform(X)
# Preview rows 1 through 4 of the scaled array.
X_scaled[1:5]

array([[ 0.        ,  0.13705584,  0.07649535,  0.6       ],
       [ 0.        ,  0.05076142,  0.00293644,  0.        ],
       [ 0.        ,  0.        ,  0.00691908,  0.        ],
       [ 0.01612903,  0.13705584,  0.        ,  0.2       ]])

* 划分训练集和测试集

In [5]:
# Split into training and test sets: 80 % / 20 %, with a fixed seed so the
# same rows land in each split on every run.
train_data, test_data, train_target, test_target = cross_validation.train_test_split(
    X_scaled,
    Y,
    test_size=0.2,
    train_size=0.8,
    random_state=123,
)

思考：上述过程有没有问题？（提示：标准化所用的最小值和最大值是在划分训练集和测试集之前、用全部数据计算的——测试集的信息是否因此泄露到了训练过程中？）

* 建模

In [6]:
from sklearn.neighbors import KNeighborsClassifier

# train_target is a one-column DataFrame; flatten it to the 1-D label vector
# that fit() expects.
train_labels = train_target.values.flatten()

# 3-nearest-neighbour classifier (Euclidean distance by default).
model = KNeighborsClassifier(n_neighbors=3)
model.fit(train_data, train_labels)

# Predicted class for every row of the test split.
test_est = model.predict(test_data)

* 验证

In [7]:
import sklearn.metrics as metrics

# Confusion matrix with the class order fixed to [0, 1]
# (scikit-learn convention: rows are true classes, columns are predictions).
confusion = metrics.confusion_matrix(test_target, test_est, labels=[0, 1])
print(confusion)

# Per-class precision, recall, F1 and support.
report = metrics.classification_report(test_target, test_est)
print(report)

[[ 8  1]
 [ 1 10]]
             precision    recall  f1-score   support

          0       0.89      0.89      0.89         9
          1       0.91      0.91      0.91        11

avg / total       0.90      0.90      0.90        20



In [8]:
# Accuracy of the k=3 model on the held-out test split.
model.score(test_data, test_target)

0.90000000000000002

* 选择k值

In [9]:
# Fit one classifier per candidate k (1..14) and print its accuracy on the
# test split. The label vector does not depend on k, so build it once.
train_labels = train_target.values.flatten()
for k in range(1, 15):
    k_model = KNeighborsClassifier(n_neighbors=k).fit(train_data, train_labels)
    print(k, '\t', k_model.score(test_data, test_target))

1 	 0.9
2 	 0.85
3 	 0.9
4 	 0.9
5 	 0.9
6 	 0.85
7 	 0.9
8 	 0.85
9 	 0.9
10 	 0.85
11 	 0.9
12 	 0.95
13 	 1.0
14 	 1.0


* 交叉验证选择k值

In [10]:
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import KFold

# Candidate neighbour counts k = 1..14.
# GridSearchCV expects a plain dict (or a list of dicts) that maps parameter
# names to sequences of candidate values. Wrapping the grid in ParameterGrid,
# or nesting the range inside another list, is rejected by current
# scikit-learn releases.
param_grid = {'n_neighbors': list(range(1, 15))}

estimator = KNeighborsClassifier()

# cv=3 runs 3-fold cross-validation on the training split only; the test split
# is never seen during the search. For a classifier an integer cv means
# stratified folds.
gridSearchCV = GridSearchCV(estimator, param_grid, cv=3)
gridSearchCV.fit(train_data, train_target.values.flatten())

# Per-candidate timings and scores.
# (scikit-learn < 0.18 exposed these as gridSearchCV.grid_scores_.)
gridSearchCV.cv_results_

{'mean_fit_time': array([ 0.00033339,  0.00033339,  0.00066678,  0.00066678,  0.00066678,
         0.0006667 ,  0.00033339,  0.00066662,  0.00066662,  0.        ,
         0.        ,  0.00033331,  0.00033339,  0.00066678]),
 'mean_score_time': array([ 0.00100001,  0.00100009,  0.00066662,  0.        ,  0.        ,
         0.00033339,  0.00066678,  0.        ,  0.00033339,  0.00100001,
         0.00100001,  0.        ,  0.0006667 ,  0.00033331]),
 'mean_test_score': array([ 0.8375,  0.825 ,  0.8625,  0.825 ,  0.875 ,  0.825 ,  0.8375,
         0.8125,  0.8   ,  0.8   ,  0.7875,  0.7875,  0.8125,  0.7875]),
 'mean_train_score': array([ 1.        ,  0.94397857,  0.93745632,  0.91276497,  0.91905427,
         0.92522711,  0.91893781,  0.88131843,  0.87491265,  0.88131843,
         0.89378057,  0.86873981,  0.86873981,  0.86256697]),
 'param_n_neighbors': masked_array(data = [1 2 3 4 5 6 7 8 9 10 11 12 13 14],
              mask = [False False False False False False False False False Fal

In [11]:
# Parameter setting with the highest mean cross-validated score.
gridSearchCV.best_params_

{'n_neighbors': 5}

In [12]:
# Estimator configured with the best parameters found by the search
# (refitted on the full training split under GridSearchCV's default refit=True).
best = gridSearchCV.best_estimator_ 
# Accuracy of that estimator on the held-out test split.
best.score(test_data, test_target)

0.90000000000000002

练习：试一试哪些参数会影响结果