# K-近邻算法（KNN）

## 作业

#### 1、预测年收入是否大于50K美元

读取adults.txt文件，最后一列是年收入，并使用KNN算法训练模型，然后使用模型预测一个人的年收入是否大于50K美元

获取年龄、教育程度、职位、每周工作时间作为机器学习数据  
获取薪水作为对应结果

In [1]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

In [3]:
# Load the adults dataset from CSV; its `salary` column is used as the label
# further down.
data = pd.read_csv('data/adults.txt')

In [4]:
data.head(5)

Unnamed: 0,age,workclass,final_weight,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [7]:
data[data['salary'] == '>50K']

Unnamed: 0,age,workclass,final_weight,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K
10,37,Private,280464,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,80,United-States,>50K
11,30,State-gov,141297,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K
14,40,Private,121772,Assoc-voc,11,Married-civ-spouse,Craft-repair,Husband,Asian-Pac-Islander,Male,0,0,40,?,>50K
19,43,Self-emp-not-inc,292175,Masters,14,Divorced,Exec-managerial,Unmarried,White,Female,0,0,45,United-States,>50K
20,40,Private,193524,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,60,United-States,>50K
25,56,Local-gov,216851,Bachelors,13,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,40,United-States,>50K
27,54,?,180211,Some-college,10,Married-civ-spouse,?,Husband,Asian-Pac-Islander,Male,0,0,60,South,>50K


In [9]:
# Feature matrix: age, education, occupation and weekly working hours.
# A copy is taken so that the label encoding applied afterwards does not
# write back into `data`.
train = data.loc[:, ['age', 'education', 'occupation', 'hours_per_week']].copy()
# Label vector: the salary bracket of each row.
target = data.loc[:, 'salary']

数据转换，将String类型数据转换为int

In [10]:
train['education'].unique()

array(['Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
       'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school',
       '5th-6th', '10th', '1st-4th', 'Preschool', '12th'], dtype=object)

In [12]:
# Distinct education labels in order of first appearance; the position of a
# label in this array is the integer code assigned to it.
edu_unique = train['education'].unique()
# Label -> code table built once, so encoding a row is a dictionary lookup
# instead of a fresh scan of edu_unique for every row.
_edu_codes = {label: code for code, label in enumerate(edu_unique)}


def trans_edu(x):
    """Return the integer code of the education label ``x``.

    The code is the index of ``x`` inside ``edu_unique``.

    Raises:
        ValueError: if ``x`` is not one of the labels in ``edu_unique``.
    """
    try:
        return _edu_codes[x]
    except KeyError:
        raise ValueError(f'unknown education label: {x!r}') from None

In [16]:
# Look up the index position of a given value inside an ndarray
np.argwhere(edu_unique == '12th')

array([[15]])

In [17]:
# Replace every education label with its integer code, element by element.
train['education'] = train['education'].map(trans_edu)

In [18]:
# Encode the occupation strings as integers.
# Distinct occupation labels in order of first appearance; the position of a
# label in this array is the integer code assigned to it.
occ_unique = train['occupation'].unique()
# Label -> code table built once, so encoding a row is a dictionary lookup
# instead of a fresh scan of occ_unique for every row.
_occ_codes = {label: code for code, label in enumerate(occ_unique)}


def trans_occ(x):
    """Return the integer code of the occupation label ``x``.

    The code is the index of ``x`` inside ``occ_unique``.

    Raises:
        ValueError: if ``x`` is not one of the labels in ``occ_unique``.
    """
    try:
        return _occ_codes[x]
    except KeyError:
        raise ValueError(f'unknown occupation label: {x!r}') from None

In [19]:
# Replace every occupation label with its integer code, element by element.
train['occupation'] = train['occupation'].map(trans_occ)

【知识点】map方法，进行数据转换

In [20]:
train.shape  # 样本数据

(32561, 4)

切片：训练数据和预测数据

In [21]:
# Random split of the samples into a training set and a test set
from sklearn.model_selection import train_test_split

# train        : the sample set with the four features
# target       : the labels that correspond to the samples
# test_size    : fraction of the data held out as the test set
# random_state : seed for the random shuffle
# Returns four datasets: the training and test parts of X first,
# then the training and test parts of the target y.
X_train, X_test, y_train, y_test = train_test_split(
    train, target, test_size=0.2, random_state=1
)

生成算法

In [50]:
X_train.shape

(26048, 4)

In [51]:
X_test.shape

(6513, 4)

第一步：训练数据

In [27]:
from sklearn.neighbors import KNeighborsClassifier

# Classifier that looks at the 25 nearest training samples for each query.
knnclf = KNeighborsClassifier(n_neighbors=25)

# Fit on the training split; as the last expression of the cell, the fitted
# estimator is also displayed.
knnclf.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=25, p=2,
           weights='uniform')

第二步：预测数据

In [34]:
edu_list = edu_unique.copy()
occ_list = occ_unique.copy()

# Fixed profile used for every probe; only the occupation changes.
age, education, hours = 35, 'Doctorate', 40
# Derive the education code from its label instead of hard-coding it, so the
# printed label and the value fed to the model cannot drift apart.
edu_index = edu_list.tolist().index(education)

for occ_index, occ_title in enumerate(occ_list):
    # Build the query with the training column labels so it lines up with
    # the DataFrame the model was fitted on.
    sample = pd.DataFrame([[age, edu_index, occ_index, hours]],
                          columns=X_train.columns)
    print(f'age={age}, education={education},occ={occ_title},hpw={hours}',
          knnclf.predict(sample))

age=35, education=Doctorate,occ=Adm-clerical,hpw=40 ['<=50K']
age=35, education=Doctorate,occ=Exec-managerial,hpw=40 ['<=50K']
age=35, education=Doctorate,occ=Handlers-cleaners,hpw=40 ['<=50K']
age=35, education=Doctorate,occ=Prof-specialty,hpw=40 ['>50K']
age=35, education=Doctorate,occ=Other-service,hpw=40 ['>50K']
age=35, education=Doctorate,occ=Sales,hpw=40 ['<=50K']
age=35, education=Doctorate,occ=Craft-repair,hpw=40 ['<=50K']
age=35, education=Doctorate,occ=Transport-moving,hpw=40 ['<=50K']
age=35, education=Doctorate,occ=Farming-fishing,hpw=40 ['<=50K']
age=35, education=Doctorate,occ=Machine-op-inspct,hpw=40 ['<=50K']
age=35, education=Doctorate,occ=Tech-support,hpw=40 ['<=50K']
age=35, education=Doctorate,occ=?,hpw=40 ['<=50K']
age=35, education=Doctorate,occ=Protective-serv,hpw=40 ['<=50K']
age=35, education=Doctorate,occ=Armed-Forces,hpw=40 ['<=50K']
age=35, education=Doctorate,occ=Priv-house-serv,hpw=40 ['<=50K']


In [69]:
data['occupation'].unique()

array(['Adm-clerical', 'Exec-managerial', 'Handlers-cleaners',
       'Prof-specialty', 'Other-service', 'Sales', 'Craft-repair',
       'Transport-moving', 'Farming-fishing', 'Machine-op-inspct',
       'Tech-support', '?', 'Protective-serv', 'Armed-Forces',
       'Priv-house-serv'], dtype=object)

#### 2、小麦种类预测

读取seeds.tsv文件，最后一列是小麦品种，其他列是小麦特征

In [36]:
# Tab-separated file without a header row; header=None makes pandas label
# the columns with integers.
seeds = pd.read_csv('data/seeds.tsv',sep='\t',header=None)
# Display the loaded table.
seeds

Unnamed: 0,0,1,2,3,4,5,6,7
0,15.26,14.84,0.8710,5.763,3.312,2.2210,5.220,Kama
1,14.88,14.57,0.8811,5.554,3.333,1.0180,4.956,Kama
2,14.29,14.09,0.9050,5.291,3.337,2.6990,4.825,Kama
3,13.84,13.94,0.8955,5.324,3.379,2.2590,4.805,Kama
4,16.14,14.99,0.9034,5.658,3.562,1.3550,5.175,Kama
5,14.38,14.21,0.8951,5.386,3.312,2.4620,4.956,Kama
6,14.69,14.49,0.8799,5.563,3.259,3.5860,5.219,Kama
7,14.11,14.10,0.8911,5.420,3.302,2.7000,5.000,Kama
8,16.63,15.46,0.8747,6.053,3.465,2.0400,5.877,Kama
9,16.44,15.25,0.8880,5.884,3.505,1.9690,5.533,Kama


In [38]:
seeds.iloc[:,-1].unique()

array(['Kama', 'Rosa', 'Canadian'], dtype=object)

In [73]:
# Every column except the last is a feature; the last column is the label.
train, target = seeds.iloc[:, :-1], seeds.iloc[:, -1]

In [76]:
# Hold out 20% of the seeds samples for testing; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    train, target, test_size=0.2, random_state=0
)

In [89]:
# Classifier for the seeds data using the 7 nearest neighbours.
knnclf = KNeighborsClassifier(n_neighbors=7)
# Fit on the training split; the fitted estimator is the cell's displayed result.
knnclf.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='uniform')

In [90]:
knnclf.score(X_test,y_test)

0.8809523809523809

In [93]:
# One hand-written sample with seven feature values, in the same column
# order as the training features.
# NOTE(review): 1.84 in the second position is far below the values shown
# for that column in the table preview (roughly 14-15) — possibly a typo;
# confirm the intended value.
x = [[8.26,1.84,0.8710,5.763,3.312,3.2210,6.220]]
knnclf.predict(x)

array(['Canadian'], dtype=object)

#### 3、改进约会网站的匹配效果

读取datingTestSet.txt文件，最后一列是喜欢程度。模型：根据前几列的信息，预测喜欢程度

In [97]:
# Tab-separated file without a header row; header=None makes pandas label
# the columns with integers.
dating = pd.read_csv('data/datingTestSet.txt',sep='\t',header=None)
# Alternative data file, kept for reference:
# pd.read_csv('../data/datingTestSet2.txt',sep='\t',header=None)

In [98]:
# Every column except the last is a feature; the last column is the label.
train, target = dating.iloc[:, :-1], dating.iloc[:, -1]

In [122]:
train

Unnamed: 0,0,1,2
0,40920,8.326976,0.953952
1,14488,7.153469,1.673904
2,26052,1.441871,0.805124
3,75136,13.147394,0.428964
4,38344,1.669788,0.134296
5,72993,10.141740,1.032955
6,35948,6.830792,1.213192
7,42666,13.276369,0.543880
8,67497,8.631577,0.749278
9,35483,12.273169,1.508053


In [117]:
# No test_size is passed here, so scikit-learn's default hold-out fraction
# applies.
# NOTE(review): the normalized run further down splits with test_size=0.2,
# so the two reported scores come from differently sized test sets —
# confirm whether the comparison is meant to use the same split.
X_train,X_test,y_train,y_test = train_test_split(train,target,random_state=3)

In [121]:
# Baseline: 9-nearest-neighbour classifier on the raw, unscaled features.
knnclf = KNeighborsClassifier(n_neighbors=9)
knnclf.fit(X_train,y_train)
# Mean accuracy on the held-out split (displayed as the cell result).
knnclf.score(X_test,y_test)

0.808

In [131]:
# Check how feature magnitude affects the result: column 0 is far larger in
# scale than the other columns, so the KNN prediction is probably driven
# mainly by that column.

# Column 0 on its own, as an (n_samples, 1) array.
train1 = train[[0]].values

X_train1, X_test1, y_train1, y_test1 = train_test_split(
    train1, target, random_state=3
)

In [132]:
# Same 9-neighbour model, trained on column 0 only.
knnclf1 = KNeighborsClassifier(n_neighbors=9)
knnclf1.fit(X_train1,y_train1)
# Accuracy with the single feature, for comparison with the all-feature score.
knnclf1.score(X_test1,y_test1)

0.808

In [142]:
from sklearn.preprocessing import MinMaxScaler, Normalizer

# Normalizer rescales each *row* to unit norm, so on an (n, 1) array it turns
# every non-zero value into 1.0 and the column becomes constant. Scaling a
# single feature needs a column-wise scaler: MinMaxScaler maps the column
# onto the range [0, 1]. The values are cast to float first because the
# scaler works in floating point.
train0 = MinMaxScaler().fit_transform(
    train[0].values.astype(float).reshape(-1, 1)
)

In [149]:
# Normalization step: divide every column by that column's total.
train_norml = train.div(train.sum())

In [151]:
# Split the scaled features, holding out 20% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(
    train_norml, target, test_size=0.2, random_state=3
)

In [156]:
# Refit the 9-neighbour model on the scaled features and report its accuracy
# on the held-out split in one chained call.
knnclf = KNeighborsClassifier(n_neighbors=9)
knnclf.fit(X_train,y_train).score(X_test,y_test)

0.935

In [157]:
# Scaled column 0 on its own, as an (n_samples, 1) array.
train1 = train_norml[[0]].values
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    train1, target, test_size=0.2, random_state=3
)

In [158]:
# 9-neighbour model on the scaled column 0 alone; fit and score in one
# chained call.
knnclf1 = KNeighborsClassifier(n_neighbors=9)
knnclf1.fit(X_train1,y_train1).score(X_test1,y_test1)

0.8