### 单变量KNN

In [2]:
# 数据加载
import pandas as pd

features = ['accommodates', 'bedrooms', 'bathrooms', 'beds', 'price', 'minimum_nights', 'maximum_nights', 'number_of_reviews']
df = pd.read_csv('./listings.csv')[features]

print(df.shape)  # （测试语句）

df.head()  # （测试语句）

(3723, 8)


Unnamed: 0,accommodates,bedrooms,bathrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
0,4,1.0,1.0,2.0,$160.00,1,1125,0
1,6,3.0,3.0,3.0,$350.00,2,30,65
2,1,1.0,2.0,1.0,$50.00,2,1125,1
3,2,1.0,1.0,1.0,$95.00,1,1125,0
4,4,1.0,1.0,1.0,$50.00,7,1125,0


In [26]:
# 确定K（此处K=5）个近邻
import numpy as np

our_bedrooms_value = 3

df['bedrooms_distances'] = np.abs(df.bedrooms - our_bedrooms_value)
df.bedrooms_distances.value_counts().sort_index()  # 计算不同距离的数目并排序（测试语句）

0.0     190
1.0     555
2.0    2554
3.0     400
5.0       2
7.0       1
Name: bedrooms_distances, dtype: int64

In [27]:
df = df.sample(frac=1, random_state=0) # 数据洗牌
df = df.sort_values('bedrooms_distances') # 按bedrooms_distances升序排序dataframe
df.head()
df.price.head()  # 确定K=5个近邻

2732    $200.00
3683    $350.00
2010    $195.00
3355    $150.00
872     $600.00
Name: price, dtype: object

In [28]:
# 取K个近邻的均值作为预测结果
df['price'] = df.price.str.replace(r'\$|,','').astype(float)

mean_price = df.price.iloc[:5].mean()
mean_price
                            

299.0

### 模型评估

In [29]:
# 整个数据集的前75%作为训练集，后25%作为测试集
df.drop('bedrooms_distances', axis=1)

df_train = df.copy().iloc[:2792]
df_test = df.copy().iloc[2792:]

In [30]:
# 训练+预测
def predict_price(new_listing_value,feature_column):
    df_temp = df_train
    df_temp['bedrooms_distances'] = np.abs(df[feature_column] - new_listing_value)
    df_temp = df_temp.sort_values('bedrooms_distances')
    
    knn_5 = df_temp.price.iloc[:5]  # 取K=5个近邻
    predicted_price = knn_5.mean()  # 取K个近邻的均值作为预测值
    
    return predicted_price

In [31]:
df_test['predicted_price'] = df_test.bedrooms.apply(predict_price,feature_column='bedrooms')
# print(df_test)

# 求均方误差
df_test['squared_error'] = (df_test['predicted_price'] - df_test['price'])**2
mse = df_test['squared_error'].mean()
rmse = mse ** (1/2)
rmse

107.53357430392552

### 取不同的单个特征评估模型

In [32]:
for feature in ['accommodates','bedrooms','bathrooms','number_of_reviews']:
    df_test['predicted_price'] = df_test[feature].apply(predict_price, feature_column=feature)

    df_test['squared_error'] = (df_test['predicted_price'] - df_test['price'])**2
    mse = df_test['squared_error'].mean()
    rmse = mse ** (1/2)
    print("RMSE for the {} column: {}".format(feature,rmse))

RMSE for the accommodates column: 101.06532370839945
RMSE for the bedrooms column: 107.53357430392552
RMSE for the bathrooms column: 95.16420384158866
RMSE for the number_of_reviews column: 148.05411818629923


### 数据标准化处理

In [33]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
features = ['accommodates','bedrooms','bathrooms','beds','price','minimum_nights','maximum_nights','number_of_reviews']

df = pd.read_csv('G:/master/ml/algorithms/KNN/data/listings.csv')[features]

df['price'] = df.price.str.replace(r'\$|,','').astype(float)

df = df.dropna()  # 删除有缺失值的数据

df[features] = StandardScaler().fit_transform(df[features])  # z-score标准化处理

normalized_df = df

print(normalized_df.shape)

normalized_df.head()

(3671, 8)


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Unnamed: 0,accommodates,bedrooms,bathrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
0,0.40142,-0.249501,-0.439211,0.297386,0.081119,-0.341421,-0.016575,-0.516779
1,1.399466,2.129508,2.969551,1.141704,1.462622,-0.065047,-0.016606,1.706767
2,-1.095648,-0.249501,1.26517,-0.546933,-0.718699,-0.065047,-0.016575,-0.482571
3,-0.596625,-0.249501,-0.439211,-0.546933,-0.391501,-0.341421,-0.016575,-0.516779
4,0.40142,-0.249501,-0.439211,-0.546933,-0.718699,1.316824,-0.016575,-0.516779


### 多变量KNN

In [34]:
# 准备标准化后的训练集和测试集
norm_df_train = normalized_df.copy().iloc[:2792]  # 2792条训练样本
norm_df_test = normalized_df.copy().iloc[2792:]   # 879条测试样本

In [35]:
from scipy.spatial import distance

def predict_price_multivariate(new_listing_value,feature_columns):
    df_temp = norm_df_train
    
    # 针对每条测试样本点，计算该样本数据与训练集中每个训练样本的欧式距离
    df_temp['distance'] = distance.cdist(df_temp[feature_columns],[new_listing_value[feature_columns]])
    df_temp = df_temp.sort_values('distance')
    
    knn_5 = df_temp.price.iloc[:5]  # 取距离最近的K=5个近邻
    predicted_price = knn_5.mean()
    return(predicted_price)

# 指定训练特征
cols = ['accommodates', 'bedrooms', 'bathrooms', 'beds', 'minimum_nights', 'maximum_nights', 'number_of_reviews']
norm_df_test['predicted_price'] = norm_df_test[cols].apply(predict_price_multivariate,feature_columns=cols,axis=1)    
norm_df_test['squared_error'] = (norm_df_test['predicted_price'] - norm_df_test['price'])**2
print(norm_df_test)

mse = norm_df_test['squared_error'].mean()
rmse = mse ** (1/2)
rmse

      accommodates  bedrooms  bathrooms      beds     price  minimum_nights  \
2839     -1.095648 -0.249501  -0.439211 -0.546933 -0.900476       -0.341421   
2840     -0.596625 -0.249501  -0.439211 -0.546933 -0.645988       -0.341421   
2841      0.401420 -0.249501  -0.439211 -0.546933  0.001137       -0.341421   
2842      0.900443 -0.249501   1.265170 -0.546933 -0.093387        1.316824   
2843     -0.596625 -1.439006  -0.439211 -0.546933 -0.427856       -0.341421   
2844     -0.596625 -0.249501  -0.439211 -0.546933 -0.391501       -0.341421   
2845      1.898489  2.129508   1.265170  1.141704  4.371051       -0.065047   
2846     -0.596625 -0.249501  -0.439211 -0.546933 -0.391501       -0.065047   
2847      0.401420 -0.249501  -0.439211  0.297386 -0.500567       -0.065047   
2848     -0.596625 -0.249501  -0.439211  0.297386 -0.347875        0.764075   
2849     -0.596625 -0.249501  -0.439211 -0.546933 -0.427856       -0.065047   
2850     -1.095648 -0.249501  -0.439211 -0.546933 -0

0.8365570295151261

### sklearn实现KNN

In [36]:
from sklearn.neighbors import KNeighborsRegressor
# 指定训练特征
cols = ['accommodates', 'bedrooms', 'bathrooms', 'beds', 'minimum_nights', 'maximum_nights', 'number_of_reviews']
# 实例化模型对象
knn = KNeighborsRegressor(n_neighbors=5)  # K值默认是5
# 训练模型
knn.fit(norm_df_train[cols], norm_df_train['price'])  # 第一个参数是特征数据集，第二个参数是标签数据集
# 预测
predictions = knn.predict(norm_df_test[cols])
predictions

array([-6.43080039e-01, -3.39149244e-01, -3.04248100e-01, -9.77496649e-02,
       -3.07156528e-01, -3.92955174e-01,  1.08161816e+00, -3.92955174e-01,
        2.14959102e-02, -3.42057672e-01, -1.77151906e-03, -4.56940605e-01,
        1.88730558e-01, -1.86456739e-01, -4.30764747e-01, -1.19562880e-01,
       -3.18790243e-01, -4.10405746e-01,  1.13111413e-01,  1.15432887e+00,
       -1.79185667e-01, -4.45306890e-01, -3.33332386e-01, -1.47192952e-01,
       -1.86456739e-01, -2.38808455e-01, -2.62075884e-01,  3.50148349e-01,
       -3.71141959e-01, -3.69687745e-01, -1.85002525e-01, -9.92038792e-02,
       -2.03907311e-01, -4.43852676e-01, -5.22380249e-01, -1.44284523e-01,
       -8.75701646e-02, -3.11519171e-01, -4.45306890e-01, -2.89705957e-01,
       -5.99400923e-02, -3.37695029e-01, -2.08269954e-01,  8.54760722e-01,
       -3.58054030e-01, -3.11519171e-01, -1.90819382e-01, -5.12148063e-02,
        4.63577067e-01,  2.89071347e-01,  2.04285383e+00, -2.89705957e-01,
        2.94888204e-01, -

In [39]:
from sklearn.metrics import mean_squared_error

# 计算均方根误差RMSE
mse = mean_squared_error(norm_df_test['price'], predictions)
rmse = mse ** (1/2)
rmse

0.8243838530880285