In [1]:
import pandas as pd
import numpy as np

# 1. Load the data
# The path is relative to the notebook's working directory.
train_csv_path = "FBlocation/train.csv"
data = pd.read_csv(train_csv_path)

In [2]:
# 2. Basic data processing
# 1) Narrow the data to the window 2 < x < 2.5 and 1.0 < y < 1.5 (bounds exclusive)
in_x_window = (data["x"] > 2) & (data["x"] < 2.5)
in_y_window = (data["y"] > 1.0) & (data["y"] < 1.5)
data = data.loc[in_x_window & in_y_window]

In [3]:
# 2) Build the time features
# The raw "time" values are interpreted as a number of seconds (unit="s")
# and converted to timestamps.
raw_time = data["time"]
time_value = pd.to_datetime(raw_time, unit="s")

In [4]:
# Wrap the converted timestamps in a DatetimeIndex, whose day / weekday / hour
# attributes are read when the feature columns are built.
date = pd.DatetimeIndex(data=time_value)

In [5]:
# Add the calendar features taken from the check-in timestamps.
# `data` is a filtered selection of the loaded frame at this point, and writing
# columns into such a selection in place triggers pandas' SettingWithCopyWarning.
# `assign` returns a new frame with the extra columns instead, which also keeps
# this cell safe to re-run.
data = data.assign(day=date.day, weekday=date.weekday, hour=date.hour)

In [6]:
# 3) Filter out places with few check-ins
# Number of non-null "row_id" entries per place. Selecting the column before
# counting yields the same Series without counting every other column first.
place_count = data.groupby("place_id")["row_id"].count()

In [7]:
# Keep only the rows whose place has a check-in count greater than 10.
frequent_place_ids = place_count[place_count > 10].index.values
is_frequent_place = data["place_id"].isin(frequent_place_ids)
data_final = data[is_frequent_place]

In [8]:
# Select the feature columns and the target column
feature_columns = ["x", "y", "accuracy", "day", "weekday", "hour"]
x = data_final[feature_columns]
y = data_final["place_id"]

In [9]:
# Sanity check: (rows, columns) of the feature frame
x.shape

(78730, 6)

In [10]:
# Sanity check: length of the target, reported as a 1-tuple
y.shape

(78730,)

In [11]:
# 数据集划分
from sklearn.model_selection import train_test_split

In [12]:
# Split into train and test sets (default proportions).
# The shuffle seed is fixed so that the split, and every score computed from it,
# is the same on each Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)

In [13]:
# 3. Feature engineering: standardization
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training split only,
# then apply that same fitted scaler to both splits.
transfer = StandardScaler().fit(x_train)
x_train = transfer.transform(x_train)
x_test = transfer.transform(x_test)

In [14]:
# 4. KNN estimator, tuned with grid search and cross-validation
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Candidate values for the number of neighbors
param_dict = {"n_neighbors": [4, 5, 6, 7]}
# 3-fold cross-validated search over the candidates, wrapped around a
# KNN classifier created with default settings
estimator = GridSearchCV(KNeighborsClassifier(), param_grid=param_dict, cv=3)
estimator.fit(x_train, y_train)

In [15]:
# 5. Model evaluation
# Approach 1: compare the predictions with the true labels element by element
y_predict = estimator.predict(x_test)
print("y_predict:\n", y_predict)
is_match = y_test == y_predict
print("直接比对真实值和预测值：\n", is_match)
# Approach 2: score the fitted search object on the held-out split
score = estimator.score(x_test, y_test)
print("准确率为：\n", score)
# Report what the grid search / cross-validation found, in a fixed order
search_report = (
    ("最佳参数：\n", estimator.best_params_),
    ("最佳结果：\n", estimator.best_score_),
    ("最佳估计器：\n", estimator.best_estimator_),
    ("交叉验证结果：\n", estimator.cv_results_),
)
for heading, value in search_report:
    print(heading, value)

y_predict:
 [2074133146 8904900131 1272823671 ... 5111412226 7396159924 8032468532]
直接比对真实值和预测值：
 23527582    False
9111359      True
19894450    False
14637092     True
20747779     True
            ...  
3683993      True
2454746     False
5880227      True
8365626     False
25163       False
Name: place_id, Length: 19683, dtype: bool
准确率为：
 0.37717827566935935
最佳参数：
 {'n_neighbors': 6}
最佳结果：
 0.343641495504099
最佳估计器：
 KNeighborsClassifier(n_neighbors=6)
交叉验证结果：
 {'mean_fit_time': array([0.08597151, 0.10704724, 0.07978646, 0.07877501]), 'std_fit_time': array([0.00914471, 0.02693808, 0.00373198, 0.00637084]), 'mean_score_time': array([0.92778691, 1.33356007, 0.96690734, 0.91444675]), 'std_score_time': array([0.23667145, 0.27106985, 0.05542792, 0.05783701]), 'param_n_neighbors': masked_array(data=[4, 5, 6, 7],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'n_neighbors': 4}, {'n_neighbors': 5}, {'n_neighbors': 6}, {'n_neigh