In [1]:
# - 加载数据集
# - 数据的基本处理
#   - 缩小数据规模
#   - 选择有效的时间特征
#   - 去掉签到位置少的地方
#   - 确定特征值和目标值
#   - 分割数据集
# - 特征工程(特征预处理-标准化)
# - 机器学习(模型训练) KNN + GridSearchCV
# - 模型评估
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier


In [2]:
# - 加载数据集
data = pd.read_csv('./data/FBlocation/train.csv')

In [12]:
# - 数据的基本处理
#   - 缩小数据规模
facebook_data = data.query('x>2.0 & x<2.5 & y>2.0 & y<2.5')
#   - 选择有效的时间特征
# 1. 把单位为s时间转换为date_time
date_time = pd.to_datetime(facebook_data['time'], unit='s')
# 2. 把date_time转换为DatetimeIndex
date_time = pd.DatetimeIndex(date_time)
# 3. 添加时间特征的列: 天, 周, 小时
facebook_data['day'] = date_time.day
facebook_data['weekday'] = date_time.weekday
facebook_data['hour'] = date_time.hour
#   - 去掉签到位置少的地方
# facebook_data
# 1. 统计每一个位置签到数量
place_counts = facebook_data.groupby('place_id').count()
# 2. 选择签到位置大于3的地方
place_counts = place_counts[place_counts['row_id']>3]
# 3. 从数据中 去取 签到位置大于3的
facebook_data = facebook_data[facebook_data['place_id'].isin(place_counts.index)]
# facebook_data
#   - 确定特征值和目标值
x = facebook_data[['x', 'y', 'accuracy', 'day', 'weekday', 'hour']]
y = facebook_data['place_id']

#   - 分割数据集
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=8)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [19]:
# x_train
# x_test
# y_train
# y_test
# - 特征工程(特征预处理-标准化)
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  


In [20]:
# - 机器学习(模型训练) KNN + GridSearchCV
estimator = KNeighborsClassifier()
param_grid = {'n_neighbors': [3, 5, 7, 9, 11]}
estimator = GridSearchCV(estimator, param_grid=param_grid, cv=5)
# 模型训练
estimator.fit(x_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_neighbors': [3, 5, 7, 9, 11]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [23]:
# - 模型评估
y_pre = estimator.predict(x_test)
# print(y_pre == y_test)
# 计算准确率
score = estimator.score(x_test, y_test)
print('准确率', score)

# 网格搜索交叉验证的结果
print('交叉最好的分数', estimator.best_score_)
print('交叉验证最好模型', estimator.best_estimator_)
print('交叉验证的结果', estimator.cv_results_)


准确率 0.36077959576515883
交叉最好的分数 0.3537868162692847
交叉验证最好模型 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
交叉验证的结果 {'mean_fit_time': array([0.03345513, 0.032127  , 0.03146172, 0.03155689, 0.02996039]), 'std_fit_time': array([0.00116905, 0.00225912, 0.00091653, 0.00170547, 0.00061373]), 'mean_score_time': array([0.34341044, 0.39604506, 0.44030552, 0.48912234, 0.52861471]), 'std_score_time': array([0.01029373, 0.01648934, 0.01115806, 0.01483379, 0.00949124]), 'param_n_neighbors': masked_array(data=[3, 5, 7, 9, 11],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'n_neighbors': 3}, {'n_neighbors': 5}, {'n_neighbors': 7}, {'n_neighbors': 9}, {'n_neighbors': 11}], 'split0_test_score': array([0.33038289, 0.34281452, 0.33843859, 0.33814023, 0.3339632 ]), 'split1_test_score': array([0.33848954, 0.34880194, 0

In [None]:
# - 加载数据集
# - 数据的基本处理
#   - 缩小数据规模
#   - 选择有效的时间特征
#   - 去掉签到位置少的地方
#   - 确定特征值和目标值
#   - 分割数据集
# - 特征工程(特征预处理-标准化)
# - 机器学习(模型训练) KNN + GridSearchCV
# - 模型评估