In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
# Read the training and test check-in CSVs, using `row_id` as the index of each frame.
df_train, df_test = (
    pd.read_csv(csv_path, index_col='row_id')
    for csv_path in (
        '/Users/dominicdebiaso/Development/datasets/kaggle_fb_checkin_train.csv',
        '/Users/dominicdebiaso/Development/datasets/kaggle_fb_checkin_test.csv',
    )
)

### Data Munging

In [3]:
# Feature engineering using 'time' variable
def extract_time(df):
    """Expand the numeric 'time' column of `df` into calendar features.

    The 'time' column is parsed as a count of minutes since the Unix epoch
    (``unit='m'``) and overwritten with the resulting timestamps. The columns
    'minute', 'hour', 'dayofyear', 'weekofyear', 'month' and 'year' are then
    derived from those timestamps.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a numeric 'time' column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `df` object that was passed in, with the feature columns added.
    """
    # Parse the frame that was passed in. Reading a global training frame here
    # would give every other frame the training timestamps (aligned on index).
    df['time'] = pd.to_datetime(df['time'], unit='m')
    stamps = df['time'].dt
    df['minute'] = stamps.minute
    df['hour'] = stamps.hour
    df['dayofyear'] = stamps.dayofyear
    # `.dt.weekofyear` was removed in pandas 2.0; the ISO calendar week is its
    # replacement (note: it comes back as pandas' nullable UInt32 dtype).
    df['weekofyear'] = stamps.isocalendar().week
    df['month'] = stamps.month
    df['year'] = stamps.year
    return df

# Derive the time-based feature columns for the training frame, then the test frame.
train, test = map(extract_time, (df_train, df_test))

In [4]:
# Split data into X and y
# NOTE(review): X_train / X_test still carry the datetime 'time' column —
# confirm the estimator accepts it, otherwise drop it from both frames here.
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally.
X_train = train.drop('place_id', axis=1)
X_test = test
y_train = train['place_id']

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)

(29118021, 10)
(8607230, 10)
(29118021,)


### Classification

In [6]:
# Initialize and fit model
knn = KNeighborsClassifier()
param_grid = {
    'n_neighbors':range(5,50,10),
    'leaf_size':range(10,80,20),
    'weights':['uniform', 'distance'],
    'metric':['euclidean', 'manhatten', 'chebyshev', 'minkowski']
}
knn_grid = GridSearchCV(knn, param_grid, cv=5, scoring='average_precision')
%time knn_grid.fit(X_train, y_train)
print(knn_grid.best_score_)
print(knn_grid.best_params_)

In [None]:
# Predict
# One predicted place_id per test row, using the estimator refit by the grid search.
y_pred = knn_grid.predict(X_test)
# Index by the rows that were actually predicted (the test set, not the training
# set), and pass the column name as a list as DataFrame requires.
df_knn = pd.DataFrame(y_pred, index=X_test.index, columns=['place_id'])
# df_knn.to_csv('/Users/dominicdebiaso/Desktop/kaggle_facebook_checkins_knn.csv')