-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Tesei
committed
Jun 17, 2016
1 parent
1e19e33
commit e8b5f1c
Showing
5 changed files
with
589 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
|
||
import numpy as np | ||
import pandas as pd | ||
from sklearn.preprocessing import LabelEncoder | ||
from sklearn.linear_model import SGDClassifier | ||
from sklearn.ensemble import RandomForestClassifier | ||
from sklearn.neighbors import KNeighborsClassifier | ||
from sklearn import tree | ||
import itertools | ||
|
||
def prepare_data(df, n_cell_x, n_cell_y):
    """
    Feature engineering and computation of the grid.

    Assigns each row to a cell of an n_cell_x by n_cell_y grid laid over
    the [0, 10] x [0, 10] coordinate space.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain float columns 'x' and 'y' with values in [0, 10].
    n_cell_x, n_cell_y : int
        Number of grid cells along the x and y axes.

    Returns
    -------
    pd.DataFrame
        The same frame, mutated in place, with an added integer
        'grid_cell' column (row-major cell id: pos_y * n_cell_x + pos_x).
    """
    # Creating the grid
    size_x = 10. / n_cell_x
    size_y = 10. / n_cell_y
    # Nudge coordinates down by eps so points exactly on a cell's upper
    # boundary (e.g. x == 10.0) land in the last cell instead of one
    # past the end of the grid.
    eps = 0.00001
    xs = np.where(df.x.values < eps, 0, df.x.values - eps)
    ys = np.where(df.y.values < eps, 0, df.y.values - eps)
    # np.int was removed in NumPy 1.24; use the explicit fixed-width type.
    pos_x = (xs / size_x).astype(np.int64)
    pos_y = (ys / size_y).astype(np.int64)
    df['grid_cell'] = pos_y * n_cell_x + pos_x

    return df
if __name__ == '__main__':
    # Base directory of the Kaggle "Facebook V" competition data.  All
    # backslashes doubled: the original mixed '\\' with a lone '\d',
    # which only produced the right path because '\d' is not a
    # recognized escape sequence (a SyntaxWarning from Python 3.12 on).
    DATA_DIR = ('C:\\Users\\gtesei\\Desktop\\Deloitte\\C_Folder\\'
                'Cognitive_Technologies\\Machine_Learning\\git\\'
                'fast-furious\\dataset\\facebook-v-predicting-check-ins\\')

    print('Loading data ...')
    df_train = pd.read_csv(DATA_DIR + 'train.csv',
                           dtype={'x': np.float32,
                                  'y': np.float32,
                                  'accuracy': np.int16,
                                  'time': np.int64,
                                  'place_id': np.int64},
                           index_col=0)
    df_test = pd.read_csv(DATA_DIR + 'test.csv',
                          dtype={'x': np.float32,
                                 'y': np.float32,
                                 'accuracy': np.int16,
                                 'time': np.int64},
                          index_col=0)
    df_sub = pd.read_csv(DATA_DIR + 'sample_submission.csv', index_col=0)

    # 'accuracy' is not used as a feature.  Keyword axis= — the
    # positional form was removed in pandas 2.0.
    df_train = df_train.drop(['accuracy'], axis=1)
    df_test = df_test.drop(['accuracy'], axis=1)

    # Defining the size of the grid
    n_cell_x = 10
    n_cell_y = 30

    # Sanity check: negative place_ids would break label handling later.
    assert df_train[df_train.place_id.values < 0].shape[0] == 0

    print('Preparing train data')
    df_train = prepare_data(df_train, n_cell_x, n_cell_y)

    print('Preparing test data')
    df_test = prepare_data(df_test, n_cell_x, n_cell_y)

    ## Cross-validation grid over time-window length (minutes) and k.
    grid_id = 180          # single grid cell under inspection
    th = 2                 # keep place_ids with at least th check-ins
    # NOTE(review): the original had 24 * 40 for the third window —
    # almost certainly a typo for one day (24 * 60 minutes), given the
    # surrounding values are all whole multiples of a day.
    g_dict = {'delta_time': [60, 3 * 60, 24 * 60, 2 * 24 * 60,
                             4 * 24 * 60, 7 * 24 * 60, 15 * 24 * 60,
                             30 * 24 * 60, 45 * 24 * 60],
              'k': [5, 10, 15, 20, 25, 30, 35]}
    cv_grid = pd.DataFrame(list(itertools.product(*g_dict.values())),
                           columns=g_dict.keys())
    cv_grid['MAP@3'] = -1

    # Working on df_train: restrict to one cell, drop rare place_ids.
    df_cell_train = df_train.loc[df_train.grid_cell == grid_id]
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    df_cell_train = df_cell_train.loc[mask]

    # Working on df_test
    df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    row_ids = df_cell_test.index

    ## Dump each time window for every (delta_time, k) combination.
    for index, row in cv_grid.iterrows():
        print(str(index) + "-delta_time:" + str(row['delta_time']) +
              "-k:" + str(row['k']))
        # .copy() avoids mutating df_cell_train through a view on every
        # iteration (SettingWithCopy hazard in the original).
        fmat = df_cell_train.copy()
        fmat['win'] = (fmat.time.values // row['delta_time'])
        wins = np.sort(np.unique(fmat.win.values))
        for x in range(wins.shape[0]):
            fmatt = fmat.loc[fmat.win == wins[x]]
            print(str(fmatt))
        print("**********************************")
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,151 @@ | ||
# coding: utf-8 | ||
__author__ = 'Sandro Vega Pons : https://www.kaggle.com/svpons' | ||
|
||
'''Partially based on grid_plus_classifier script: | ||
https://www.kaggle.com/svpons/facebook-v-predicting-check-ins/grid-plus-classifier | ||
''' | ||
|
||
import numpy as np | ||
import pandas as pd | ||
from sklearn.preprocessing import LabelEncoder | ||
from sklearn.neighbors import KNeighborsClassifier | ||
|
||
|
||
def prepare_data(df, n_cell_x, n_cell_y):
    """
    Feature engineering and computation of the grid.

    Adds a 'grid_cell' id, converts the 'time' column (minutes since
    2014-01-01T01:01) into weighted calendar features, scales x and y by
    their feature weights, and drops the raw 'time' column.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'x', 'y' in [0, 10] and an integer 'time' column
        expressed in minutes.
    n_cell_x, n_cell_y : int
        Number of grid cells along the x and y axes.

    Returns
    -------
    pd.DataFrame
        The transformed frame (same object, mutated, without 'time').
    """
    # Creating the grid
    size_x = 10. / n_cell_x
    size_y = 10. / n_cell_y
    # Nudge coordinates down so points on a cell's upper edge fall into
    # the last cell rather than one past the end of the grid.
    eps = 0.00001
    xs = np.where(df.x.values < eps, 0, df.x.values - eps)
    ys = np.where(df.y.values < eps, 0, df.y.values - eps)
    # np.int was removed in NumPy 1.24; use the explicit fixed-width type.
    pos_x = (xs / size_x).astype(np.int64)
    pos_y = (ys / size_y).astype(np.int64)
    df['grid_cell'] = pos_y * n_cell_x + pos_x

    # Feature engineering
    #fw = [500, 1000, 4, 3, 1. / 22., 2, 10] # feature weights (black magic here)
    fw = [14.689474, 480.326019, 4.083116, 3.251665, 4.081971, 3.763649, 2.40133]
    df.x = df.x.values * fw[0]
    df.y = df.y.values * fw[1]
    # 'time' is minutes elapsed since this epoch.  np.datetime64 takes
    # no dtype= keyword (TypeError on modern NumPy); the minute unit is
    # inferred from the string.  The vectorized timedelta addition also
    # replaces the original per-row Python generator loop.
    initial_date = np.datetime64('2014-01-01T01:01')
    d_times = pd.DatetimeIndex(initial_date +
                               df.time.values.astype('timedelta64[m]'))
    df['hour'] = d_times.hour * fw[2]
    df['weekday'] = d_times.weekday * fw[3]
    df['day'] = (d_times.dayofyear * fw[4]).astype(int)
    df['month'] = d_times.month * fw[5]
    df['year'] = (d_times.year - 2013) * fw[6]

    df = df.drop(['time'], axis=1)
    return df
|
||
|
||
def process_one_cell(df_train, df_test, grid_id, th):
    """
    Classification inside one grid cell.

    Fits a k-NN classifier on the training rows of one grid cell and
    predicts the top-3 place_ids for the test rows of the same cell.

    Parameters
    ----------
    df_train, df_test : pd.DataFrame
        Prepared frames containing a 'grid_cell' column (train also has
        'place_id').
    grid_id : int
        Id of the grid cell to process.
    th : int
        Minimum number of occurrences for a place_id to be kept.

    Returns
    -------
    pred_labels : np.ndarray, shape (n_test_rows_in_cell, 3)
        Top-3 predicted place_ids per test row, best first.
    row_ids : pd.Index
        Index (row_id) of the test rows in this cell.
    """
    # Working on df_train: keep only this cell, drop rare place_ids.
    df_cell_train = df_train.loc[df_train.grid_cell == grid_id]
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    df_cell_train = df_cell_train.loc[mask]

    # Working on df_test
    df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    row_ids = df_cell_test.index

    # Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id', 'grid_cell'], axis=1).values.astype(int)
    X_test = df_cell_test.drop(['grid_cell'], axis=1).values.astype(int)

    # Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=25, weights='distance',
                               metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)
    # Top-3 classes per row, best first.  Index classes_ directly
    # instead of calling le.inverse_transform, which rejects 2-D input
    # on modern scikit-learn (it funnels y through column_or_1d).
    top3 = np.argsort(y_pred, axis=1)[:, ::-1][:, :3]
    pred_labels = le.classes_[top3]
    assert pred_labels[pred_labels < 0].shape[0] == 0
    return pred_labels, row_ids
|
||
|
||
def process_grid(df_train, df_test, th, n_cells):
    """
    Iterates over all grid cells, aggregates the results and makes the
    submission.

    Runs process_one_cell for every cell id in [0, n_cells), collects
    the top-3 place_id predictions per test row, and writes them to
    'sub_knn.csv' with one space-separated triple per row.
    """
    preds = np.zeros((df_test.shape[0], 3), dtype=np.int64)

    for cell_id in range(n_cells):
        # Progress marker every 100 cells.
        if not cell_id % 100:
            print('iter: %s' % (cell_id))

        # Classify one grid cell and store its top-3 predictions at the
        # positions given by the cell's test row ids.
        labels, rows = process_one_cell(df_train, df_test, cell_id, th)
        assert not labels[labels < 0].shape[0]
        preds[rows] = labels

    print('Generating submission file ...')
    # Join the three best labels per sample into one string column.
    triples = [' '.join(triple) for triple in preds.astype(str)]
    ds_sub = pd.Series(triples, name='place_id')
    ds_sub.to_csv('sub_knn.csv', index=True, header=True, index_label='row_id')
|
||
|
||
if __name__ == '__main__':
    # Base directory of the Kaggle "Facebook V" competition data.  All
    # backslashes doubled: the original mixed '\\' with a lone '\d',
    # which only produced the right path because '\d' is not a
    # recognized escape sequence (a SyntaxWarning from Python 3.12 on).
    DATA_DIR = ('C:\\Users\\gtesei\\Desktop\\Deloitte\\C_Folder\\'
                'Cognitive_Technologies\\Machine_Learning\\git\\'
                'fast-furious\\dataset\\facebook-v-predicting-check-ins\\')

    print('Loading data ...')
    df_train = pd.read_csv(
        DATA_DIR + 'train.csv',
        dtype={
            'x': np.float32,
            'y': np.float32,
            'accuracy': np.int16,
            'time': np.int64,
            'place_id': np.int64},
        index_col=0)
    df_test = pd.read_csv(
        DATA_DIR + 'test.csv',
        dtype={
            'x': np.float32,
            'y': np.float32,
            'accuracy': np.int16,
            'time': np.int64},
        index_col=0)

    # 'accuracy' is not used as a feature.  Keyword axis= — the
    # positional form was removed in pandas 2.0.
    df_train = df_train.drop(['accuracy'], axis=1)
    df_test = df_test.drop(['accuracy'], axis=1)

    # Defining the size of the grid
    n_cell_x = 10
    n_cell_y = 30

    # Sanity check: negative place_ids would break label handling later.
    assert df_train[df_train.place_id.values < 0].shape[0] == 0

    print('Preparing train data')
    df_train = prepare_data(df_train, n_cell_x, n_cell_y)

    print('Preparing test data')
    df_test = prepare_data(df_test, n_cell_x, n_cell_y)

    # Solving classification problems inside each grid cell
    th = 4  # Keeping place_ids with more than th samples.
    process_grid(df_train, df_test, th, n_cell_x * n_cell_y)
Oops, something went wrong.