## Import libraries

In [None]:
import io
import math
import numpy as np
import pandas as pd
import random
import tensorflow as tf
import time
from tensorflow.keras.callbacks import Callback, ModelCheckpoint
from tensorflow.keras.layers import Activation, Bidirectional, Dense, Dropout, LSTM, GRU
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import MinMaxScaler

## Load data

In [None]:
# Load the training and test tables; the first CSV column becomes the row index.
df_train = pd.read_csv('data_train.csv', index_col=0)
df_test = pd.read_csv('data_test.csv', index_col=0)

## Initialize

In [None]:
# Inclusive bounds of the axis-aligned rectangle that points are tested against
# (used by get_target_value and by the `entry_in_cc` feature of Method 2).
# Units are those of the x/y columns in the input data — presumably projected
# map coordinates; TODO confirm.
X_MIN = 3750901.5068
X_MAX = 3770901.5068
Y_MIN = -19268905.6133
Y_MAX = -19208905.6133

In [None]:
def get_target_value(df, col1='x', col2='y'):
    """Flag the rows of *df* whose point lies inside the target rectangle.

    A row is flagged with 1 when ``X_MIN <= df[col1] <= X_MAX`` and
    ``Y_MIN <= df[col2] <= Y_MAX`` (all bounds inclusive), otherwise 0.
    Missing coordinates compare as outside the rectangle.

    Args:
        df: Frame holding the coordinate columns. It is modified in place:
            a ``'target'`` column with the flags is added or overwritten.
        col1: Name of the x-coordinate column.
        col2: Name of the y-coordinate column.

    Returns:
        Integer Series of 0/1 flags aligned with ``df.index``.
    """
    # Vectorised comparison: works for any index (a positional ``df.at[idx]``
    # loop only works when the labels are exactly 0..n-1) and for an empty
    # frame, and avoids one Python-level lookup per cell.
    inside = df[col1].between(X_MIN, X_MAX) & df[col2].between(Y_MIN, Y_MAX)
    df['target'] = inside.astype('int')
    return df['target'].astype('int')

## Method 1

In [None]:
def preprocess(df):
    """Reshape trajectory rows into one time-ordered point per row.

    Every input row contributes two points, its entry and its exit, each
    carrying ``hash``, ``trajectory_id``, ``time``, ``x`` and ``y``. The
    clock time is parsed from ``HH:MM:SS`` text and additionally split into
    ``hour``, ``minute`` and ``second`` columns. The velocity columns
    ``vmax``, ``vmin`` and ``vmean`` are discarded.

    Args:
        df: Raw trajectory table; it is not modified.

    Returns:
        A new frame ordered by ``hash`` and then ``time`` with a fresh
        0..n-1 index.
    """
    trajectories = df.drop(columns=['vmax', 'vmin', 'vmean'])

    endpoints = []
    for time_col, x_col, y_col in (('time_entry', 'x_entry', 'y_entry'),
                                   ('time_exit', 'x_exit', 'y_exit')):
        selected = trajectories[['hash', 'trajectory_id', time_col, x_col, y_col]]
        endpoints.append(
            selected.rename(columns={time_col: 'time', x_col: 'x', y_col: 'y'}))

    # All entry points first, then all exit points, each in input order.
    points = pd.concat(endpoints, ignore_index=True)

    points['time'] = pd.to_datetime(points['time'], format='%H:%M:%S')
    clock = points['time'].dt
    points['hour'] = clock.hour
    points['minute'] = clock.minute
    points['second'] = clock.second

    return points.sort_values(['hash', 'time']).reset_index(drop=True)

In [None]:
# Method 1 working frame: one row per entry/exit point of the training trajectories.
df = preprocess(df_train)

In [None]:
# Number of points per `hash`; used further down to size the model and the split.
hash_count = df.groupby('hash').size().reset_index(name='count')
# Disabled filter: would drop every hash that has at most two points.
# df = df[~df['hash'].isin(hash_count[hash_count['count'] <= 2]['hash'])]

In [None]:
# Columns handed to the scaler: everything except the two identifiers and the
# raw timestamp.
num_cols = list(df.columns.drop(['hash', 'trajectory_id', 'time']))

# Fit on this frame and rescale it in a single step; the fitted `scaler` stays
# available so that other data can be transformed consistently.
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

In [None]:
# Hold out the chronologically last point of every hash as the target (y) and
# keep all earlier points as the input sequence (X).
#
# Sorting on the two keys together is stable, so rows that share a timestamp
# keep their existing relative order. A single-column sort on 'time' uses an
# unstable algorithm by default and could let `tail(1)` pick the wrong row when
# the last entry and exit of a hash have the same time.
ordered = df.sort_values(['hash', 'time'], kind='mergesort')
y = ordered.groupby('hash').tail(1)
X = ordered[~ordered.index.isin(y.index)]

In [None]:
# Position of each input point inside its hash's sequence (0, 1, 2, ...).
group = X.groupby('hash').cumcount()

# Pivot to one row per hash with one column per (feature, position); positions
# that a shorter sequence lacks are filled with 0. Stacking the position level
# back gives every hash the same number of rows.
wide = (X.drop(['trajectory_id', 'time'], axis=1)
         .set_index(['hash', group])
         .unstack(fill_value=0))
padded = wide.stack()

# One nested list (positions x features) per hash.
X = padded.groupby(level=0).apply(lambda seq: seq.values.tolist()).tolist()

# Regression target: the coordinates of the held-out point of each hash.
y = y[['x', 'y']].values.tolist()

In [None]:
# Per-hash point counts with a clean 0..n-1 index; its length is the number of sequences.
full_seqlen = hash_count.reset_index(drop=True)
# full_seqlen = hash_count[hash_count['count'] > 2].reset_index(drop=True)

# Number of per-step model inputs: every column except identifiers and raw timestamp.
feature_len = len(df.columns.drop(['hash', 'trajectory_id', 'time']))

# Largest number of points recorded for any single hash.
max_timestep = int(hash_count['count'].max())

In [None]:
# Random 80/20 split over sequence positions. `random` is not seeded here, so
# the partition differs from run to run.
n_sequences = len(full_seqlen)
train_size = int(n_sequences * 0.8)
train_indices = random.sample(range(n_sequences), train_size)
test_indices = list(set(range(n_sequences)) - set(train_indices))


def _gather(sequences, positions):
    """Stack the selected sequences into one array, in the order given."""
    return np.asarray([sequences[pos] for pos in positions])


X_train, X_test = _gather(X, train_indices), _gather(X, test_indices)
y_train, y_test = _gather(y, train_indices), _gather(y, test_indices)

In [None]:
# Sanity check: show the array shapes of both splits.
for label, array in (('X_train:', X_train), ('y_train:', y_train),
                     ('X_test:', X_test), ('y_test:', y_test)):
    print(label, array.shape)

In [None]:
def build_model(layers):
    """Build and compile the stacked bidirectional-GRU regressor.

    Args:
        layers: Indexable with at least three entries.
            ``layers[0]`` is the per-step feature count, ``layers[1]`` the
            unit count of the two sequence-returning GRU layers, and
            ``layers[2]`` the unit count of both the last GRU layer and the
            dense output layer.

    Returns:
        The compiled ``Sequential`` model (Adam optimiser, MSE loss; the
        'acc', 'mse' and 'mae' metrics are tracked).
    """
    n_features, hidden_units, n_outputs = layers[0], layers[1], layers[2]
    model = Sequential()

    # Two recurrent layers that emit a full sequence, each followed by dropout.
    for _ in range(2):
        recurrent = GRU(units=hidden_units, input_shape=(None, n_features), return_sequences=True)
        model.add(Bidirectional(recurrent))
        model.add(Dropout(rate=0.1))

    # The last recurrent layer returns a single vector per sequence.
    model.add(Bidirectional(GRU(n_outputs, return_sequences=False)))
    model.add(Dropout(rate=0.1))

    model.add(Dense(units=n_outputs))
    model.add(Activation('tanh'))

    compile_started = time.time()
    model.compile(optimizer='adam', loss='mse', metrics=['acc', 'mse', 'mae'])
    print("Compilation Time : ", time.time() - compile_started)
    return model

In [None]:
# Arguments are [features per step, recurrent units, outputs]; note that
# build_model uses the second entry (max_timestep) as the GRU unit count.
model = build_model([feature_len, max_timestep, 2])

In [None]:
# Write weights.hdf5 only when the monitored quantity improves (save_best_only).
# NOTE(review): the held-out split is passed as validation data here, so the
# "test" score reported afterwards is not independent of checkpoint selection.
checkpointer = ModelCheckpoint(filepath='weights.hdf5', verbose=1, save_best_only=True)
model.fit(X_train, y_train, batch_size=128, epochs=200, validation_data=(X_test, y_test), verbose=1, callbacks=[checkpointer])

In [None]:
# model.summary()

In [None]:
# model.load_weights('weights.hdf5')

In [None]:
# Evaluate both splits; the first value returned by evaluate() is the loss (MSE).
trainScore = model.evaluate(X_train, y_train, verbose=0)
testScore = model.evaluate(X_test, y_test, verbose=0)

for template, score in (('Train Score: %.4f MSE (%.4f RMSE)', trainScore),
                        ('Test Score: %.4f MSE (%.4f RMSE)', testScore)):
    print(template % (score[0], math.sqrt(score[0])))

In [None]:
# Build the point table for the test trajectories.
final_df = preprocess(df_test)

# Rows containing a missing value are the points whose position must be
# predicted. Record them before the NaNs are overwritten below.
final_indices = final_df[final_df.isnull().any(axis=1)].index

# Scale with the scaler fitted on the training frame; missing values are
# replaced by 0 first so that transform() receives complete data.
final_df[num_cols] = scaler.transform(final_df[num_cols].fillna(0))

# Model input: every point that was fully known.
final_X = final_df[~final_df.index.isin(final_indices)]

# Same pivot / zero-fill / stack padding as used for the training sequences.
final_group = final_X.groupby('hash').cumcount()
final_X = (final_X.drop(['trajectory_id', 'time'], axis=1)
                  .set_index(['hash', final_group])
                  .unstack(fill_value=0)
                  .stack().groupby(level=0)
                  .apply(lambda x: x.values.tolist())
                  .tolist())

# One predicted (x, y) pair per hash, in the groupby (sorted hash) order.
pred = model.predict(np.asarray(final_X))

# Rows to fill in, re-indexed 0..m-1 so they line up with `pred` by position.
# NOTE(review): this alignment assumes exactly one incomplete row per hash and
# at least one complete row per hash — confirm against the test data.
final_result = final_df[final_df.index.isin(final_indices)][['trajectory_id', 'hour', 'minute', 'second']].reset_index(drop=True).copy()

final_result[['x', 'y']] = pd.DataFrame(pred)
# Undo the scaling so the coordinates are comparable with the X/Y bounds.
final_result[num_cols] = scaler.inverse_transform(final_result[num_cols])

final_result['target'] = get_target_value(final_result)

# Submission file: trajectory id plus the 0/1 inside-rectangle flag.
final_result[['trajectory_id', 'target']].to_csv('submission.csv', header=['id', 'target'], index=False)

In [None]:
import matplotlib.pyplot as plt

# Scatter of the point coordinates, coloured by a per-row label.
# NOTE(review): the frame built by the Method 1 cells has no 'actual' column,
# so this lookup raises KeyError unless the column is added elsewhere — confirm
# which label was meant to colour the points.
plt.scatter(df['x'], df['y'], c=df['actual'])

## Method 2

In [None]:
def preprocess(df):
    """Build the per-trajectory feature table used by Method 2.

    Starting from the raw trajectory rows this:

    * drops the velocity columns ``vmax``, ``vmin`` and ``vmean``;
    * derives ``step`` from the fourth ``_``-separated field of
      ``trajectory_id``;
    * parses ``time_entry`` / ``time_exit`` (``HH:MM:SS``) into hour, minute
      and second columns and adds ``time_delta``, the seconds component of
      exit minus entry;
    * adds ``entry_in_cc`` (1 when the entry point lies inside the
      ``X_MIN..X_MAX`` / ``Y_MIN..Y_MAX`` rectangle, bounds inclusive);
    * adds the mutually exclusive flags ``is_start_point`` (lowest step of
      the hash), ``is_end_point`` (highest step, unless it is also the
      lowest) and ``is_other_point``.

    Args:
        df: Raw trajectory table; it is not modified.

    Returns:
        A new frame without the parsed timestamp columns, ordered by
        ``hash`` and ``step`` with a fresh 0..n-1 index.
    """
    df = df.drop(['vmax', 'vmin', 'vmean'], axis=1)

    df['step'] = df['trajectory_id'].apply(lambda x: int(x.split('_')[3]))

    for time_col in ('time_entry', 'time_exit'):
        df[time_col] = pd.to_datetime(df[time_col], format='%H:%M:%S')
        df[time_col + '_hour'] = df[time_col].dt.hour
        df[time_col + '_minute'] = df[time_col].dt.minute
        df[time_col + '_second'] = df[time_col].dt.second

    df['time_delta'] = (df['time_exit'] - df['time_entry']).dt.seconds

    # First/last step of each hash, broadcast back onto the rows. Using
    # transform avoids merging an aggregated table whose column names
    # ('step_amin' / 'step_amax') depend on the installed NumPy/pandas versions.
    steps_by_hash = df.groupby('hash')['step']
    first_step = steps_by_hash.transform('min')
    last_step = steps_by_hash.transform('max')

    # A hash with a single step counts as a start point only, so the end flag
    # excludes rows already flagged as start.
    is_start = df['step'] == first_step
    is_end = (df['step'] == last_step) & ~is_start
    entry_inside = (df['x_entry'].between(X_MIN, X_MAX)
                    & df['y_entry'].between(Y_MIN, Y_MAX))

    # Column order matters: downstream code derives its feature list from it.
    df['entry_in_cc'] = entry_inside.astype('int')
    df['is_start_point'] = is_start.astype('int')
    df['is_end_point'] = is_end.astype('int')
    df['is_other_point'] = (~is_start & ~is_end).astype('int')

    df = df.drop(['time_entry', 'time_exit'], axis=1)

    df = df.sort_values(['hash', 'step']).reset_index(drop=True)

    return df

In [None]:
# Method 2 working frame: one feature row per training trajectory.
df = preprocess(df_train)

In [None]:
# Count the rows of every hash, then discard the hashes that have only one row.
hash_count = df.groupby('hash').size().reset_index(name='count')
single_row_hashes = hash_count.loc[hash_count['count'] <= 1, 'hash']
df = df[~df['hash'].isin(single_row_hashes)]

In [None]:
# Columns handed to the scaler: everything except the two identifier columns
# (this includes the x_exit / y_exit targets).
num_cols = list(df.columns.drop(['hash', 'trajectory_id']))

# Fit on this frame and rescale it in a single step; the fitted `scaler` stays
# available so that other data can be transformed consistently.
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

In [None]:
def _to_padded_sequences(frame, position):
    """Return one nested list (positions x columns) per hash.

    The frame is pivoted to one row per hash with a column per
    (column, position) pair, missing positions are filled with 0, and the
    position level is stacked back so all hashes have equally many rows.
    """
    stacked = (frame.drop(['trajectory_id'], axis=1)
                    .set_index(['hash', position])
                    .unstack(fill_value=0)
                    .stack())
    return stacked.groupby(level=0).apply(lambda seq: seq.values.tolist()).tolist()


# Inputs are all columns except the exit coordinates; targets are the exit
# coordinates of every row.
features = df.drop(['x_exit', 'y_exit'], axis=1)
targets = df[['hash', 'trajectory_id', 'x_exit', 'y_exit']]

# Position of each row inside its hash's sequence (0, 1, 2, ...).
group = features.groupby('hash').cumcount()

X = _to_padded_sequences(features, group)
y = _to_padded_sequences(targets, group)

In [None]:
# Counts of the hashes that were kept (more than one row), re-indexed 0..n-1.
full_seqlen = hash_count.loc[hash_count['count'] > 1].reset_index(drop=True)

# Number of per-step model inputs: all columns except identifiers and targets.
feature_len = len(df.columns.drop(['hash', 'trajectory_id', 'x_exit', 'y_exit']))

# Largest number of rows recorded for any single hash.
max_timestep = int(hash_count['count'].max())

In [None]:
# Random 80/20 split over sequence positions. `random` is not seeded here, so
# the partition differs from run to run.
n_sequences = len(full_seqlen)
train_size = int(n_sequences * 0.8)
train_indices = random.sample(range(n_sequences), train_size)
test_indices = list(set(range(n_sequences)) - set(train_indices))


def _gather(sequences, positions):
    """Stack the selected sequences into one array, in the order given."""
    return np.asarray([sequences[pos] for pos in positions])


X_train, X_test = _gather(X, train_indices), _gather(X, test_indices)
y_train, y_test = _gather(y, train_indices), _gather(y, test_indices)

In [None]:
# Sanity check: show the array shapes of both splits.
for label, array in (('X_train:', X_train), ('y_train:', y_train),
                     ('X_test:', X_test), ('y_test:', y_test)):
    print(label, array.shape)

In [None]:
def build_model(layers):
    """Build and compile the stacked-LSTM sequence regressor.

    Args:
        layers: Indexable with at least three entries.
            ``layers[0]`` is the per-step feature count, ``layers[1]`` the
            unit count of each of the three LSTM layers, and ``layers[2]``
            the unit count of the dense output layer.

    Returns:
        The compiled ``Sequential`` model (Adam optimiser, MSE loss; the
        'accuracy' and 'mae' metrics are tracked).
    """
    n_features, hidden_units, n_outputs = layers[0], layers[1], layers[2]
    model = Sequential()

    # Three recurrent layers, all emitting a full sequence, each with dropout.
    for _ in range(3):
        recurrent = LSTM(units=hidden_units, input_shape=(None, n_features), return_sequences=True)
        model.add(recurrent)
        model.add(Dropout(0.1))

    # Dense head applied to the sequence output of the last LSTM.
    model.add(Dense(units=n_outputs, input_shape=(None, n_features)))
    model.add(Activation('tanh'))

    compile_started = time.time()
    model.compile(optimizer='adam', loss='mse', metrics=['accuracy', 'mae'])
    print("Compilation Time : ", time.time() - compile_started)
    return model

In [None]:
# Arguments are [features per step, recurrent units, outputs]; note that
# build_model uses the second entry (max_timestep) as the LSTM unit count.
model = build_model([feature_len, max_timestep, 2])

In [None]:
# Print the layer-by-layer description of the model.
model.summary()

In [None]:
# Train for 200 epochs in batches of 512, holding back 10% of the training
# sequences for validation.
model.fit(X_train, y_train, batch_size=512, epochs=200, validation_split=0.1, verbose=1)

In [None]:
# Evaluate both splits; the first value returned by evaluate() is the loss (MSE).
trainScore = model.evaluate(X_train, y_train, verbose=0)
testScore = model.evaluate(X_test, y_test, verbose=0)

for template, score in (('Train Score: %.4f MSE (%.4f RMSE)', trainScore),
                        ('Test Score: %.4f MSE (%.4f RMSE)', testScore)):
    print(template % (score[0], math.sqrt(score[0])))

In [None]:
# Predict the unknown exit point of the incomplete test trajectories and write
# the submission file.
target_cols = ['x_exit', 'y_exit']

final_df = preprocess(df_test)
final_df[num_cols] = scaler.transform(final_df[num_cols])

# Rows with a missing value are the trajectories whose exit must be predicted.
final_indices = final_df[final_df.isnull().any(axis=1)].index

# Feed the model exactly what it was trained on: every row of each hash
# (including the incomplete one, whose entry features are known) WITHOUT the
# exit coordinates. Leaving x_exit / y_exit in would give the network two more
# features per step than it was built for.
final_group = final_df.groupby('hash').cumcount()
final_X = (final_df.drop(['trajectory_id'] + target_cols, axis=1)
                   .set_index(['hash', final_group])
                   .unstack(fill_value=0)
                   .stack().groupby(level=0)
                   .apply(lambda x: x.values.tolist())
                   .tolist())

# Every LSTM returns sequences, so the output holds one (x_exit, y_exit) pair
# per hash and per step: shape (n_hashes, n_steps, 2).
pred = model.predict(np.asarray(final_X))

# Pick the prediction belonging to each incomplete row: axis 0 follows the
# sorted-hash order of the groupby above (which ngroup reproduces), axis 1 is
# the row's position inside its hash.
final_hash_pos = final_df.groupby('hash').ngroup()
pred_exit = pred[final_hash_pos.loc[final_indices].values,
                 final_group.loc[final_indices].values]

final_result = final_df.loc[final_indices].reset_index(drop=True).copy()
final_result[target_cols] = pred_exit
# Undo the scaling so the coordinates are comparable with the X/Y bounds.
final_result[num_cols] = scaler.inverse_transform(final_result[num_cols])

final_result['target'] = get_target_value(final_result, 'x_exit', 'y_exit')

final_result[['trajectory_id', 'target']].to_csv('/tmp/submission.csv', header=['id', 'target'], index=False)

https://github.com/sheilaalemany/hurricane-rnn/blob/master/hurricane-rnn-sheils.ipynb

https://github.com/Oceanland-428/Pedestrian-Trajectories-Prediction-with-RNN/blob/master/read_data.py

https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/3_NeuralNetworks/recurrent_network.py