# EY Data Science Challenge 2019

# Feature Engineering - Flattening Trajectory Sequences

The feature engineering in this script is basic. It flattens the arbitrary-length list of trajectories for each hash code into a single row by taking information from the first and last trajectories and discarding any that occur between them.

In [176]:
import pandas as pd
import numpy as np

## Read in the raw training data

In [178]:
# Column -> dtype mapping handed to pd.read_csv: every listed velocity and
# coordinate column is read as a 64-bit float.
dtype = dict.fromkeys(
    (
        "vmax",
        "vmin",
        "vmean",
        "x_entry",
        "y_entry",
        "x_exit",
        "y_exit",
    ),
    np.float64,
)

In [179]:
# Load the training CSV; the first column of the file becomes the index.
train = pd.read_csv("data/data_train.csv", index_col=0, dtype=dtype)

# Convert both time columns to pandas datetimes so their hour/minute/second
# components can be read later.
for time_column in ("time_entry", "time_exit"):
    train[time_column] = pd.to_datetime(train[time_column])

In [180]:
# Count the non-null trajectory ids per hash, then collect the hashes that
# have at most one trajectory.
hash_groups = train.loc[:, ["hash", "trajectory_id"]].groupby("hash").count()
single_trajectory_groups = hash_groups.loc[hash_groups["trajectory_id"] <= 1]
single_trajectory_hashes = list(single_trajectory_groups.index)

In [181]:
# Drop every row whose hash is in single_trajectory_hashes, then rebuild a
# fresh positional index (reset_index keeps the old index as a column).
belongs_to_single_hash = train["hash"].isin(single_trajectory_hashes)
train = train.loc[~belongs_to_single_hash].reset_index()

## Feature engineering

In [182]:
def in_center(x, y):
    """
    Binary label for the training data: is the point (x, y) inside the
    rectangular center region?

    Both bounds of each axis are inclusive. x is checked first; y is only
    compared when x is inside its range.
    """
    x_lower, x_upper = np.float64(3750901.5068), np.float64(3770901.5068)
    y_lower, y_upper = np.float64(-19268905.6133), np.float64(-19208905.6133)

    x_inside = x_lower <= x <= x_upper
    return x_inside and (y_lower <= y <= y_upper)

In [183]:
def compute_seconds_from_midnight(timestamp):
    """
    Returns the time of day of a timestamp as whole seconds since midnight.

    Only the hour, minute and second attributes of the timestamp are read;
    the date part and any sub-second component are ignored.
    """
    minutes_since_midnight = timestamp.hour * 60 + timestamp.minute
    return minutes_since_midnight * 60 + timestamp.second

In [184]:
from math import sqrt
def compute_average_velocity(x0, y0, t0, x1, y1, t1):
    """
    Average velocity (straight-line distance per unit of time) between the
    point (x0, y0) at time t0 and the point (x1, y1) at time t1.

    Returns 0 when t0 and t1 are equal, so no division by zero can occur.
    """
    dx = x1 - x0
    dy = y1 - y0
    travelled = sqrt(dx**2 + dy**2)

    elapsed = t1 - t0
    # guard clause: a zero-length interval has no defined velocity
    if elapsed == 0:
        return 0
    return travelled / elapsed

In [185]:
def extrapolate_final_destination(x0, y0, x1, y1, t1, vel):
    """
    Uses linear extrapolation to impute the projected x, y coordinate.

    The point (x1, y1), reached at time t1, is moved along the straight-line
    heading from (x0, y0) to (x1, y1) at constant speed vel until end_time
    (15:00, expressed in seconds from midnight).

    Parameters:
        x0, y0: start coordinate; together with (x1, y1) it defines the heading.
        x1, y1: coordinate at time t1, the point being extrapolated.
        t1:     time at (x1, y1), in seconds from midnight.
        vel:    speed in distance units per second.

    Returns:
        Tuple (x_proj, y_proj). This equals (x1, y1) when the start and end
        points coincide, because no heading is defined in that case. If t1 is
        later than end_time the travelled length is negative, so the point is
        projected backwards along the heading.
    """
    # local import keeps this cell independent of imports made in other cells
    from math import hypot

    end_time = 15 * 3600
    seg_length = (end_time - t1) * vel

    dx = x1 - x0
    dy = y1 - y0
    heading_length = hypot(dx, dy)
    if heading_length == 0:
        # no direction to extrapolate along; stay at the last known point
        return x1, y1

    # scale the heading vector so that its length equals seg_length
    scale = seg_length / heading_length
    x_proj = x1 + dx * scale
    y_proj = y1 + dy * scale
    return x_proj, y_proj

In [186]:
def process_hash_group(hash_group, is_train=True):
    """
    Flattens the trajectories of a single hash group into one feature row.

    Only the first and the last row of hash_group are used:
        x0, y0, t0      entry coordinate and entry time (seconds from
                        midnight) of the first trajectory
        x1, y1, t1      exit coordinate and exit time (seconds from
                        midnight) of the last trajectory
        vel             average velocity between (x0, y0, t0) and (x1, y1, t1)
        x_proj, y_proj  projected destination returned by
                        extrapolate_final_destination
        in_center       only when is_train=True: whether the exit coordinate
                        of the last trajectory lies in the city center

    Not implemented yet (TODO): distance and angle between (x1, y1) and the
    city center.

    Returns a single-row pandas DataFrame with one column per feature.
    """
    first_row = hash_group.iloc[0]
    last_row = hash_group.iloc[-1]

    # start of the journey
    x0, y0 = first_row["x_entry"], first_row["y_entry"]
    t0 = compute_seconds_from_midnight(first_row["time_entry"])

    # end of the journey
    x1, y1 = last_row["x_exit"], last_row["y_exit"]
    t1 = compute_seconds_from_midnight(last_row["time_exit"])

    vel = compute_average_velocity(x0, y0, t0, x1, y1, t1)
    x_proj, y_proj = extrapolate_final_destination(x0, y0, x1, y1, t1, vel)

    features = {
        "x0": x0,
        "y0": y0,
        "t0": t0,
        "x1": x1,
        "y1": y1,
        "t1": t1,
        "vel": vel,
        "x_proj": x_proj,
        "y_proj": y_proj,
    }

    # the label only exists for training data
    if is_train:
        features["in_center"] = in_center(last_row["x_exit"], last_row["y_exit"])

    # wrap every scalar in a one-element list so each feature becomes a
    # column of length one
    single_row = {name: [value] for name, value in features.items()}
    return pd.DataFrame.from_dict(single_row)

## Main function

In [187]:
def process_data(df, is_train=True):
    """
    Builds the flattened feature table: one row per hash group in df.

    Parameters:
        df:       DataFrame of trajectories with a "hash" column identifying
                  the group each row belongs to.
        is_train: forwarded to process_hash_group; when True the label column
                  is computed as well.

    Returns:
        DataFrame with one row per hash, in order of first appearance of the
        hash in df, indexed 0..n-1. An empty DataFrame is returned when df
        contains no groups. Rows whose hash is missing are skipped, because
        groupby drops NaN keys by default.
    """
    # A single groupby pass replaces re-filtering the whole frame once per
    # hash; sort=False keeps groups in order of first appearance, matching
    # the order of df["hash"].unique().
    feature_rows = [
        process_hash_group(hash_group, is_train)
        for _, hash_group in df.groupby("hash", sort=False)
    ]

    # pd.concat raises on an empty list, so handle "no groups" explicitly
    if not feature_rows:
        return pd.DataFrame()

    # One concat at the end avoids the quadratic copying of repeated
    # DataFrame.append calls (append was removed in pandas 2.0).
    return pd.concat(feature_rows, ignore_index=True)

In [188]:
# Flatten the training trajectories into one feature row per hash group
# (is_train defaults to True, so the label is computed as well).
train_fe = process_data(train)

KeyboardInterrupt: 

In [None]:
# Write the engineered training features to a CSV file.
train_fe.to_csv("data/data_train_fe.csv")