In [2]:
from __future__ import print_function

from IPython import display

import math
import matplotlib
import sklearn
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from numpy import radians, cos, sin, arcsin, arccos, sqrt, pi, arctan2, degrees, arctan
import itertools
from datetime import datetime
from scipy import signal,ndimage, misc, stats
 
from tqdm import tqdm, tqdm_notebook
tqdm.pandas()
tqdm.pandas(tqdm_notebook)

import osrm
from joblib import dump, load

from natsort import natsorted

from xgboost import XGBRegressor

pd.options.display.max_rows = 10

In [36]:
def haversine(lat1, lon1, lat2, lon2):
    #ensure using numpy and not math, or pandas series cannot be passed
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * arcsin(sqrt(a))
    r = 6378.137 ##radius of earth km
    return c * r

def compute_dist(df):
    next_df = df.shift(1)
    dist = haversine(df.iloc[:,0], df.iloc[:,1],
                    next_df.iloc[:,0], next_df.iloc[:,1])
    return dist

def compute_time(df):
    next_df = df.shift(1)
#     df["time"] = pd.to_datetime(df["time_utc"], format="%Y-%m-%d %H:%M:%S.%f", errors='raise')
    timedelt = df["time"] - next_df["time"]
    return timedelt

def compute_speed(df):
    kinematics = df.copy()
    kinematics["distance_travelled"] = compute_dist(kinematics[["latitude", "longitude"]].astype(float)).values
    kinematics["time_elapsed"] = compute_time(kinematics).values
    kinematics["time_elapsed_seconds"] = kinematics["time_elapsed"]/np.timedelta64(1,'s')
    kinematics["speed m/s"] = (kinematics["distance_travelled"]*1000)/kinematics["time_elapsed_seconds"]
    kinematics["speed kmh"] = kinematics["speed m/s"]*3.6
    kinematics.drop(columns = ['time_elapsed'], inplace = True)
    kinematics.fillna(0, inplace = True)
#     df["distance_travelled"] = kinematics["distance_travelled"].values
#     df["speed kmh"] = kinematics["speed kmh"].values
    return kinematics

def cal_bearing(lat1, lon1, lat2, lon2):
    """
    Calculates the bearing between two points using the formula
        θ = atan2(sin(Δlong).cos(lat2),
                  cos(lat1).sin(lat2) − sin(lat1).cos(lat2).cos(Δlong))
    """
    
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    dlon = lon2 - lon1

    x = sin(dlon) * cos(lat2)
    y1 = cos(lat1) * sin(lat2)
    y2 = sin(lat1) * cos(lat2) * cos(dlon)
    y = y1 - y2

    initial_bearing = arctan2(x, y)

    initial_bearing = degrees(initial_bearing)
    compass_bearing = (initial_bearing + 360) % 360

    return compass_bearing

In [37]:
chunk = 0
taxi = []
df =  pd.read_csv('/mnt/hgfs/FYP/porto_cleaned_100000.csv', parse_dates = ['time'])

In [40]:
df

Unnamed: 0.1,Unnamed: 0,time,longitude,latitude,ID
0,0,2013-07-01 00:00:58,-8.618643,41.141412,20000589
1,1,2013-07-01 00:01:28,-8.620326,41.142510,20000589
2,2,2013-07-01 00:01:43,-8.622153,41.143815,20000589
3,3,2013-07-01 00:01:58,-8.623953,41.144373,20000589
4,4,2013-07-01 00:02:13,-8.626680,41.144778,20000589
...,...,...,...,...,...
4163115,25,2013-07-21 18:59:42,-8.603550,41.168781,20000174
4163116,26,2013-07-21 18:59:57,-8.603424,41.169879,20000174
4163117,27,2013-07-21 19:00:12,-8.602002,41.169834,20000174
4163118,28,2013-07-21 19:00:27,-8.600670,41.169780,20000174


In [38]:
grouper = df.groupby('ID')

In [None]:
df.groupby('ID').groups

In [39]:
for name, val in grouper:
    idx_grper = val.groupby(val['Unnamed: 0'].diff().ne(1).cumsum()) #cause of the repeating indexes
    for name2, taxi in idx_grper:
        compute_speed(taxi)
        i = 30
        '''
        For each window, randomly select one row/time period of the vessel's journey
        The selected time frame is NOT consistent among the changing window sizes.
        '''
        i = str(i)
        avg = taxi.resample(i+'T', on='time').mean().add_suffix('_mean' + i) #downsample the dataframe
         
        break
    break

TypeError: unsupported operand type(s) for -: 'str' and 'float'

In [32]:
taxi

Unnamed: 0.1,Unnamed: 0,time,longitude,latitude,ID
24648,0,2013-07-01 07:06:43,-8.584353,41.163174,20000001
24649,1,2013-07-01 07:06:58,-8.585289,41.162994,20000001
24650,2,2013-07-01 07:07:13,-8.587512,41.163678,20000001
24651,3,2013-07-01 07:07:43,-8.589024,41.164155,20000001
24652,4,2013-07-01 07:07:58,-8.589024,41.164146,20000001
...,...,...,...,...,...
24661,13,2013-07-01 07:10:13,-8.598744,41.161158,20000001
24662,14,2013-07-01 07:10:28,-8.601471,41.161536,20000001
24663,15,2013-07-01 07:10:43,-8.603577,41.161824,20000001
24664,16,2013-07-01 07:10:58,-8.603694,41.161833,20000001
