In [None]:
!pip install mpu



In [None]:
import os
import pandas as pd
import numpy as np
import mpu
import matplotlib.pyplot as plt
%matplotlib inline 

In [None]:
def find_routes_with_ten_readings(df, route_numbers, min_num_readings=10, verbose=False):
    """
    Keep only the rows of routes that have at least `min_num_readings` readings.

    Parameters:
        df: DataFrame with a 'route_number' column.
        route_numbers: iterable of route numbers to check; kept routes appear
            in the result in this order.
        min_num_readings: minimum number of rows a route needs in order to be kept.
        verbose: when True, print one line for every route that is dropped.

    Returns:
        A DataFrame with the rows of every kept route, concatenated in
        `route_numbers` order. When no route qualifies, an empty DataFrame
        with the same columns as `df`.
    """
    routes = []

    for number in route_numbers:
        route_df = df[df['route_number'] == number]

        if len(route_df) >= min_num_readings:
            routes.append(route_df)
        elif verbose:
            print('Route: ', number, ' only has ', len(route_df), ' readings!')

    print('Found', len(routes), 'routes that have', min_num_readings, 'or more readings')

    if not routes:
        # pd.concat raises ValueError on an empty list; return an empty frame
        # that still carries the input's columns instead.
        return df.iloc[0:0]

    return pd.concat(routes)


def load_csv_as_df(file_name, sub_directories, column_numbers=None, column_names=None):
    '''
    Load a csv of route readings as a pandas dataframe and drop short routes.

    The file is looked up relative to the current working directory:
    <cwd>/<sub_directories>/<file_name>. Leading and trailing path separators
    on `sub_directories` (and leading ones on `file_name`) are optional, so
    '/', '/data/' and 'data' are all accepted.

    Parameters:
        file_name: name of the csv file.
        sub_directories: directories between the working directory and the file
            ('/' or '' when the file sits in the working directory).
        column_numbers: optional columns to read, passed to read_csv as `usecols`.
        column_names: optional replacement column names, assigned after loading.

    Returns:
        The loaded rows restricted to routes with 10 or more readings.
        The data must contain a 'route_number' column (after any renaming).
    '''
    # Strip separators before joining: os.path.join would treat a piece that
    # starts with a separator as absolute and discard the working directory.
    separators = '/' + os.sep
    full_path = os.path.join(
        os.getcwd(),
        sub_directories.strip(separators),
        file_name.lstrip(separators),
    )

    # usecols=None (the default) makes read_csv load every column.
    df = pd.read_csv(full_path, usecols=column_numbers)

    if column_names is not None:
        df.columns = column_names

    route_ids = df['route_number'].unique()
    return find_routes_with_ten_readings(df, route_ids, min_num_readings=10)

def lookup(s):
    """
    Parse a Series of dates quickly by parsing each distinct value only once.

    Large inputs tend to repeat the same dates, so every unique value is
    converted with pd.to_datetime a single time and the results are mapped
    back onto the full Series.
    """
    distinct_values = s.unique()
    parsed_by_raw_value = dict(zip(distinct_values, map(pd.to_datetime, distinct_values)))
    return s.map(parsed_by_raw_value)


In [None]:
def remove_routes_with_corrupt_start_end_times_and_calc_duration(df):
    """
    Drop routes without a usable start/end reading and compute route durations.

    A route is kept only when it has at least one row flagged 'route_start',
    at least one row flagged 'route_end', and the time of its first start row
    is strictly earlier than the time of its first end row. Routes missing a
    start or an end are reported with a printed message; routes whose start is
    not before their end are dropped without a message.

    Side effect: the 'time' column of `df` is replaced in place by its parsed
    datetime values.

    Returns:
        (duration_df, filtered_df): `duration_df` has one row per kept route
        with columns 'route_number' and 'duration_in_seconds'; `filtered_df`
        holds the rows of `df` that belong to kept routes.
    """
    df['time'] = lookup(df['time'])

    seconds_by_route = {}
    kept_route_ids = []

    for route_id in df['route_number'].unique():
        readings = df[df['route_number'] == route_id]
        start_times = readings[readings['route_start'] == True]['time']
        end_times = readings[readings['route_end'] == True]['time']

        missing_start = len(start_times) == 0
        missing_end = len(end_times) == 0

        if missing_start:
            print('No start for route: ', route_id)
        if missing_end:
            print('No end for route: ', route_id)
        if missing_start or missing_end:
            continue

        # Only the first flagged start/end reading of the route is considered.
        began_at = start_times.iloc[0]
        finished_at = end_times.iloc[0]
        if not began_at < finished_at:
            continue

        seconds_by_route[route_id] = (finished_at - began_at).total_seconds()
        kept_route_ids.append(route_id)

    duration_df = pd.DataFrame(
        list(seconds_by_route.items()),
        columns=['route_number', 'duration_in_seconds'],
    )
    kept_rows = df[df['route_number'].isin(kept_route_ids)]
    return duration_df, kept_rows


In [None]:
# The distance between two GPS points is computed with the haversine formula:
"""dlon = lon2 - lon1
dlat = lat2 - lat1
a = (sin(dlat/2))^2 + cos(lat1) * cos(lat2) * (sin(dlon/2))^2
c = 2 * atan2( sqrt(a), sqrt(1-a) )
d = R * c (where R is the radius of the Earth)"""

'dlon = lon2 - lon1\ndlat = lat2 - lat1\na = (sin(dlat/2))^2 + cos(lat1) * cos(lat2) * (sin(dlon/2))^2\nc = 2 * atan2( sqrt(a), sqrt(1-a) )\nd = R * c (where R is the radius of the Earth)'

In [None]:
def distance_between_gps(gps_one, gps_two):
    """
    Return the haversine distance between two GPS points.

    Each argument is indexed as (latitude, longitude); only the first two
    elements are used. The value comes from mpu.haversine_distance
    (presumably kilometres, going by the `km_distance` name - confirm against
    the mpu documentation).
    """
    # mpu.haversine_distance((lat1, lon1), (lat2, lon2))
    origin = (gps_one[0], gps_one[1])
    destination = (gps_two[0], gps_two[1])
    km_distance = mpu.haversine_distance(origin, destination)

    if not km_distance < 0:
        return km_distance

    # Defensive guard: a negative distance is not expected; report it and flip the sign.
    print('got negative distance that\'s weak')
    return km_distance * -1

In [None]:
def remove_routes_with_excessive_distances(df, max_distance_km=100):
    """
    Drop every route whose total travelled distance reaches `max_distance_km`.

    For each route the readings are ordered by 'time' and the distance between
    each pair of consecutive readings (via distance_between_gps) is summed.
    Routes that are dropped are reported with a printed message.

    Side effect: the 'time' column of `df` is replaced in place by its parsed
    datetime values.

    Parameters:
        df: DataFrame with 'route_number', 'time', 'latitude' and 'longitude'
            columns.
        max_distance_km: a route is kept only when its summed distance is
            strictly below this value (default 100, the previously hard-coded
            limit).

    Returns:
        The rows of `df` that belong to the kept routes.
    """
    proper_route_numbers = []
    df['time'] = lookup(df['time'])

    for route_id in df['route_number'].unique():
        route_df = df[df['route_number'] == route_id]
        # sort_values returns a new frame, so the result must be kept.
        # mergesort is stable: readings sharing a timestamp keep their order.
        route_df = route_df.sort_values('time', kind='mergesort')

        # Pair consecutive readings by position rather than by index label, so
        # non-contiguous or duplicated labels cannot break the previous-row lookup.
        points = list(zip(route_df['latitude'], route_df['longitude']))

        distance_sum = 0.0
        for last_gps, current_gps in zip(points, points[1:]):
            distance_sum += distance_between_gps(last_gps, current_gps)

        if distance_sum < max_distance_km:
            proper_route_numbers.append(route_id)
        else:
            print('Route ', route_id, ' has excessive distance: ', distance_sum)

    return df[df['route_number'].isin(proper_route_numbers)]

In [None]:
# Load both route files from the working directory; routes with fewer than
# 10 readings are dropped while loading.
north_train_df = load_csv_as_df(
    file_name='north-to-west-routes-no-grids.csv',
    sub_directories='/',
)
west_train_df = load_csv_as_df(
    file_name='west-to-north-routes-no-grids.csv',
    sub_directories='/',
)


Found 207 routes that have 10 or more readings
Found 289 routes that have 10 or more readings


In [None]:
# Drop routes without a valid start/end reading and keep the per-route durations.
north_train_duration, north_train_df = (
    remove_routes_with_corrupt_start_end_times_and_calc_duration(north_train_df)
)
west_train_duration, west_train_df = (
    remove_routes_with_corrupt_start_end_times_and_calc_duration(west_train_df)
)

In [None]:
# Number of distinct routes left in each data set (north first, then west).
print(
    len(north_train_df["route_number"].unique()),
    len(west_train_df["route_number"].unique()),
    sep='\n',
)

197
283


In [None]:
# Bounding box of the gridded area, in degrees.
min_lat, max_lat = 22.0, 23.0
min_long, max_long = 113, 115

# Extent of the bounding box along each axis.
diff_in_latitude = max_lat - min_lat
diff_in_longitude = max_long - min_long

# The latitude extent is split into 20 rows; this gives the edge length of one cell.
cell_size = diff_in_latitude / 20.0

In [None]:
def concat(row, col):
    """Build a cell label of the form '<row>-<col>' from the two values."""
    return '-'.join((str(row), str(col)))


def map_gps_to_box(latitude, longitude):
    """
    Map a GPS position to its grid cell.

    Rows count cells of `cell_size` degrees upward from `min_lat`; columns
    count cells of the same size from `min_long` (module-level settings).

    Returns:
        (cell_label, row_number, col_number), where cell_label is
        '<row>-<col>'. When the position lies below `min_lat` or `min_long`,
        (-1, -1, -1) is returned instead.

    NOTE(review): positions beyond the upper bounds are not rejected; they map
    to row/column numbers past the intended grid - confirm this is wanted.
    """
    row_number = int((latitude - min_lat) // cell_size)
    col_number = int((longitude - min_long) // cell_size)

    if col_number < 0 or row_number < 0:
        return -1, -1, -1

    return concat(row_number, col_number), row_number, col_number

In [None]:
def map_gps_to_cell(df):
    """
    Add the grid cell of every reading to `df`.

    Each row's 'latitude' and 'longitude' are passed to map_gps_to_box and the
    results are stored in three new columns: 'cell' (label), 'row' and
    'column'. The frame is modified in place and also returned.
    """
    boxes = [
        map_gps_to_box(lat, long)
        for lat, long in zip(df['latitude'], df['longitude'])
    ]

    df['cell'] = [cell_number for cell_number, _, _ in boxes]
    df['row'] = [cell_row for _, cell_row, _ in boxes]
    df['column'] = [cell_col for _, _, cell_col in boxes]

    return df

In [None]:
# Add the 'cell', 'row' and 'column' grid columns to both data sets.
north_train_df = map_gps_to_cell(north_train_df)
west_train_df = map_gps_to_cell(west_train_df)

In [None]:
# Preview the first west-to-north readings, including the new grid columns.
west_train_df.head()

Unnamed: 0,taxi_id,time,longitude,latitude,occupancy_status,speed,route_number,route_start,route_end,cell,row,column
0,22262,2018-12-08 09:42:28,114.121231,22.547068,1,5,500264,True,False,10-22,10,22
1,22262,2018-12-08 09:42:58,114.123497,22.547533,1,27,500264,False,False,10-22,10,22
2,22262,2018-12-08 09:43:26,114.123619,22.5478,1,14,500264,False,False,10-22,10,22
3,22262,2018-12-08 09:43:29,114.123581,22.547783,1,0,500264,False,False,10-22,10,22
4,22262,2018-12-08 09:43:32,114.123535,22.547783,1,0,500264,False,False,10-22,10,22


In [None]:
# Preview the first north-to-west readings, including the new grid columns.
north_train_df.head()

Unnamed: 0,taxi_id,time,longitude,latitude,occupancy_status,speed,route_number,route_start,route_end,cell,row,column
0,22391,2018-12-08 22:02:44,114.026115,22.6106,1,43,501872,True,False,12-20,12,20
1,22391,2018-12-08 22:03:04,114.023849,22.6092,1,50,501872,False,False,12-20,12,20
2,22391,2018-12-08 22:03:24,114.022003,22.607018,1,58,501872,False,False,12-20,12,20
3,22391,2018-12-08 22:03:44,114.023666,22.604549,1,58,501872,False,False,12-20,12,20
4,22391,2018-12-08 22:04:04,114.025284,22.602533,1,49,501872,False,False,12-20,12,20


In [None]:
# Write both data sets, now carrying the grid columns, to csv files in the
# working directory (the DataFrame index is not written).
north_train_df.to_csv('north-to-west-routes-with-cells.csv', encoding='utf-8', index=False)
west_train_df.to_csv('west-to-north-routes-with-cells.csv', encoding='utf-8', index=False)
