In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline  

In [77]:
def load_csv_as_df(file_name, sub_directories, col_name=None):
    '''
    Load a csv located under the current working directory as a pandas dataframe.

    Parameters
    ----------
    file_name : str
        Name of the csv file.
    sub_directories : str
        Directory path between the working directory and the file, e.g. '/' or
        '/data/'. Leading and trailing path separators are optional.
    col_name : str, optional
        When given, only this column is read and the frame is returned as-is
        (no renaming, nothing dropped).

    Returns
    -------
    pandas.DataFrame
        With col_name: the single requested column.
        Otherwise: the whole file with its columns renamed positionally to the
        trajectory schema below and the 'old_index' column removed. The file
        must therefore contain exactly ten columns.
    '''
    # os.path.join inserts separators itself, so a sub_directories value without
    # a leading/trailing slash (or an empty one) no longer glues two path
    # components together. Separators are stripped first because a component
    # starting with a slash would make os.path.join discard everything before it.
    full_path = os.path.join(
        os.getcwd(),
        sub_directories.strip('/\\'),
        file_name.lstrip('/\\'),
    )

    if col_name is not None:
        return pd.read_csv(full_path, usecols=[col_name])

    col_names = ['old_index', 'taxi_id', 'time', 'longitude', 'latitude', 'occupancy_status', 'speed', 'route_number', 'route_start', 'route_end']
    df = pd.read_csv(full_path)
    df.columns = col_names
    # Return a new frame instead of mutating in place.
    return df.drop(columns='old_index')


def lookup(s):
    """
    Parse a Series of date values through a cache of its distinct entries.

    Each distinct value in `s` is converted once with pd.to_datetime and the
    resulting table is mapped back over the Series, so repeated dates are not
    parsed again. Returns a Series aligned with `s`.
    """
    parsed_by_raw = {}
    for raw_value in s.unique():
        parsed_by_raw[raw_value] = pd.to_datetime(raw_value)
    return s.map(parsed_by_raw)


def label_trajectories(df, max_taxis=10):
    """
    Label every row with the passenger trip (trajectory) it belongs to.

    Rows are processed taxi by taxi, in the order they appear in `df`. A trip
    starts on the first row whose 'occupancy_status' is truthy and ends on the
    first following row whose status is falsy; that closing row is still part
    of the trip.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'taxi_id' and 'occupancy_status' columns.
    max_taxis : int or None, optional
        Only the first `max_taxis` distinct taxi ids are processed. The default
        of 10 matches the previously hard-coded limit; pass None to label all.

    Returns
    -------
    pandas.DataFrame
        The rows of the processed taxis, grouped by taxi, with three columns
        added: 'route_number' (trip id, or -1 when the row is outside any trip),
        'route_start' and 'route_end' (True on a trip's first / closing row).
        `df` itself is not modified.
    """
    updated_dfs = []
    taxi_ids = df['taxi_id'].unique()
    print('There are ', len(taxi_ids), ' in this data')
    empty_route = -1
    trajectory_number = 1

    completed_count = 0

    selected_taxi_ids = taxi_ids if max_taxis is None else taxi_ids[:max_taxis]

    for taxi_id in selected_taxi_ids:
        # Work on an explicit copy so adding the label columns below never
        # writes into (or warns about) a view of the caller's frame.
        taxi_df = df.loc[df['taxi_id'] == taxi_id].copy()
        passenger_got_in = False
        route_numbers = []
        route_starts = []
        route_ends = []

        # Only the occupancy flag is needed, so iterate that column directly
        # instead of materialising every row with iterrows().
        for passenger_in_taxi in taxi_df['occupancy_status']:
            if passenger_in_taxi:
                # First occupied row opens the trip; later ones continue it.
                route_starts.append(not passenger_got_in)
                route_ends.append(False)
                route_numbers.append(trajectory_number)
                passenger_got_in = True
            elif passenger_got_in:
                # Passenger left: this row closes the current trip.
                route_starts.append(False)
                route_ends.append(True)
                route_numbers.append(trajectory_number)
                trajectory_number += 1
                passenger_got_in = False
            else:
                route_starts.append(False)
                route_ends.append(False)
                route_numbers.append(empty_route)

        if passenger_got_in:
            # The taxi's data ended mid-trip. Advance the counter anyway so the
            # next taxi's first trip does not reuse this trip's number.
            trajectory_number += 1

        taxi_df['route_number'] = route_numbers
        taxi_df['route_start'] = route_starts
        taxi_df['route_end'] = route_ends
        updated_dfs.append(taxi_df)
        completed_count += 1

        if completed_count % 100 == 0:
            print('Completed ', completed_count, ' taxi_ids out of ', len(taxi_ids))

    return pd.concat(updated_dfs)

In [12]:
# Load the labelled taxi data that sits directly in the working directory.
df = load_csv_as_df(file_name='Better_taxi_data.csv', sub_directories='/')

In [7]:
# df['time'] = lookup(df['time'])

In [9]:
# df.sort_values(by=['time'], inplace=True)

In [13]:
# Preview the first rows to sanity-check the loaded columns.
# NOTE(review): the recorded output below lists columns ('Unnamed: 0.1', 'Unnamed: 0', ...,
# 'latitude', 'longitude') that differ from the names load_csv_as_df assigns —
# presumably it comes from an earlier run; re-run to confirm.
df.head()

Unnamed: 0.1,Unnamed: 0,taxi_id,time,latitude,longitude,occupancy_status,speed,route_number,route_start,route_end
0,13446550,26254,2018-10-31 00:00:00,114.026871,22.74625,0,61,-1,False,False
1,13447104,26254,2018-10-31 00:00:15,114.026733,22.743851,0,69,-1,False,False
2,13445774,26254,2018-10-31 00:00:30,114.027702,22.741716,0,72,-1,False,False
3,13445518,26254,2018-10-31 00:00:45,114.028419,22.74015,0,2,-1,False,False
4,13447364,26254,2018-10-31 00:01:00,114.028419,22.74015,0,0,-1,False,False


In [30]:
route_nums = df.route_number.unique()

# Slice rather than index positions 0..4, so a frame with fewer than five
# distinct route numbers is previewed without raising IndexError.
for route_number in route_nums[:5]:
    print(route_number)

-1
1
2
3
4


In [58]:
def find_trajectories_at_airport_or_bus(df):
    """
    Return the route numbers of trips that run between the airport and the bus station.

    A trip qualifies when its start row is near the airport and its end row is
    near the bus station, or the other way round (see near_airport and
    near_bus_station). Route number -1 is skipped. Trips that lack a start row
    or an end row cannot be classified and are skipped as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'latitude', 'longitude', 'route_number', 'route_start'
        and 'route_end' columns.

    Returns
    -------
    list
        Matching route numbers, in order of first appearance in `df`.
    """
    relevant_route_numbers = []
    route_numbers = df['route_number'].unique()

    # Count real trips only; -1 is not guaranteed to be present in the data.
    number_of_trajectories = int((route_numbers != -1).sum())

    # Extract the start and end rows once and index them by route number, so
    # the loop does a lookup per route instead of rescanning the whole frame.
    # drop_duplicates keeps the first row per route, matching the previous
    # `.iloc[0]` choice.
    endpoint_cols = ['latitude', 'longitude', 'route_number']
    starts = (
        df.loc[df['route_start'] == True, endpoint_cols]
        .drop_duplicates('route_number')
        .set_index('route_number')
    )
    ends = (
        df.loc[df['route_end'] == True, endpoint_cols]
        .drop_duplicates('route_number')
        .set_index('route_number')
    )

    for route_number in route_numbers:
        if route_number == -1:
            continue

        if route_number % 100 == 0:
            print('On route number ', route_number, ' out of ', number_of_trajectories)

        # A trip still open at the end of a taxi's data has no end row;
        # previously this raised IndexError.
        if route_number not in starts.index or route_number not in ends.index:
            continue

        start_lat = starts.at[route_number, 'latitude']
        start_long = starts.at[route_number, 'longitude']

        end_lat = ends.at[route_number, 'latitude']
        end_long = ends.at[route_number, 'longitude']

        if near_airport(start_lat, start_long) and near_bus_station(end_lat, end_long):
            relevant_route_numbers.append(route_number)
        elif near_bus_station(start_lat, start_long) and near_airport(end_lat, end_long):
            relevant_route_numbers.append(route_number)

    return relevant_route_numbers


def near_airport(lat, long):
    """Return True when (lat, long) lies inside the airport bounding box, edges included."""
    if not (22.605770 <= lat <= 22.667089):
        return False
    if not (113.784647 <= long <= 113.837340):
        return False
    return True


def near_bus_station(lat, long):
    """Return True when (lat, long) lies inside the bus-station bounding box, edges included."""
    if not (22.567210 <= lat <= 22.568807):
        return False
    if not (114.089676 <= long <= 114.091320):
        return False
    return True

In [95]:
# Separate the rows that open a trajectory from the rows that close one.
starts_df = df.loc[df['route_start'] == True]
ends_df = df.loc[df['route_end'] == True]

In [100]:
# Trip starts inside the airport bounding box (bounds inclusive).
air_starts = starts_df[
    starts_df['latitude'].between(22.605770, 22.667089)
    & starts_df['longitude'].between(113.784647, 113.837340)
]

In [101]:
# Trip starts inside the bus-station bounding box (bounds inclusive).
bus_starts = starts_df[
    starts_df['latitude'].between(22.567210, 22.568807)
    & starts_df['longitude'].between(114.089676, 114.091320)
]

In [102]:
# Report how many trip-start rows fall inside each bounding box.
print('There are ', len(air_starts), ' routes that start at the airport')
print('There are ', len(bus_starts), ' routes that start at the bus station')

There are  9349  routes that start at the airport
There are  33  routes that start at the bus station


In [115]:
airport_route_numbers = air_starts['route_number'].unique()

# Keep only the closing row of every trip that started at the airport,
# then look at the per-column modes to find the most common end.
airport_routes_df = df[
    df['route_number'].isin(airport_route_numbers) & (df['route_end'] == True)
]
airport_routes_df.mode()

Unnamed: 0,old_index,taxi_id,time,longitude,latitude,occupancy_status,speed,route_number,route_start,route_end
0,6274,36341.0,2018-10-31 11:12:54,114.077866,22.506468,0.0,0.0,2,False,True
1,14964,,,,,,,4,,
2,19645,,,,,,,5,,
3,22505,,,,,,,44,,
4,23169,,,,,,,45,,
5,24248,,,,,,,109,,
6,24697,,,,,,,124,,
7,25197,,,,,,,365,,
8,25395,,,,,,,371,,
9,26457,,,,,,,411,,


In [116]:
# Count the distinct drop-off longitudes among trips that started at the airport.
airport_routes_long_list = airport_routes_df['longitude'].unique()
print(len(airport_routes_long_list))

7185


In [117]:
# Drop-off rows whose latitude (resp. longitude) is shared with at least one other row.
duplicate_drop_off_lat = airport_routes_df.loc[
    airport_routes_df.duplicated(subset=['latitude'], keep=False)
]
duplicate_drop_off_long = airport_routes_df.loc[
    airport_routes_df.duplicated(subset=['longitude'], keep=False)
]

In [121]:
# Per-column modes of the drop-off rows that share a longitude with another row.
# The commented-out lines are alternative views of the same two frames.
# print(duplicate_drop_off_lat)
# duplicate_drop_off_lat.mode()
# print(duplicate_drop_off_long)
duplicate_drop_off_long.mode()

Unnamed: 0,old_index,taxi_id,time,longitude,latitude,occupancy_status,speed,route_number,route_start,route_end
0,6274,36341.0,2018-10-31 19:48:48,114.077866,22.506468,0.0,0.0,44,False,True
1,19645,,,,,,,109,,
2,25197,,,,,,,124,,
3,37392,,,,,,,371,,
4,37679,,,,,,,411,,
5,43669,,,,,,,427,,
6,45009,,,,,,,432,,
7,54049,,,,,,,788,,
8,56136,,,,,,,849,,
9,76030,,,,,,,897,,


In [103]:
# Trip ends inside the airport bounding box (bounds inclusive).
air_ends = ends_df[
    ends_df['latitude'].between(22.605770, 22.667089)
    & ends_df['longitude'].between(113.784647, 113.837340)
]
# Trip ends inside the bus-station bounding box (bounds inclusive).
bus_ends = ends_df[
    ends_df['latitude'].between(22.567210, 22.568807)
    & ends_df['longitude'].between(114.089676, 114.091320)
]

In [105]:
# Report how many trip-end rows fall inside each bounding box.
print('There are ', len(air_ends), ' routes that end at the airport')
print('There are ', len(bus_ends), ' routes that end at the bus station')

There are  8435  routes that end at the airport
There are  24  routes that end at the bus station


In [106]:
air_start_route_numbers = air_starts['route_number'].unique()
bus_end_route_numbers = bus_ends['route_number'].unique()

# Routes that both start at the airport and end at the bus station.
air_to_bus_route_numbers = list(
    set(air_start_route_numbers).intersection(set(bus_end_route_numbers))
)
print(air_to_bus_route_numbers)

[]


In [107]:
# Trip ends inside the train-station bounding box (bounds inclusive).
train_ends = ends_df[
    ends_df['latitude'].between(22.605591, 22.613606)
    & ends_df['longitude'].between(114.023595, 114.034006)
]
print('There are ', len(train_ends), ' routes that end at the train station')

There are  3207  routes that end at the train station


In [122]:
# Box drawn around the most common drop-off point found above
# (longitude 114.077866, latitude 22.506468); bounds inclusive.
park_ends = ends_df[
    ends_df['latitude'].between(22.49, 22.52)
    & ends_df['longitude'].between(114.06, 114.08)
]
print('There are ', len(park_ends), ' routes that end at the parking lot')

There are  3868  routes that end at the parking lot


In [108]:
air_start_route_numbers = air_starts['route_number'].unique()
train_end_route_numbers = train_ends['route_number'].unique()

# Routes that both start at the airport and end at the train station.
air_to_train_route_numbers = list(
    set(air_start_route_numbers).intersection(set(train_end_route_numbers))
)
print(air_to_train_route_numbers)

[320448, 321890, 311971, 23971, 476517, 499458, 375463, 466818, 387978, 283883, 452844, 524975, 99858, 298132, 396373, 109210]


In [109]:
# Trip starts inside the train-station bounding box (bounds inclusive).
train_starts = starts_df.query('latitude>=22.605591 and latitude<=22.613606 and longitude>=114.023595 and longitude<=114.034006')
# These rows come from starts_df, so the message reports starts, not ends.
print('There are ', len(train_starts), ' routes that start at the train station')

There are  2517  routes that end at the train station


In [110]:
air_end_route_numbers = air_ends['route_number'].unique()
train_start_route_numbers = train_starts['route_number'].unique()

# Routes that both start at the train station and end at the airport.
train_to_air_route_numbers = list(
    set(train_start_route_numbers).intersection(set(air_end_route_numbers))
)
print(train_to_air_route_numbers)


[366114, 190531, 168676, 493893, 296646, 104709, 97736, 437641, 53743, 254609, 403186, 95921, 470614, 132921, 506810, 24956, 158783]


In [111]:
# Pool both directions of airport <-> train-station trips. Comparing the list
# length with the set size shows whether any route number appears in both.
all_relevant_routes = [*air_to_train_route_numbers, *train_to_air_route_numbers]
rel_set = set(all_relevant_routes)
print(len(all_relevant_routes))
print(len(rel_set))

33
33


In [None]:
# Keep every row that belongs to one of the selected trips and write it to disk.
# The selected route numbers are held in `all_relevant_routes`; the name
# `relevant_routes` used here before is not defined in the visible notebook
# and raised NameError.
relevant_df = df[df.route_number.isin(all_relevant_routes)]
relevant_df.to_csv('RelevantTrajectories.csv', encoding='utf-8')

In [38]:
# Spot-check a single trip: pull the start row and the end row of route 2.
relevant_cols = ['latitude', 'longitude', 'route_number', 'route_start']
start_row = df.loc[(df['route_start'] == True) & (df['route_number'] == 2), relevant_cols]
end_row = df.loc[(df['route_end'] == True) & (df['route_number'] == 2), relevant_cols]



In [39]:
# Show the row on which route 2 starts.
print(start_row)

      latitude  longitude  route_number  route_start
736  113.81002  22.626383             2         True


In [40]:
# Show the row on which route 2 ends.
print(end_row)

       latitude  longitude  route_number  route_start
783  113.855232  22.574484             2        False


In [56]:
# Take the coordinates from the first (here: only) start row of route 2.
start_lat, start_long = start_row['latitude'].iloc[0], start_row['longitude'].iloc[0]

In [57]:
# NOTE(review): the recorded output prints ~113.8 for start_lat and ~22.6 for start_long,
# so the latitude/longitude labels look swapped in the frame this was run against —
# confirm the column order before relying on near_airport / near_bus_station.
print(start_lat)
print(start_long)

113.81002
22.626383
