In [96]:
import os
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline  

In [97]:
def load_csv_as_df(file_name, sub_directories, column_numbers=None, column_names=None):
    '''
    Load any csv as a pandas dataframe. Provide the filename, the subdirectories, and columns to read(if desired).
    '''
    base_path = os.getcwd()
    full_path = base_path + sub_directories + file_name

    if column_numbers is not None:
        df = pd.read_csv(full_path, usecols=column_numbers)
    else:
        df = pd.read_csv(full_path)
        
    if column_names is not None:
        df.columns = column_names

    return df


def lookup(s):
    """
    This is an extremely fast approach to datetime parsing.
    For large data, the same dates are often repeated. Rather than
    re-parse these, we store all unique dates, parse them, and
    use a lookup to convert all dates.
    """
    dates = {date:pd.to_datetime(date) for date in s.unique()}
    return s.map(dates)

In [98]:
def label_trajectories(df):
    updated_dfs = []
    taxi_ids = df['taxi_id'].unique()
    print('There are ', len(taxi_ids), ' unique taxi ids in this data')
    empty_route = -1
    trajectory_number = 1

    completed_count = 0

    for taxi_id in taxi_ids:
        # get the df for that taxis
        taxi_df = df.loc[df['taxi_id'] == taxi_id]
        taxi_df['time'] = lookup(taxi_df['time'])
        taxi_df.sort_values(by=['time'], inplace=True)
        
        passenger_got_in = False
        route_numbers = []
        route_starts = []
        route_ends = []

        for index, row in taxi_df.iterrows():
            passenger_in_taxi = row['occupancy_status']

            # Do we already have a passenger?
            if passenger_got_in:
                if passenger_in_taxi:
                    # trajectory still going
                    route_starts.append(False)
                    route_ends.append(False)
                    route_numbers.append(trajectory_number)
                    continue
                elif not passenger_in_taxi:
                    # trajectory ended
                    passenger_got_in = False
                    route_starts.append(False)
                    route_ends.append(True)
                    route_numbers.append(trajectory_number)
                    trajectory_number += 1

            elif passenger_in_taxi:
                passenger_got_in = True
                route_starts.append(True)
                route_ends.append(False)
                route_numbers.append(trajectory_number)

            else:
                route_starts.append(False)
                route_ends.append(False)
                route_numbers.append(empty_route)

        taxi_df['route_number'] = route_numbers
        taxi_df['route_start'] = route_starts
        taxi_df['route_end'] = route_ends
        updated_dfs.append(taxi_df)
        completed_count += 1

        if completed_count % 1000 == 0:
            print('Completed ', completed_count, ' taxi_ids out of ', len(taxi_ids))

    return pd.concat(updated_dfs)

In [99]:
def find_trajectories_at_airport_or_bus(df):
    relevant_route_numbers = []
    relevant_cols = ['latitude', 'longitude', 'route_number', 'route_start']
    route_numbers = df.route_number.unique()

    number_of_trajectories = len(route_numbers) - 1

    for route_number in route_numbers:
        if route_number != -1:

            if route_number % 10000 == 0:
                print('On route number ', route_number, ' out of ', number_of_trajectories)
                print('Currently found ', len(relevant_route_numbers), ' relevant routes')

            start_row = df[relevant_cols][(df['route_start'] == True) & (df['route_number'] == route_number)]
            end_row = df[relevant_cols][(df['route_end'] == True) & (df['route_number'] == route_number)]

            # print(start_row)
            # print(end_row)

            start_lat = start_row['latitude'].iloc[0]
            start_long = start_row['longitude'].iloc[0]

            end_lat = end_row['latitude'].iloc[0]
            end_long = end_row['longitude'].iloc[0]

            if near_airport(start_lat, start_long) and near_train_station(end_lat, end_long):
                relevant_route_numbers.append(route_number)
            elif near_train_station(start_lat, start_long) and near_airport(end_lat, end_long):
                relevant_route_numbers.append(route_number)

    return relevant_route_numbers


def near_airport(lat, long):
    if 22.605770 <= lat <= 22.667089 and 113.784647 <= long <= 113.837340:
        return True
    else:
        return False


def near_bus_station(lat, long):
    if 22.567210 <= lat <= 22.568807 and 114.089676 <= long <= 114.091320:
        return True
    else:
        return False
    

def near_train_station(lat, long):
    if 22.604998 <= lat <= 22.614221 and 114.021111 <= long <= 114.034778:
        return True
    else:
        return False

In [100]:
col_numbers = [3, 4, 5, 6, 7, 8, 12]
col_names = ['longitude', 'latitude', 'time', 'taxi_id', 'speed', 'direction',
             'occupancy_status']

df = load_csv_as_df('part-m-00000', '/2014-04-06/', col_numbers, col_names)

In [101]:
%%time
traj_df = label_trajectories(df)

There are  4510  unique taxi ids in this data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice fro

Completed  1000  taxi_ids out of  4510
Completed  2000  taxi_ids out of  4510
Completed  3000  taxi_ids out of  4510
Completed  4000  taxi_ids out of  4510
CPU times: user 22min 22s, sys: 8.36 s, total: 22min 30s
Wall time: 23min 40s


In [102]:
relevant_route_numbers = find_trajectories_at_airport_or_bus(traj_df)

On route number  10000  out of  58454
Currently found  1  relevant routes
On route number  20000  out of  58454
Currently found  1  relevant routes
On route number  30000  out of  58454
Currently found  1  relevant routes
On route number  40000  out of  58454
Currently found  1  relevant routes
On route number  50000  out of  58454
Currently found  1  relevant routes


In [103]:
print(relevant_route_numbers)

[1061, 52401]


In [126]:
starts = traj_df.loc[traj_df['route_start'] == True]

In [127]:
# Airport starts
air_starts = starts[(starts['latitude'] >= 22.605770) & (starts['latitude'] <= 22.667089)]

print('starts after latitude: ', len(air_starts))

air_starts = air_starts[(air_starts['longitude'] >= 113.784647) & (air_starts['longitude'] <= 113.837340)]

print('starts after longitude: ', len(air_starts))

starts after latitude:  13172
starts after longitude:  940


In [128]:
air_starts.head()

Unnamed: 0,longitude,latitude,time,taxi_id,speed,direction,occupancy_status,route_number,route_start,route_end
5726,113.81015,22.661867,2014-04-06 00:03:41,1453571,86,153,1,444,True,False
6784,113.811417,22.615442,2014-04-06 14:31:08,1143434,59,150,1,527,True,False
12403,113.82,22.6124,2014-04-06 19:13:47,1141056,58,52,1,1062,True,False
17837,113.806686,22.62385,2014-04-06 15:14:31,1140904,23,181,1,1427,True,False
18131,113.809402,22.6189,2014-04-06 04:35:09,1454788,74,0,1,1437,True,False


In [129]:
# Train Ends
ends = traj_df.loc[traj_df['route_end'] == True]
train_ends = ends[(ends['latitude'] >= 22.604998) & (ends['latitude'] <= 22.614221)]

print('Train ends after latitude: ', len(train_ends))

train_ends = train_ends[(train_ends['longitude'] >= 114.021111) & (train_ends['longitude'] <= 114.034778)]

print('Train ends after longitude: ', len(train_ends))

Train ends after latitude:  2627
Train ends after longitude:  391


In [130]:
air_start_route_numbers = air_starts['route_number'].unique()
train_end_route_numbers = train_ends['route_number'].unique()

In [131]:
print(len(air_start_route_numbers))
print(len(train_end_route_numbers))

940
391


In [132]:
air_to_train = list(set(air_start_route_numbers) & set(train_end_route_numbers))
print(len(air_to_train))

0


In [133]:
# Train Starts
train_starts = starts[(starts['latitude'] >= 22.604998) & (starts['latitude'] <= 22.614221)]

print('Train ends after latitude: ', len(train_starts))

train_starts = train_starts[(train_starts['longitude'] >= 114.021111) & (train_starts['longitude'] <= 114.034778)]

# Airport Ends
air_ends = ends[(ends['latitude'] >= 22.605770) & (ends['latitude'] <= 22.667089)]

print('starts after latitude: ', len(air_ends))

air_ends = air_ends[(air_ends['longitude'] >= 113.784647) & (air_ends['longitude'] <= 113.837340)]

print('starts after longitude: ', len(air_ends))

Train ends after latitude:  2790
starts after latitude:  13076
starts after longitude:  1149


In [134]:
air_end_route_numbers = air_ends['route_number'].unique()
train_start_route_numbers = train_starts['route_number'].unique()

train_to_air = list(set(air_end_route_numbers) & set(train_start_route_numbers))
print(len(train_to_air))

2


In [135]:
print(train_to_air)

[52401, 1061]


In [145]:
route_df = traj_df.loc[traj_df['route_number'] == 1061]

In [146]:
route_df.head(len(route_df))

Unnamed: 0,longitude,latitude,time,taxi_id,speed,direction,occupancy_status,route_number,route_start,route_end
12495,114.021797,22.611032,2014-04-06 16:36:39,1141056,23,225,1,1061,True,False
12557,114.056465,22.578951,2014-04-06 16:45:29,1141056,41,124,1,1061,False,False
12562,114.06665,22.5686,2014-04-06 16:48:22,1141056,43,108,1,1061,False,False
12563,114.068649,22.568951,2014-04-06 16:49:07,1141056,31,75,1,1061,False,False
12579,114.104652,22.573418,2014-04-06 16:53:40,1141056,77,57,1,1061,False,False
12586,114.105614,22.573999,2014-04-06 16:53:47,1141056,72,57,1,1061,False,False
12609,114.119003,22.561783,2014-04-06 16:57:04,1141056,66,161,1,1061,False,False
12610,114.124664,22.554283,2014-04-06 17:00:06,1141056,0,337,1,1061,False,False
12590,114.10778,22.54755,2014-04-06 17:13:55,1141056,0,238,1,1061,False,False
12592,114.110016,22.542601,2014-04-06 17:16:20,1141056,0,169,1,1061,False,False


In [148]:
y = df[df['taxi_id'] == 1142326]
y['time'] = lookup(y['time'])
y.sort_values(by=['time'], inplace=True)
y.head(len(y))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,longitude,latitude,time,taxi_id,speed,direction,occupancy_status
707535,113.797272,22.677549,2014-04-06 00:05:13,1142326,0,350,0
707536,113.799408,22.672390,2014-04-06 00:06:59,1142326,61,120,0
707540,113.833206,22.624706,2014-04-06 00:12:58,1142326,89,160,0
707537,113.812515,22.627260,2014-04-06 00:16:58,1142326,45,330,0
707547,113.840454,22.607096,2014-04-06 00:26:59,1142326,0,150,1
707604,113.892685,22.551693,2014-04-06 00:40:59,1142326,0,50,1
707600,113.891647,22.565266,2014-04-06 00:46:28,1142326,53,40,0
707597,113.891006,22.569506,2014-04-06 00:47:54,1142326,0,310,0
707609,113.895599,22.568714,2014-04-06 00:52:58,1142326,60,220,1
707608,113.895515,22.558693,2014-04-06 00:57:59,1142326,18,320,0


In [149]:
df2 = load_csv_as_df('part-m-00001', '/2014-04-06/', col_numbers, col_names)
y2 = df2[df2['taxi_id'] == 1142326]
y2['time'] = lookup(y2['time'])
y2.sort_values(by=['time'], inplace=True)
y2.head(len(y))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,longitude,latitude,time,taxi_id,speed,direction,occupancy_status


In [118]:
print(near_train_station(22.611032, 114.021797))

True


In [110]:
near_train_station(22.612780, 114.029549)

True

In [112]:
near_airport(22.571199, 113.892601)

False

In [113]:
near_train_station(22.571199, 113.892601)

False

In [114]:
near_bus_station(22.571199, 113.892601)

False

In [43]:
taxi_df['route_start'] = taxi_df.apply(lambda row: label_starts(row), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [46]:
taxi_df['route_end'] = taxi_df.apply(lambda row: label_ends(row), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [48]:
taxi_df.head()

Unnamed: 0,longitude,latitude,time,taxi_id,speed,direction,initial_passenger_status,final_passenger_status,route_start,route_end
331100,113.846298,22.6105,2014-04-06 00:00:00,1454801,53,256,0,1,True,False
331091,113.842102,22.611799,2014-04-06 00:04:37,1454801,66,0,0,0,False,False
331044,113.809898,22.6271,2014-04-06 00:11:22,1454801,14,0,0,0,False,False
331043,113.809196,22.6194,2014-04-06 00:15:23,1454801,64,0,0,1,True,False
331104,113.855499,22.632,2014-04-06 00:20:53,1454801,70,0,0,1,True,False


In [44]:
start_df = taxi_df.loc[taxi_df['route_start'] == True]
print(len(start_df))

148


In [47]:
end_df = taxi_df.loc[taxi_df['route_end'] == True]
print(len(end_df))

0


In [49]:
initial_df = df.loc[df['initial_passenger_status'] == 1]
initial_df.head()

Unnamed: 0,longitude,latitude,time,taxi_id,speed,direction,initial_passenger_status,final_passenger_status
