In [1]:
import pandas as pd
import os

In [2]:
def load_csv_as_df(file_name, sub_directories, column_numbers=None, column_names=None):
    '''
    Read a CSV that lives below the current working directory into a DataFrame.

    The path is built by plain string concatenation
    (``os.getcwd() + sub_directories + file_name``), so ``sub_directories``
    has to carry its own separators, e.g. ``'/'`` or ``'/data/'``.

    Parameters
    ----------
    file_name : str
        Name of the file to read.
    sub_directories : str
        Path fragment placed between the working directory and ``file_name``.
    column_numbers : optional
        Forwarded to ``pd.read_csv`` as ``usecols`` when given.
    column_names : optional
        When given, replaces the column labels of the loaded frame.

    Returns
    -------
    pandas.DataFrame
    '''
    csv_path = ''.join([os.getcwd(), sub_directories, file_name])

    # Only pass ``usecols`` when the caller asked for a column subset.
    read_options = {} if column_numbers is None else {'usecols': column_numbers}
    frame = pd.read_csv(csv_path, **read_options)

    if column_names is not None:
        frame.columns = column_names

    return frame

In [4]:
def load_data(file_name, sub_directories, with_routes=False):
    '''
    Load a taxi GPS CSV and label its columns.

    Parameters
    ----------
    file_name : str
        Name of the CSV file.
    sub_directories : str
        Path fragment between the working directory and ``file_name``
        (see ``load_csv_as_df``).
    with_routes : bool, default False
        When True the file is expected to also carry the three route columns
        (``route_number``, ``route_start``, ``route_end``) after the six base
        columns.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with its columns renamed.
    '''
    # The base names are always needed; the route names are appended to them.
    # (Previously the base list was only created when with_routes was False,
    # so with_routes=True failed with UnboundLocalError.)
    col_names = ['taxi_id', 'time', 'longitude', 'latitude', 'occupancy_status', 'speed']
    if with_routes:
        col_names += ['route_number', 'route_start', 'route_end']

    df = load_csv_as_df(file_name, sub_directories, None, col_names)

    return df

In [5]:
# Raw GPS readings, read from <cwd>/TaxiData.
df = load_data(file_name='TaxiData', sub_directories='/')

In [6]:
def filter_data_by_gps(df, with_pass=False,
                       lat_bounds=(22.606742, 22.627078),
                       lon_bounds=(113.804928, 113.827262),
                       place='airport'):
    '''
    Keep the GPS readings that fall inside a latitude/longitude bounding box.

    Prints a few counts along the way (taxi ids in the input, readings inside
    the latitude band, readings and taxi ids inside the box).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``taxi_id``, ``latitude``, ``longitude`` and, when
        ``with_pass`` is True, ``occupancy_status``.
    with_pass : bool, default False
        When True, only readings with ``occupancy_status == 1`` are returned.
    lat_bounds : (float, float)
        Inclusive (min, max) latitude. Defaults to the airport box below.
    lon_bounds : (float, float)
        Inclusive (min, max) longitude. Defaults to the airport box below.
    place : str, default 'airport'
        Label used in the printed messages.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` inside the box (optionally occupied only).
    '''
    # Airport in Shenzhen is 22.627078, 113.804928 and 22.606742, 113.827262.
    # Train Station in Shenzhen is 22.605502, 114.023724 and 22.613580, 114.034568.
    lat_min, lat_max = lat_bounds
    lon_min, lon_max = lon_bounds

    all_taxi_ids = df['taxi_id'].unique()
    print('There are ', len(all_taxi_ids), ' taxi ids in this dataset!')

    near_lat = df[(df['latitude'] >= lat_min) & (df['latitude'] <= lat_max)]
    print('There are ', len(near_lat), ' GPS readings near the latitude of the ' + place)

    near_place = near_lat[(near_lat['longitude'] >= lon_min) & (near_lat['longitude'] <= lon_max)]

    print('There are ', len(near_place), ' GPS readings near the ' + place + '!')
    taxi_ids = near_place['taxi_id'].unique()
    print('There are ', len(taxi_ids), ' taxi ids near the ' + place + '!')

    if not with_pass:
        return near_place

    # Use a separate name for the result so the boolean flag is not overwritten.
    occupied = near_place[near_place['occupancy_status'] == 1]
    print('There are ', len(occupied), ' GPS readings near the ' + place + ' with a passenger!')
    occupied_ids = occupied['taxi_id'].unique()
    print('There are ', len(occupied_ids), ' taxi ids near the ' + place + ' with a passenger!')
    return occupied

In [7]:
# Every reading inside the airport bounding box, occupied or not.
near_airport = filter_data_by_gps(df, with_pass=False)

There are  14728  taxi ids in this dataset!
There are  3939626  GPS readings near the latitude of the airport
There are  561552  GPS readings near the airport!
There are  7500  taxi ids near the airport!


In [8]:
def get_taxi_data_near_airport_data(near_airport, full_df):
    '''
    Return every row of ``full_df`` whose ``taxi_id`` occurs in ``near_airport``.

    Parameters
    ----------
    near_airport : pandas.DataFrame
        Frame whose ``taxi_id`` column supplies the ids to keep.
    full_df : pandas.DataFrame
        Frame to filter; must have a ``taxi_id`` column.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``full_df``, in their original order.
    '''
    wanted_ids = near_airport['taxi_id'].unique()
    is_wanted = full_df['taxi_id'].isin(wanted_ids)
    return full_df[is_wanted]

In [9]:
# All readings (anywhere) of the taxis that were seen inside the airport box.
relevant_df = get_taxi_data_near_airport_data(near_airport=near_airport, full_df=df)

In [10]:
# Number of readings kept after restricting to taxis seen near the airport.
print(len(relevant_df))

23537617


In [11]:
def lookup(s):
    """
    Convert a Series of date values to timestamps, parsing each distinct value once.

    Large datasets tend to repeat the same date strings many times, so each
    unique value is parsed a single time with ``pd.to_datetime`` and the
    results are spread back over the Series through a dictionary lookup.

    Parameters
    ----------
    s : pandas.Series
        The raw date values.

    Returns
    -------
    pandas.Series
        Same index as ``s``, holding the parsed values.
    """
    parsed_by_raw = {}
    for raw_value in s.unique():
        parsed_by_raw[raw_value] = pd.to_datetime(raw_value)
    return s.map(parsed_by_raw)

In [12]:
def label_trajectories(df):
    """
    Tag every GPS reading with the passenger trip ("route") it belongs to.

    Readings are processed per taxi, in time order. A route starts at the first
    reading with a truthy ``occupancy_status`` and ends at the first following
    reading where it is falsy; that closing reading still carries the route's
    number. Readings outside any route get route number -1.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``taxi_id``, ``time`` and ``occupancy_status``.
        The caller's frame is left untouched.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` grouped by taxi (first-appearance order) and sorted
        by time within each taxi, with ``time`` parsed by ``lookup`` and three
        added columns: ``route_number`` (int, -1 when not on a route),
        ``route_start`` and ``route_end`` (bool).
    """
    # Parse the timestamps on a new frame: the input is typically a filtered
    # slice, and writing into it mutates caller state and triggers
    # SettingWithCopyWarning.
    df = df.assign(time=lookup(df['time']))

    updated_dfs = []
    taxi_ids = df['taxi_id'].unique()
    print('There are ', len(taxi_ids), ' in this data')
    empty_route = -1
    trajectory_number = 1

    completed_count = 0

    # One pass over the frame instead of a full boolean scan per taxi.
    # sort=False keeps the groups in first-appearance order, like .unique().
    for _, taxi_rows in df.groupby('taxi_id', sort=False):
        taxi_df = taxi_rows.sort_values(by=['time'])
        passenger_got_in = False
        route_numbers = []
        route_starts = []
        route_ends = []

        # Only the occupancy flag is needed, so iterate that column directly.
        for passenger_in_taxi in taxi_df['occupancy_status']:
            if passenger_got_in:
                route_starts.append(False)
                route_numbers.append(trajectory_number)
                if passenger_in_taxi:
                    # trajectory still going
                    route_ends.append(False)
                else:
                    # trajectory ended on this reading
                    route_ends.append(True)
                    passenger_got_in = False
                    trajectory_number += 1
            elif passenger_in_taxi:
                # first occupied reading: a new trajectory starts
                passenger_got_in = True
                route_starts.append(True)
                route_ends.append(False)
                route_numbers.append(trajectory_number)
            else:
                # empty taxi, not on a route
                route_starts.append(False)
                route_ends.append(False)
                route_numbers.append(empty_route)

        # A trip still open when this taxi's data runs out must not share its
        # number with the first trip of the next taxi.
        if passenger_got_in:
            trajectory_number += 1

        taxi_df = taxi_df.assign(
            route_number=route_numbers,
            route_start=route_starts,
            route_end=route_ends,
        )
        updated_dfs.append(taxi_df)
        completed_count += 1

        if completed_count % 100 == 0:
            print('Completed ', completed_count, ' taxi_ids out of ', len(taxi_ids))

    if not updated_dfs:
        # pd.concat([]) raises; return the (empty) frame with the new columns.
        return df.assign(route_number=[], route_start=[], route_end=[])

    return pd.concat(updated_dfs)

In [13]:
# labeled = label_trajectories(relevant_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


There are  7500  in this data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Completed  100  taxi_ids out of  7500
Completed  200  taxi_ids out of  7500
Completed  300  taxi_ids out of  7500
Completed  400  taxi_ids out of  7500
Completed  500  taxi_ids out of  7500
Completed  600  taxi_ids out of  7500
Completed  700  taxi_ids out of  7500
Completed  800  taxi_ids out of  7500
Completed  900  taxi_ids out of  7500
Completed  1000  taxi_ids out of  7500
Completed  1100  taxi_ids out of  7500
Completed  1200  taxi_ids out of  7500
Completed  1300  taxi_ids out of  7500
Completed  1400  taxi_ids out of  7500
Completed  1500  taxi_ids out of  7500
Completed  1600  taxi_ids out of  7500
Completed  1700  taxi_ids out of  7500
Completed  1800  taxi_ids out of  7500
Completed  1900  taxi_ids out of  7500
Completed  2000  taxi_ids out of  7500
Completed  2100  taxi_ids out of  7500
Completed  2200  taxi_ids out of  7500
Completed  2300  taxi_ids out of  7500
Completed  2400  taxi_ids out of  7500
Completed  2500  taxi_ids out of  7500
Completed  2600  taxi_ids out of  

In [14]:
# labeled.to_csv('original-labeled-routes.csv', encoding='utf-8')

In [18]:
def filter_data_by_train_gps(df, with_pass=False):
    '''
    Keep the GPS readings that fall inside the train-station bounding box.

    Prints a few counts along the way (taxi ids in the input, readings inside
    the latitude band, readings and taxi ids inside the box).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``taxi_id``, ``latitude``, ``longitude`` and, when
        ``with_pass`` is True, ``occupancy_status``.
    with_pass : bool, default False
        When True, only readings with ``occupancy_status == 1`` are returned.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` inside the box (optionally occupied only).
    '''
    # Train Station in Shenzhen is 22.605502, 114.023724 and 22.613580, 114.034568.
    all_taxi_ids = df['taxi_id'].unique()
    print('There are ', len(all_taxi_ids), ' taxi ids in this dataset!')

    near_lat = df[(df['latitude'] >= 22.605502) & (df['latitude'] <= 22.613580)]
    print('There are ', len(near_lat), ' GPS readings near the latitude of the train station')

    near_station = near_lat[(near_lat['longitude'] >= 114.023724) & (near_lat['longitude'] <= 114.034568)]

    # The messages name the train station: this box is not the airport.
    print('There are ', len(near_station), ' GPS readings near the train station!')
    taxi_ids = near_station['taxi_id'].unique()
    print('There are ', len(taxi_ids), ' taxi ids near the train station!')

    if not with_pass:
        return near_station

    # Use a separate name for the result so the boolean flag is not overwritten.
    occupied = near_station[near_station['occupancy_status'] == 1]
    print('There are ', len(occupied), ' GPS readings near the train station with a passenger!')
    occupied_ids = occupied['taxi_id'].unique()
    print('There are ', len(occupied_ids), ' taxi ids near the train station with a passenger!')
    return occupied

In [24]:
# Save the airport -> train-station route numbers, one per line.
# NOTE: `air_to_train` is built in a later cell (In [48]), so this cell fails
# with NameError on a fresh top-to-bottom run.
with open('original-airport_to_train_route_numbers.txt', 'w') as f:
    f.writelines("%s\n" % route_number for route_number in air_to_train)

In [25]:
# Save the train-station -> airport route numbers, one per line.
# NOTE: `train_to_air` is built in a later cell (In [50]), so this cell fails
# with NameError on a fresh top-to-bottom run.
with open('original-train_to_airport_route_numbers.txt', 'w') as f:
    f.writelines("%s\n" % route_number for route_number in train_to_air)

In [41]:
# Reload the labelled readings from <cwd>/original-labeled-routes.csv
# (presumably the file written by the commented-out to_csv cell above).
base_path = os.getcwd()
full_path = base_path + '/original-labeled-routes.csv'
labeled = pd.read_csv(full_path)
# NOTE(review): drop() returns a new frame and the result is not assigned, so
# this line leaves `labeled` unchanged. The export cells further down drop
# columns by position (columns[0], then columns[5]); assigning the result here
# would shift those positions, so change both places together or not at all.
labeled.drop(labeled.columns[0], axis=1)
# Keep only readings that belong to a route (-1 is the "no route" marker
# used by label_trajectories).
labeled = labeled[labeled['route_number'] != -1]

In [42]:
# Preview the first route readings; also shows which index-like columns came back from the CSV.
labeled.head()

Unnamed: 0.1,Unnamed: 0,taxi_id,time,longitude,latitude,occupancy_status,speed,route_number,route_start,route_end
13,6602,22224,2018-11-25 00:03:28,114.034332,22.554117,1,53,1,True,False
14,4889,22224,2018-11-25 00:03:43,114.033501,22.552917,1,41,1,False,False
15,2675,22224,2018-11-25 00:06:04,114.031998,22.54875,0,3,1,False,True
40,3705,22224,2018-11-25 00:13:56,114.040421,22.551434,1,17,2,True,False
41,3490,22224,2018-11-25 00:14:11,114.039886,22.552834,1,33,2,False,False


In [43]:
# Change the route numbers in this CSV so that they start after the route numbers in the other CSV file
# Add 172592 to each route number
# NOTE: this cell is not idempotent - running it again shifts the numbers a second time.
ROUTE_NUMBER_OFFSET = 172592
# `labeled` is a boolean-filtered frame; assign() builds a new frame instead of
# writing a column into that slice (which raises SettingWithCopyWarning).
labeled = labeled.assign(route_number=labeled["route_number"] + ROUTE_NUMBER_OFFSET)
labeled.head()

Unnamed: 0.1,Unnamed: 0,taxi_id,time,longitude,latitude,occupancy_status,speed,route_number,route_start,route_end
13,6602,22224,2018-11-25 00:03:28,114.034332,22.554117,1,53,172593,True,False
14,4889,22224,2018-11-25 00:03:43,114.033501,22.552917,1,41,172593,False,False
15,2675,22224,2018-11-25 00:06:04,114.031998,22.54875,0,3,172593,False,True
40,3705,22224,2018-11-25 00:13:56,114.040421,22.551434,1,17,172594,True,False
41,3490,22224,2018-11-25 00:14:11,114.039886,22.552834,1,33,172594,False,False


In [44]:
# labeled.to_csv('original-labeled-routes-updated.csv', encoding='utf-8')

In [45]:
# Readings flagged as the first reading of a route.
labeled_starts = labeled.loc[labeled['route_start'] == True]
print(len(labeled_starts))

# Readings flagged as the closing reading of a route.
labeled_ends = labeled.loc[labeled['route_end'] == True]
print(len(labeled_ends))

280008


In [47]:
# Routes that begin inside the airport box ...
air_starts = filter_data_by_gps(labeled_starts)
print(len(air_starts))

# ... and routes that finish inside the train-station box.
train_ends = filter_data_by_train_gps(labeled_ends, with_pass=False)
print(len(train_ends))

There are  5655  taxi ids in this dataset!
There are  20192  GPS readings near the latitude of the airport
There are  7047  GPS readings near the airport!
There are  4224  taxi ids near the airport!
7047
There are  5578  taxi ids in this dataset!
There are  7141  GPS readings near the latitude of the airport
There are  1427  GPS readings near the airport!
There are  1073  taxi ids near the airport!
1427


In [48]:
# A route goes airport -> train station when its number shows up both among
# the airport starts and among the train-station ends.
route_start_numbers = air_starts['route_number'].unique()
route_end_numbers = train_ends['route_number'].unique()
air_to_train = list(set(route_start_numbers) & set(route_end_numbers))
print(len(air_to_train))

15


In [57]:
# Route numbers that start in the airport box and end in the train-station box.
print(air_to_train)

[277593, 235842, 344327, 384940, 297166, 337840, 320657, 208721, 343667, 178778, 228565, 262969, 329370, 293275, 433086]


In [49]:
# Reverse direction: routes that begin in the train-station box ...
train_starts = filter_data_by_train_gps(labeled_starts, with_pass=False)
# ... and routes that finish in the airport box.
air_ends = filter_data_by_gps(labeled_ends)

print(len(train_starts))
print(len(air_ends))

There are  5655  taxi ids in this dataset!
There are  6864  GPS readings near the latitude of the airport
There are  1109  GPS readings near the airport!
There are  865  taxi ids near the airport!
There are  5578  taxi ids in this dataset!
There are  17466  GPS readings near the latitude of the airport
There are  3291  GPS readings near the airport!
There are  2435  taxi ids near the airport!
1109
3291


In [50]:
# A route goes train station -> airport when its number shows up both among
# the airport ends and among the train-station starts.
train_start_numbers = train_starts['route_number'].unique()
air_end_numbers = air_ends['route_number'].unique()

train_to_air = list(set(air_end_numbers) & set(train_start_numbers))
print(len(train_to_air))

4


In [51]:
# Route numbers that start in the train-station box and end in the airport box.
print(train_to_air)

[428977, 375971, 324340, 445438]


In [58]:
# Spot-check one airport -> train-station route (235842 is listed in air_to_train).
x = labeled[labeled['route_number'] == 235842]
print(len(x))
# Only a cell's last expression is rendered, so the preview goes last;
# in the middle of the cell its result was silently discarded.
x.head()

Unnamed: 0.1,Unnamed: 0,taxi_id,time,longitude,latitude,occupancy_status,speed,route_number,route_start,route_end
5438778,11069733,25533,2018-11-25 18:58:12,113.809944,22.626711,1,2,235842,True,False
5438779,11069284,25533,2018-11-25 18:58:46,113.808609,22.625177,1,42,235842,False,False
5438780,11069732,25533,2018-11-25 18:59:46,113.809433,22.618975,1,69,235842,False,False
5438781,11070221,25533,2018-11-25 19:00:46,113.814407,22.610636,1,56,235842,False,False
5438782,11069002,25533,2018-11-25 19:01:46,113.821671,22.613285,1,67,235842,False,False


In [59]:
# Number of GPS readings on the route selected into `x`.
print(len(x))

51


In [70]:
# Export every reading of the train-station -> airport routes.
train_to_air_route_df = labeled.loc[labeled['route_number'].isin(train_to_air)]
# Two columns are removed by position: the first column, then the column at
# position 5 of what remains. Going by the labeled.head() output earlier in the
# notebook these would be 'Unnamed: 0.1' and 'occupancy_status' - TODO confirm.
train_to_air_route_df = train_to_air_route_df.drop(columns=train_to_air_route_df.columns[0])
train_to_air_route_df = train_to_air_route_df.drop(columns=train_to_air_route_df.columns[5])
train_to_air_route_df.to_csv('train-to-air-routes-no-grids.csv', index=False, encoding='utf-8')

In [71]:
# Export every reading of the airport -> train-station routes.
air_to_air_train_df = labeled[labeled['route_number'].isin(air_to_train)]
# Two columns are removed by position: the first column, then the column at
# position 5 of what remains. Going by the labeled.head() output earlier in the
# notebook these would be 'Unnamed: 0.1' and 'occupancy_status' - TODO confirm.
air_to_air_train_df = air_to_air_train_df.drop(air_to_air_train_df.columns[0], axis=1)
air_to_air_train_df = air_to_air_train_df.drop(air_to_air_train_df.columns[5], axis=1)
# index=False to match the train-to-air export: writing the index adds an
# extra unnamed column that comes back as 'Unnamed: 0' when the file is reloaded.
air_to_air_train_df.to_csv('air-to-train-routes-no-grids.csv', encoding='utf-8', index=False)
