In [3]:
import pandas as pd
import os

In [6]:
def load_csv_as_df(file_name, sub_directories, column_numbers=None, column_names=None):
    '''
    Load any csv as a pandas dataframe. Provide the filename, the subdirectories, and columns to read(if desired).
    '''
    base_path = os.getcwd()
    full_path = base_path + sub_directories + file_name

    if column_numbers is not None:
        df = pd.read_csv(full_path, usecols=column_numbers)
    else:
        df = pd.read_csv(full_path)

    if column_names is not None:
        df.columns = column_names

    return df

In [9]:
def load_data(file_name, sub_directories):
    col_names = ['taxi_id', 'longitude', 'latitude', 'time', 'occupancy_status']

    df = load_csv_as_df(file_name, sub_directories, None, col_names)

    return df

In [10]:
df = load_data('GPS_2016_06_13', '/')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [10]:
def filter_data_by_gps(df, with_pass=False):
    # Airport in Shenzhen is 22.627078, 113.804928 and 22.606742, 113.827262.
    # Train Station in Shenzhen is 22.605502, 114.023724 and 22.613580, 114.034568.

    all_taxi_ids = df['taxi_id'].unique()
    print('There are ', len(all_taxi_ids), ' taxi ids in this dataset!')

    near_lat = df[(df['latitude'] >= 22.606742) & (df['latitude'] <= 22.627078)]
    print('There are ', len(near_lat), ' GPS readings near the latitude of the airport')

    near_airport = near_lat[(near_lat['longitude'] >= 113.804928) & (near_lat['longitude'] <= 113.827262)]

    print('There are ', len(near_airport), ' GPS readings near the airport!')
    taxi_ids = near_airport['taxi_id'].unique()
    print('There are ', len(taxi_ids), ' taxi ids near the airport!')

    if with_pass:
        with_pass = near_airport[near_airport['occupancy_status'] == 1]
        print('There are ', len(with_pass), ' GPS readings near the airport with a passenger!')
        with_pass_ids = with_pass['taxi_id'].unique()
        print('There are ', len(with_pass_ids), ' taxi ids near the airport with a passenger!')
        return with_pass
    else:
        return near_airport

In [17]:
df["latitude"] = pd.to_numeric(df["latitude"], errors='coerce')

In [18]:
df.dtypes

taxi_id              object
longitude           float64
latitude            float64
time                 object
occupancy_status      int64
dtype: object

In [19]:
df.head()

Unnamed: 0,taxi_id,longitude,latitude,time,occupancy_status
0,c2Zcv7WkrQY=,114.09185,22.541468,2016-06-13 00:08:51,0
1,S5J3ZlYOh0g=,113.881599,22.5716,2016-06-13 00:08:32,0
2,dLZ1aGYxi9I=,113.919151,22.527117,2016-06-13 00:08:43,1
3,gcWhFSorFRc=,114.092903,22.543301,2016-06-13 00:08:23,0
4,o3Aq6hjujQw=,113.813736,22.62405,2016-06-13 00:08:50,0


In [65]:
near_airport = filter_data_by_gps(df)

There are  22152  taxi ids in this dataset!
There are  5331057  GPS readings near the latitude of the airport
There are  1071875  GPS readings near the airport!
There are  6562  taxi ids near the airport!


In [66]:
def get_taxi_data_near_airport_data(near_airport, full_df):
    taxi_ids = near_airport['taxi_id'].unique()
    
    relevant_taxis = full_df[full_df['taxi_id'].isin(taxi_ids)]
    
    return relevant_taxis

In [67]:
relevant_df = get_taxi_data_near_airport_data(near_airport, df)

In [68]:
print(len(relevant_df))

22911532


In [69]:
def lookup(s):
    """
    This is an extremely fast approach to datetime parsing.
    For large data, the same dates are often repeated. Rather than
    re-parse these, we store all unique dates, parse them, and
    use a lookup to convert all dates.
    """
    dates = {date: pd.to_datetime(date) for date in s.unique()}
    return s.map(dates)

In [70]:
def label_trajectories(df):
    df['time'] = lookup(df['time'])
    updated_dfs = []
    taxi_ids = df['taxi_id'].unique()
    print('There are ', len(taxi_ids), ' in this data')
    empty_route = -1
    trajectory_number = 1

    completed_count = 0

    for taxi_id in taxi_ids:
        # get the df for that taxis
        taxi_df = df.loc[df['taxi_id'] == taxi_id]
        taxi_df.sort_values(by=['time'], inplace=True)
        passenger_got_in = False
        route_numbers = []
        route_starts = []
        route_ends = []

        for index, row in taxi_df.iterrows():
            passenger_in_taxi = row['occupancy_status']

            # Do we already have a passenger?
            if passenger_got_in:
                if passenger_in_taxi:
                    # trajectory still going
                    route_starts.append(False)
                    route_ends.append(False)
                    route_numbers.append(trajectory_number)
                    continue
                elif not passenger_in_taxi:
                    # trajectory ended
                    passenger_got_in = False
                    route_starts.append(False)
                    route_ends.append(True)
                    route_numbers.append(trajectory_number)
                    trajectory_number += 1

            elif passenger_in_taxi:
                passenger_got_in = True
                route_starts.append(True)
                route_ends.append(False)
                route_numbers.append(trajectory_number)

            else:
                route_starts.append(False)
                route_ends.append(False)
                route_numbers.append(empty_route)

        taxi_df['route_number'] = route_numbers
        taxi_df['route_start'] = route_starts
        taxi_df['route_end'] = route_ends
        updated_dfs.append(taxi_df)
        completed_count += 1

        if completed_count % 100 == 0:
            print('Completed ', completed_count, ' taxi_ids out of ', len(taxi_ids))

    return pd.concat(updated_dfs)

In [71]:
labeled = label_trajectories(relevant_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


There are  6562  in this data


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Completed  100  taxi_ids out of  6562
Completed  200  taxi_ids out of  6562
Completed  300  taxi_ids out of  6562
Completed  400  taxi_ids out of  6562
Completed  500  taxi_ids out of  6562
Completed  600  taxi_ids out of  6562
Completed  700  taxi_ids out of  6562
Completed  800  taxi_ids out of  6562
Completed  900  taxi_ids out of  6562
Completed  1000  taxi_ids out of  6562
Completed  1100  taxi_ids out of  6562
Completed  1200  taxi_ids out of  6562
Completed  1300  taxi_ids out of  6562
Completed  1400  taxi_ids out of  6562
Completed  1500  taxi_ids out of  6562
Completed  1600  taxi_ids out of  6562
Completed  1700  taxi_ids out of  6562
Completed  1800  taxi_ids out of  6562
Completed  1900  taxi_ids out of  6562
Completed  2000  taxi_ids out of  6562
Completed  2100  taxi_ids out of  6562
Completed  2200  taxi_ids out of  6562
Completed  2300  taxi_ids out of  6562
Completed  2400  taxi_ids out of  6562
Completed  2500  taxi_ids out of  6562
Completed  2600  taxi_ids out of  

In [72]:
labeled.to_csv('labeled-routes-all.csv', encoding='utf-8')

In [73]:
labeled_starts = labeled[labeled['route_start'] == True]
print(len(labeled_starts))

In [36]:
labeled_starts.head()

Unnamed: 0,taxi_id,longitude,latitude,time,occupancy_status,route_number,route_start,route_end
891941,c2Zcv7WkrQY=,114.09185,22.541468,2016-06-13 00:27:06,1,1,True,False
3376003,c2Zcv7WkrQY=,114.056953,22.5275,2016-06-13 01:23:18,1,2,True,False
3780670,c2Zcv7WkrQY=,114.041817,22.52615,2016-06-13 01:32:05,1,3,True,False
8088309,c2Zcv7WkrQY=,114.158966,22.611967,2016-06-13 03:10:29,1,4,True,False
9805828,c2Zcv7WkrQY=,114.158966,22.611799,2016-06-13 03:49:09,1,5,True,False


In [75]:
labeled_ends = labeled[labeled['route_end'] == True]
print(len(labeled_ends))

In [38]:
labeled_ends.head()

Unnamed: 0,taxi_id,longitude,latitude,time,occupancy_status,route_number,route_start,route_end
1726865,c2Zcv7WkrQY=,114.026535,22.56175,2016-06-13 00:45:06,0,1,False,True
3621426,c2Zcv7WkrQY=,114.042213,22.524767,2016-06-13 01:28:32,0,2,False,True
5663430,c2Zcv7WkrQY=,114.139297,22.69265,2016-06-13 02:14:55,0,3,False,True
8630171,c2Zcv7WkrQY=,114.155052,22.652184,2016-06-13 03:22:42,0,4,False,True
10745950,c2Zcv7WkrQY=,114.119186,22.543934,2016-06-13 04:11:09,0,5,False,True


In [77]:
air_starts = filter_data_by_gps(labeled_starts, with_pass=False)
print(len(air_starts))

There are  3656  taxi ids in this dataset!
There are  17272  GPS readings near the latitude of the airport
There are  4716  GPS readings near the airport!
There are  2346  taxi ids near the airport!
4716


In [57]:
good_starts.head(11)

Unnamed: 0,taxi_id,longitude,latitude,time,occupancy_status,route_number,route_start,route_end
23647507,c2Zcv7WkrQY=,113.809937,22.626568,2016-06-13 08:52:51,1,9,True,False
30431632,dLZ1aGYxi9I=,113.809799,22.626966,2016-06-13 11:30:02,1,54,True,False
505172,o3Aq6hjujQw=,113.809883,22.626667,2016-06-13 00:19:13,1,86,True,False
980734,IOJgX8YNDQI=,113.809814,22.626984,2016-06-13 00:29:04,1,147,True,False
7063945,OHB5fsrJRqo=,113.806847,22.6236,2016-06-13 02:47:04,1,202,True,False
61011948,OHB5fsrJRqo=,113.825569,22.615067,2016-06-13 22:06:19,1,228,True,False
26889872,7/2ApaP1AeU=,113.809731,22.627001,2016-06-13 10:12:58,1,248,True,False
30541077,4jOr+3IoFCM=,113.8097,22.627016,2016-06-13 11:32:57,1,293,True,False
42434439,4jOr+3IoFCM=,113.809814,22.626833,2016-06-13 15:33:02,1,302,True,False
60392773,iDsgaj4cyhQ=,113.807297,22.624367,2016-06-13 21:49:55,1,386,True,False


In [12]:
def filter_data_by_train_gps(df, with_pass=False):
    # Train Station in Shenzhen is 22.605502, 114.023724 and 22.613580, 114.034568.
    all_taxi_ids = df['taxi_id'].unique()
    print('There are ', len(all_taxi_ids), ' taxi ids in this dataset!')

    near_lat = df[(df['latitude'] >= 22.605502) & (df['latitude'] <= 22.613580)]
    print('There are ', len(near_lat), ' GPS readings near the latitude of the airport')

    near_airport = near_lat[(near_lat['longitude'] >= 114.023724) & (near_lat['longitude'] <= 114.034568)]

    print('There are ', len(near_airport), ' GPS readings near the airport!')
    taxi_ids = near_airport['taxi_id'].unique()
    print('There are ', len(taxi_ids), ' taxi ids near the airport!')

    if with_pass:
        with_pass = near_airport[near_airport['occupancy_status'] == 1]
        print('There are ', len(with_pass), ' GPS readings near the airport with a passenger!')
        with_pass_ids = with_pass['taxi_id'].unique()
        print('There are ', len(with_pass_ids), ' taxi ids near the airport with a passenger!')
        return with_pass
    else:
        return near_airport

In [78]:
train_ends = filter_data_by_train_gps(labeled_ends)
print(len(train_ends))

There are  3589  taxi ids in this dataset!
There are  5847  GPS readings near the latitude of the airport
There are  1691  GPS readings near the airport!
There are  766  taxi ids near the airport!
1691


In [79]:
route_end_numbers = train_ends['route_number'].unique()
route_start_numbers = air_starts['route_number'].unique()

In [80]:
air_to_train = list(set(route_start_numbers) & set(route_end_numbers))
print(len(air_to_train))

11


In [81]:
train_starts = filter_data_by_train_gps(labeled_starts)
air_ends = filter_data_by_gps(labeled_ends, with_pass=False)

print(len(train_starts))
print(len(air_ends))

There are  3656  taxi ids in this dataset!
There are  5885  GPS readings near the latitude of the airport
There are  1438  GPS readings near the airport!
There are  701  taxi ids near the airport!
There are  3589  taxi ids in this dataset!
There are  14641  GPS readings near the latitude of the airport
There are  4854  GPS readings near the airport!
There are  1738  taxi ids near the airport!
1438
4854


In [82]:
air_end_numbers = air_ends['route_number'].unique()
train_start_numbers = train_starts['route_number'].unique()

train_to_air = list(set(air_end_numbers) & set(train_start_numbers))
print(len(train_to_air))

40


In [86]:
with open('airport_to_train_route_numbers.txt', 'w') as f:
    for route_number in air_to_train:
        f.write("%s\n" % route_number)

In [87]:
with open('train_to_airport_route_numbers.txt', 'w') as f:
    for route_number in train_to_air:
        f.write("%s\n" % route_number)

In [108]:
weird = labeled[labeled['route_number'] == 11923]

In [109]:
weird.head()

Unnamed: 0,taxi_id,longitude,latitude,time,occupancy_status,route_number,route_start,route_end
55187187,DoTteigbAlw=,114.027298,22.6129,2016-06-13 20:11:57,1,11923,True,False
55214535,DoTteigbAlw=,114.026299,22.610701,2016-06-13 20:12:18,1,11923,False,False
55224090,DoTteigbAlw=,114.023903,22.6092,2016-06-13 20:12:38,1,11923,False,False
55001156,DoTteigbAlw=,113.812798,22.6231,2016-06-13 20:12:46,0,11923,False,True


In [4]:
base_path = os.getcwd()
full_path = base_path + '/' + 'labeled-routes-all.csv'

my_df = pd.read_csv(full_path)

In [5]:
my_df.head()

Unnamed: 0.1,Unnamed: 0,taxi_id,longitude,latitude,time,occupancy_status,route_number,route_start,route_end
0,0,c2Zcv7WkrQY=,114.09185,22.541468,2016-06-13 00:08:51,0,-1,False,False
1,15975,c2Zcv7WkrQY=,114.09185,22.541468,2016-06-13 00:09:11,0,-1,False,False
2,33494,c2Zcv7WkrQY=,114.09185,22.541468,2016-06-13 00:09:31,0,-1,False,False
3,49109,c2Zcv7WkrQY=,114.09185,22.541468,2016-06-13 00:09:51,0,-1,False,False
4,66824,c2Zcv7WkrQY=,114.09185,22.541468,2016-06-13 00:10:11,0,-1,False,False


In [6]:
my_df.describe()

Unnamed: 0.1,Unnamed: 0,longitude,latitude,occupancy_status,route_number
count,22911530.0,22911530.0,22911530.0,22911530.0,22911530.0
mean,32828860.0,113.9897,22.58785,0.2700111,19727.21
std,19284420.0,1.28975,0.1707699,0.4439652,40365.08
min,0.0,100.0,2.2628,0.0,-1.0
25%,16018730.0,113.8967,22.53968,0.0,-1.0
50%,32369770.0,114.0225,22.5662,0.0,-1.0
75%,49329440.0,114.0733,22.6194,1.0,13885.0
max,66936420.0,995.02,280.0,1.0,172592.0


In [8]:
labeled_starts = my_df[my_df['route_start'] == True]
print(len(labeled_starts))

labeled_ends = my_df[my_df['route_end'] == True]
print(len(labeled_ends))

173824
172591


In [13]:
air_starts = filter_data_by_gps(labeled_starts, with_pass=False)
print(len(air_starts))

train_ends = filter_data_by_train_gps(labeled_ends)
print(len(train_ends))

There are  3656  taxi ids in this dataset!
There are  17272  GPS readings near the latitude of the airport
There are  4716  GPS readings near the airport!
There are  2346  taxi ids near the airport!
4716
There are  3589  taxi ids in this dataset!
There are  5847  GPS readings near the latitude of the airport
There are  1691  GPS readings near the airport!
There are  766  taxi ids near the airport!
1691


In [14]:
route_end_numbers = train_ends['route_number'].unique()
route_start_numbers = air_starts['route_number'].unique()
air_to_train = list(set(route_start_numbers) & set(route_end_numbers))
print(len(air_to_train))

11


In [15]:
train_starts = filter_data_by_train_gps(labeled_starts)
air_ends = filter_data_by_gps(labeled_ends, with_pass=False)

print(len(train_starts))
print(len(air_ends))

There are  3656  taxi ids in this dataset!
There are  5885  GPS readings near the latitude of the airport
There are  1438  GPS readings near the airport!
There are  701  taxi ids near the airport!
There are  3589  taxi ids in this dataset!
There are  14641  GPS readings near the latitude of the airport
There are  4854  GPS readings near the airport!
There are  1738  taxi ids near the airport!
1438
4854


In [16]:
air_end_numbers = air_ends['route_number'].unique()
train_start_numbers = train_starts['route_number'].unique()

train_to_air = list(set(air_end_numbers) & set(train_start_numbers))
print(len(train_to_air))

40


In [26]:
# print(train_to_air)

In [27]:
# x = my_df[my_df['route_number'] == 64320]
# x.head()
# print(len(x))

In [28]:
# my_df.head()

In [29]:
train_to_air_route_df = my_df[my_df['route_number'].isin(train_to_air)]
train_to_air_route_df = train_to_air_route_df.drop(train_to_air_route_df.columns[0], axis=1)
train_to_air_route_df.to_csv('new-train-to-air-routes-no-grids.csv', encoding='utf-8')

In [30]:
air_to_air_train_df = my_df[my_df['route_number'].isin(air_to_train)]
air_to_air_train_df = air_to_air_train_df.drop(air_to_air_train_df.columns[0], axis=1)
air_to_air_train_df.to_csv('new-air-to-train-routes-no-grids.csv', encoding='utf-8')