## <CENTER><U> 1. IMPORTING BASIC NECESSARY PACKAGES</U></CENTER>

In [1]:
pip install geopy

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import time
from geopy.distance import geodesic

## <CENTER><U> 2. READING INPUT DATA </CENTER></U>

In [10]:
data = pd.read_csv("trajectory_data.csv")

In [11]:
data.isnull().sum()

People_Num             0
Time                   0
Travel Start Time      0
Travel End Time        0
Lat                    0
Lon                    0
Alt                    0
Transportation Mode    0
dtype: int64

In [12]:
data.duplicated().sum()

41046

In [13]:
data = data.drop_duplicates(keep='last').reset_index(drop=True)
data.duplicated().sum()

0

In [14]:
display(data.head(10))

Unnamed: 0,People_Num,Time,Travel Start Time,Travel End Time,Lat,Lon,Alt,Transportation Mode
0,104,2008-03-28 08:44:30,2008-03-28 08:42:00,2008-03-28 09:50:00,39.962098,116.301595,0.0,bus
1,104,2008-03-28 08:48:30,2008-03-28 08:42:00,2008-03-28 09:50:00,39.94827,116.303298,0.0,bus
2,104,2008-03-28 08:48:33,2008-03-28 08:42:00,2008-03-28 09:50:00,39.94822,116.303337,0.0,bus
3,104,2008-03-28 08:48:39,2008-03-28 08:42:00,2008-03-28 09:50:00,39.94812,116.303378,0.0,bus
4,104,2008-03-28 08:48:42,2008-03-28 08:42:00,2008-03-28 09:50:00,39.94811,116.303418,0.0,bus
5,104,2008-03-28 08:48:48,2008-03-28 08:42:00,2008-03-28 09:50:00,39.947997,116.30341,0.0,bus
6,104,2008-03-28 08:48:53,2008-03-28 08:42:00,2008-03-28 09:50:00,39.947952,116.303377,0.0,bus
7,104,2008-03-28 08:48:58,2008-03-28 08:42:00,2008-03-28 09:50:00,39.947902,116.303405,0.0,bus
8,104,2008-03-28 08:49:03,2008-03-28 08:42:00,2008-03-28 09:50:00,39.947767,116.303465,0.0,bus
9,104,2008-03-28 08:49:14,2008-03-28 08:42:00,2008-03-28 09:50:00,39.947393,116.303697,0.0,bus


In [17]:
display(data.tail(10))

Unnamed: 0,People_Num,Time,Travel Start Time,Travel End Time,Lat,Lon,Alt,Transportation Mode
2778049,153,2008-05-02 03:20:38,2008-05-02 03:19:30,2008-05-02 03:21:45,32.973062,114.025615,344.5,taxi
2778050,153,2008-05-02 03:20:40,2008-05-02 03:19:30,2008-05-02 03:21:45,32.973048,114.02579,344.5,taxi
2778051,153,2008-05-02 03:20:42,2008-05-02 03:19:30,2008-05-02 03:21:45,32.973038,114.025987,344.5,taxi
2778052,153,2008-05-02 03:20:44,2008-05-02 03:19:30,2008-05-02 03:21:45,32.97305,114.02621,344.5,taxi
2778053,153,2008-05-02 03:20:46,2008-05-02 03:19:30,2008-05-02 03:21:45,32.973093,114.026415,344.5,taxi
2778054,153,2008-05-02 03:20:48,2008-05-02 03:19:30,2008-05-02 03:21:45,32.97313,114.026578,347.8,taxi
2778055,153,2008-05-02 03:20:50,2008-05-02 03:19:30,2008-05-02 03:21:45,32.973163,114.026742,351.0,taxi
2778056,153,2008-05-02 03:20:52,2008-05-02 03:19:30,2008-05-02 03:21:45,32.973187,114.02689,354.3,taxi
2778057,153,2008-05-02 03:20:54,2008-05-02 03:19:30,2008-05-02 03:21:45,32.973197,114.02701,357.6,taxi
2778058,153,2008-05-02 03:20:56,2008-05-02 03:19:30,2008-05-02 03:21:45,32.973193,114.02712,360.9,taxi


In [18]:
data.dtypes

People_Num               int64
Time                    object
Travel Start Time       object
Travel End Time         object
Lat                    float64
Lon                    float64
Alt                    float64
Transportation Mode     object
dtype: object

In [19]:
data.shape

(2778059, 8)

In [20]:
start_time = time.time()

pd.set_option('display.max_columns', None)

pd.set_option('display.max_rows', None)
 
travel_count = 1

data['Travel Count'] = 1

data['Time'] = pd.to_datetime(data['Time'])

count = []

In [21]:
data.dtypes

People_Num                      int64
Time                   datetime64[ns]
Travel Start Time              object
Travel End Time                object
Lat                           float64
Lon                           float64
Alt                           float64
Transportation Mode            object
Travel Count                    int64
dtype: object

## <CENTER><U>3. DATA PRE-PROCESSING 

### Remove records which suggest Different modes during same time period

In [22]:
# For each time point of track data, if it corresponds to multiple modes of transportation, all of them will be deleted

for i in range(len(data)-1):

    if (data['Time'][i + 1] - data['Time'][i]).seconds == 0:

        count.append(i)

        count.append(i+1)

data = data.drop(count).reset_index(drop=True)

count.clear()

In [23]:
data.shape

(2441281, 9)

### Remove abnormal latitude and longitude

In [24]:
# Delete data with abnormal latitude and longitude

for i in range(len(data)):

    if data['Lat'][i] > 90 or data['Lat'][i] < 0 or data['Lon'][i] > 180 or data['Lon'][i] < -180:

        count.append(i)

data = data.drop(count).reset_index(drop=True)

In [25]:
data.shape

(2441281, 9)

### Splitting travel segments

In [26]:
for i in range(len(data)):
    data['Travel Count'].at[i] = travel_count   
    if (i != len(data)-1):        
        if ((data['Travel Start Time'][i] != data['Travel Start Time'][i+1]) or ((data['Time'][i + 1] - data['Time'][i]).seconds > 1800)): 
                
                travel_count = travel_count + 1
 
result = pd.value_counts(data['Travel Count'])

In [27]:
end_time = time.time()
 
print(f'Time taken for processing the data:\t {end_time - start_time}')

Time taken for processing the data:	 277.8717129230499


## <center> <u> 4. ADDITIONAL FEATURE CALCULATION

In [28]:
time_gap = [] 

distance_gap = []

speed = []

acceleration = []

total_time = []

total_time_count = 0

total_distance = []

total_distance_count = 0

In [29]:
i = 0
count = []
for i in range(0,len(data)-1):
    if (data['Travel Count'][i] == data['Travel Count'][i+1])  and ((data['Time'][i+1] - data['Time'][i]).seconds < 50):
        count.append(i+1)
        data['Time'].at[i+1] = data['Time'].at[i]
data = data.drop(count).reset_index(drop=True)

In [30]:
for i in range(len(data)-1):

    if data['Travel Count'][i] == data['Travel Count'][i+1]:
        time_gap.append((data['Time'][i + 1] - data['Time'][i]).seconds)
        distance_gap.append(geodesic((data['Lat'][i], data['Lon'][i]), (data['Lat'][i + 1], data['Lon'][i + 1])).m)
    else:
        time_gap.append('N.A')
        distance_gap.append('N.A')
        
time_gap.append('NA')
distance_gap.append('N.A')

In [31]:
for i in range(len(data)-1):

    if time_gap[i] != 'N.A':

        speed.append(round(distance_gap[i]/time_gap[i], 2))

    else:

        speed.append('N.A')
        
speed.append('N.A')

In [32]:
for i in range(len(data)-1):

    if speed[i] != 'N.A' and speed[i+1] != 'N.A':

        acceleration.append(round(((speed[i+1]-speed[i])/time_gap[i]), 2))

    else:

        acceleration.append('N.A')
 

acceleration.append('N.A')

In [33]:
for i in range(len(data)-1):

    if time_gap[i] != 'N.A':

        total_time_count = total_time_count + time_gap[i]

        total_time.append('N.A')

        total_distance_count = total_distance_count + distance_gap[i]

        total_distance.append('N.A')

    else:

        total_time.append(total_time_count)

        total_distance.append(total_distance_count)

        total_time_count = 0

        total_distance_count = 0
        
total_time.append('N.A')
total_distance.append('N.A')

In [34]:
data['Time Gap(s)'] = time_gap

data['Distance(m)'] = distance_gap

data['Speed(m/s)'] = speed

data['Acceleration(m/s^2)'] = acceleration

data['Total Time(s)'] = total_time

data['Total Distance(m)'] = total_distance

In [35]:
data.to_csv("Data processing/feature_caclculated.csv")