In [None]:
import os
import math
import sys
import time
import copy
from tqdm import tqdm

import cv2

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

import seaborn as sns


In [None]:
"""
- configuration: column names, data location, target date and time window
- collect the files of the target date whose name falls inside the time
  window (currently "070000" to "100000")
"""
DATA_COLUMNS_NAME = ['taxiID', 'lng', 'lat', 'alt', 'time', 'azim', 'vel', 'gpstype', 'occ']

# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
DATA_BASEDIR = "/home/gskim/data/CE545 Taxi/data"

TARGET_DATE = "2018-04-02"
TARGET_DURATION = ["070000", "100000"]

# One sub-directory per date; its entries are listed in sorted name order.
taget_date_dir = os.path.join(DATA_BASEDIR, TARGET_DATE)
taget_date_files = sorted(os.listdir(taget_date_dir))

# The file name minus its last 4 characters (the extension, e.g. ".csv") is
# compared as a string against the window bounds. Both bounds are exclusive,
# so a file named exactly like a bound is NOT selected.
# NOTE(review): confirm that excluding the boundary files is intended.
window_start = TARGET_DURATION[0]
window_end = TARGET_DURATION[1]
taget_date_files_in_duration = [
    candidate
    for candidate in taget_date_files
    if window_start < candidate[:-4] < window_end
]

print(taget_date_files_in_duration)
print("\nThe number of files: ", len(taget_date_files_in_duration))

In [None]:
"""
- load every selected file of the time window
- merge them into one all-taxi data frame (df_all), without the gpstype column
"""


def _load_slot_file(slot_filename):
    """Read one file of the target date and relabel its columns with DATA_COLUMNS_NAME."""
    # NOTE(review): read_csv uses its default header handling here, so the
    # first line of each file is consumed as a header and then replaced by
    # DATA_COLUMNS_NAME. If the files carry no header row, that first record
    # is lost -- confirm against the raw data.
    slot_df = pd.read_csv(os.path.join(taget_date_dir, slot_filename))
    slot_df.columns = DATA_COLUMNS_NAME
    return slot_df


# Merge into a single data frame. The per-file frames only live inside the
# temporary list, so nothing has to be deleted afterwards to save memory.
df_all = pd.concat([_load_slot_file(slot_name) for slot_name in taget_date_files_in_duration])

# gpstype is not used by the rest of the notebook.
df_all = df_all.drop("gpstype", axis=1)

In [None]:
# Group the merged records per taxi. The following cells iterate over this
# grouped object; no prior sort by taxiID is required for the grouping itself.
taxi_grouped = df_all.groupby(by="taxiID")

# Cell output: for every taxi, the number of non-null values in each column.
taxi_grouped.count()

In [None]:
# Print every group key (taxi ID) found by the groupby, one per line.
for taxi_key in taxi_grouped.groups.keys():
    print(taxi_key)

In [None]:
# fig, ax = plt.subplots(figsize=(10, 10))
# ax = sns.lineplot(x="time", y="occ", data=taxi)
# plt.xlabel('passenger on and off graph ', fontsize=15)
# plt.show()


In [None]:
# Exploratory cell: tally the occupancy ("occ") transitions of all taxis.
#
# The previous version of this cell had `if` branches whose bodies were
# entirely commented out, which is a syntax error and stopped "Run All".
# The per-event printing is kept, but behind PRINT_EACH_EVENT.
#
# This cell uses the forward difference occ[i + 1] - occ[i]; the extraction
# cell further down uses occ[i] - occ[i + 1] with the opposite constant values.
# Assuming occ only takes the values 0 and 1, both cells label the same change
# (occ going 1 -> 0) as EVENT_OFF_TO_ON.
# NOTE(review): confirm which occ value means "passenger on board".
EVENT_OFF_TO_ON = -1
EVENT_ON_TO_OFF = 1

# Set to True to print one line per event (very verbose on a full time window).
PRINT_EACH_EVENT = False

taxi_id_dict = taxi_grouped.groups

num_off_to_on = 0
num_on_to_off = 0
for taxi_id in taxi_id_dict:
    taxi = taxi_grouped.get_group(taxi_id)
    taxi = taxi.sort_values(["time"], ascending=[True])
    taxi = taxi.reset_index(drop=True)

    taxi_np = taxi.values
    # The earliest record of every taxi is discarded before differencing.
    # NOTE(review): reason not visible in this notebook -- confirm it is intended.
    taxi_np = taxi_np[1:, :]
    # occ is the last column (gpstype was dropped from df_all).
    taxi_np_occ_diff = taxi_np[1:, -1] - taxi_np[:-1, -1]

    num_off_to_on += int(np.count_nonzero(taxi_np_occ_diff == EVENT_OFF_TO_ON))
    num_on_to_off += int(np.count_nonzero(taxi_np_occ_diff == EVENT_ON_TO_OFF))

    if not PRINT_EACH_EVENT:
        continue

    # Columns after dropping gpstype: 1 = lng, 2 = lat, 4 = time.
    # The reported row is the record just BEFORE the occ value changes.
    for occ_idx, occ_flag in enumerate(taxi_np_occ_diff):
        if occ_flag == EVENT_OFF_TO_ON:
            print("A passenger took the taxi (" + str(taxi_id) + ")",
                  "at time", taxi_np[occ_idx, 4],
                  "and (lng, lat) =", taxi_np[occ_idx, 1], taxi_np[occ_idx, 2])

        if occ_flag == EVENT_ON_TO_OFF:
            print("A passenger got off the taxi (" + str(taxi_id) + ")",
                  "at time", taxi_np[occ_idx, 4],
                  "and (lng, lat) =", taxi_np[occ_idx, 1], taxi_np[occ_idx, 2])

print(num_off_to_on, "passengers took a taxi")
print(num_on_to_off, "passengers got off a taxi")


In [None]:
# Extract the occupancy-transition records of every taxi.
#
# Results (both always defined after this cell, possibly with zero rows):
#   taxi_on_history  -- 2-D array, one row per EVENT_OFF_TO_ON transition
#   taxi_off_history -- 2-D array, one row per EVENT_ON_TO_OFF transition
# Each row is a full record in the column order of df_all; rows appear in the
# iteration order of taxi_grouped.groups and, per taxi, in ascending time.
#
# Changes compared with the earlier version of this cell:
#   * the progress print no longer reads taxi_on_history before it exists
#     (that raised NameError on a fresh kernel at for_idx == 0);
#   * event rows are selected per taxi with a boolean mask and concatenated
#     once at the end, instead of re-copying the whole history array for
#     every single event.

# Codes for the backward difference occ[i] - occ[i + 1] computed below.
EVENT_OFF_TO_ON = 1
EVENT_ON_TO_OFF = -1

taxi_id_dict = taxi_grouped.groups
num_columns = df_all.shape[1]


def _stack_event_rows(row_chunks, n_columns):
    """Concatenate per-taxi event rows; return an empty (0, n_columns) array if there are none."""
    if row_chunks:
        return np.concatenate(row_chunks, axis=0)
    return np.empty((0, n_columns))


taxi_on_chunks = []
taxi_off_chunks = []
num_on_events = 0
for for_idx, taxi_id in enumerate(tqdm(taxi_id_dict, mininterval=20)):

    if(for_idx % 10000 == 0):
        # Same information as the former taxi_on_history.shape progress print.
        print((num_on_events, num_columns))

    taxi = taxi_grouped.get_group(taxi_id)
    taxi = taxi.sort_values(["time"], ascending=[True])
    taxi = taxi.reset_index(drop=True)

    taxi_np = taxi.values
    # The earliest record of every taxi is discarded before differencing.
    # NOTE(review): reason not visible in this notebook -- confirm it is intended.
    taxi_np = taxi_np[1:, :]
    # occ is the last column; EVENT_OFF_TO_ON = 1, EVENT_ON_TO_OFF = -1
    taxi_np_occ_diff = taxi_np[:-1, -1] - taxi_np[1:, -1]

    # diff[i] compares record i with record i + 1, and the stored event row is
    # record i, i.e. the last record BEFORE the occ value changes.
    # NOTE(review): confirm that record i (not i + 1) is the one wanted.
    rows_before_change = taxi_np[:-1, :]
    on_mask = np.asarray(taxi_np_occ_diff == EVENT_OFF_TO_ON, dtype=bool)
    off_mask = np.asarray(taxi_np_occ_diff == EVENT_ON_TO_OFF, dtype=bool)

    if on_mask.any():
        on_rows = rows_before_change[on_mask]
        taxi_on_chunks.append(on_rows)
        num_on_events += len(on_rows)
    if off_mask.any():
        taxi_off_chunks.append(rows_before_change[off_mask])

taxi_on_history = _stack_event_rows(taxi_on_chunks, num_columns)
taxi_off_history = _stack_event_rows(taxi_off_chunks, num_columns)

# Former bookkeeping flags (1 once at least one event was stored); still set
# so that any code reading them keeps working.
taxi_on_history_start_flag = 1 if taxi_on_chunks else 0
taxi_off_history_start_flag = 1 if taxi_off_chunks else 0
                


In [None]:
# Save the extracted events as CSV files in the current working directory.
# The last column (occ) is dropped before writing.
taxi_on_history_ = taxi_on_history[:, :-1]
taxi_off_history_ = taxi_off_history[:, :-1]

# Shapes of the full (untrimmed) event arrays.
print(taxi_on_history.shape)
print(taxi_off_history.shape)

# Build the file-name tag from the configuration instead of hard-coding it, so
# the names always match the data that was processed. With the current config
# this is "20180402_070000_100000", i.e. the same file names as before.
output_tag = "_".join([TARGET_DATE.replace("-", "")] + list(TARGET_DURATION))

# np.savetxt is used with its default number format.
np.savetxt("taxi_on_" + output_tag + ".csv", taxi_on_history_, delimiter=",")
np.savetxt("taxi_off_" + output_tag + ".csv", taxi_off_history_, delimiter=",")
