# Imports

In [1]:
### IMPORTS

import ast
import requests

import os


import pandas as pd
from datetime import datetime as dt
from datetime import timedelta


import cv2
import numpy as np
import matplotlib.pyplot as plt


import warnings

import itertools
from tqdm.notebook import tqdm


# import glob
# import shutil



# Constants

In [2]:
### CONSTANTS

# CSV "database" files: shared root folder plus one filename per database
DATABASE_PATH_ROOT = "../production/database/"
LINKS_DB_FILENAME = "links_db.csv"
IMG_PATH_DB_FILENAME = "img_path_db.csv"
VEHICLE_COUNT_DB_FILENAME = "vehicle_count_db.csv"

# image download: URL prefix stripped from / prepended to stored links, and the local download root
IMG_LINK_PREFIX = "https://images.data.gov.sg/api/traffic-images/"
IMAGES_PATH_ROOT = "../production/images/"

# image processing: YOLOv7 model files, per-camera mask folder and annotated-image output root
YOLO_DNN_WEIGHTS_PATH = "../dnn_model/yolov7.weights"
YOLO_DNN_CFG_PATH = "../dnn_model/yolov7.cfg"
IMAGE_MASK_PATH_ROOT = "../production/image_masks/"
OUTPUT_IMAGES_PATH_ROOT = "../production/processed_images/"

In [3]:
### LIST OF IMPROVEMENTS
# - use blaze to load up data faster
# - make a bulk download and bulk inference module to catch up

# Links Downloads

In [4]:
### LINKS DOWNLOADS

def call_lta_api(datetime_call):
    '''
    This function calls the LTA traffic images API for a certain datetime and returns a
    single-row dataframe with the image timestamp as index and the camera_ids as columns.
    Each cell holds the image link with IMG_LINK_PREFIX removed.

    Parameters:
        datetime_call: datetime to query; only year to minute are sent, seconds are always "00"

    Returns:
        pd.DataFrame with one row (cameras 1001-1006 are excluded)

    Raises:
        requests.HTTPError: if the API answers with an error status
        ValueError: if the API response contains no cameras
    '''

    # getting the api call
    # (requests percent-encodes the ':' of the time as %3A, giving the same URL as a hand-built one)
    response = requests.get('https://api.data.gov.sg/v1/transport/traffic-images',
                            params={'date_time': datetime_call.strftime("%Y-%m-%dT%H:%M:00")},
                            timeout=30)
    response.raise_for_status() # fail loudly on HTTP errors instead of trying to parse an error page

    # reading the camera data from data.gov.sg
    # the body is JSON, so it is parsed as JSON (a Python-literal parser fails on true/false/null)
    list_of_camera_info = response.json()["items"][0]["cameras"]

    # one single-element series per camera: value = shortened link, index = timestamp, name = camera ID
    camera_columns = [
        pd.Series(item['image'].replace(IMG_LINK_PREFIX, ''), # removing the IMG_LINK_PREFIX to save space
                  index=[pd.to_datetime(item['timestamp']).replace(tzinfo=None)], # timezone information removed
                  name=item['camera_id']) # the name becomes the column name in the database
        for item in list_of_camera_info
    ]

    if not camera_columns:
        raise ValueError(f'no cameras returned by the API for {datetime_call}')

    # concatenating all cameras in one go (rows are aligned on the timestamp index)
    output = pd.concat(camera_columns, axis=1)

    # dropping these cameras as they have been shown to have problems with having different polling time compared to the other cams
    # errors='ignore' keeps the call working when some of them are missing from the response
    output = output.drop(['1001','1002','1003','1004','1005','1006'], axis=1, errors='ignore')

    # checking if there are any asynchronous camera links (i.e.: camera links occur at more than one timestamp, resulting in multiple rows/timestamps for one call)
    is_asynchronous = output.isna().sum().sum() > 0

    if is_asynchronous:
        output = (output.bfill().ffill() # fill all rows (timecode) in each column (camera) with the non-null value in that column
                  .sort_index(ascending=False).iloc[[0]]) # then condense the whole dataframe to one row by selecting the latest timecode

    # returning the output
    return output



def download_links(datetime_call):
    '''
    Makes sure the links_db holds a row of image links for the requested datetime.

    The links_db csv is loaded in full; when no row falls inside the five minutes leading
    up to (and including) datetime_call, the links are fetched from the LTA API, appended,
    the table is re-sorted by timestamp and written back to disk.
    When a row is already there, nothing is downloaded or saved.

    Returns the (possibly updated) links_db dataframe in both cases.

    ### NOTES:
    Loading and re-sorting the whole csv on every call gets slower as the links_db grows;
    the sort is only needed for out-of-order (historical) calls.
    '''
    links_db_path = DATABASE_PATH_ROOT + LINKS_DB_FILENAME

    # read the stored links and turn the index back into datetimes
    stored_links = pd.read_csv(links_db_path, index_col=0)
    stored_links.index = pd.to_datetime(stored_links.index)

    # nothing to do when the five-minute window already has an entry
    window_start = datetime_call - timedelta(minutes=5)
    if not stored_links.loc[window_start:datetime_call].empty:
        return stored_links

    # fetch the new row, append it, keep the table ordered by timestamp
    fetched_row = call_lta_api(datetime_call)
    stored_links = pd.concat([stored_links, fetched_row], axis=0)
    stored_links = stored_links.sort_index()

    # persist the updated table
    stored_links.to_csv(links_db_path)

    return stored_links

# Images Downloads

In [5]:
### IMAGES DOWNLOADS

def get_image_link(cam_id_call,datetime_call):
    '''
    This function takes the camera_id and date_time to be called and outputs the download link,
    filename and folder path for the image file.
    The link is looked up in the links_db within the five minutes up to datetime_call; if no link
    is present (because it's not been downloaded yet), it will attempt to download the links first.

    Returns:
        (img_link, img_filename, img_path)
        img_link     - link as stored in the links_db (i.e. without IMG_LINK_PREFIX)
        img_filename - "<cam_id>-<YYYY_MM_DD_HH_MM>.jpg"
        img_path     - "<YYYY_MM_DD>/<cam_id>/"

    Raises:
        KeyError: if the camera_id is not a column of the links_db
        LookupError: if there is still no link for the window after the download attempt

    ### NOTES:
    This function loads up the entire links_db during its function call, a more efficient system would involve a SQL database
    '''
    # LOADING DATABASE
    # loads the links database from csv
    links_db_df = pd.read_csv(DATABASE_PATH_ROOT+LINKS_DB_FILENAME,index_col=0)
    links_db_df.index = pd.to_datetime(links_db_df.index) # converting the index to datetime

    cam_id_call = str(cam_id_call)

    # the lookup window is computed once and reused for every query below
    window = slice(datetime_call-timedelta(minutes=5), datetime_call)

    # CHECKING (AND DOWNLOADING) LINKS
    # missing (NaN) cells are dropped so that an empty cell is treated as "no link"
    links_in_window = links_db_df.loc[window,cam_id_call].dropna()

    # downloads the links if the link is empty
    if links_in_window.empty:
        links_db_df = download_links(datetime_call=datetime_call)
        links_in_window = links_db_df.loc[window,cam_id_call].dropna()

    if links_in_window.empty:
        raise LookupError(f'no image link for camera {cam_id_call} in the 5 minutes up to {datetime_call}')

    # GETTING THE IMAGE LINK AND ITS TIMESTAMP
    # .iloc is used because the series is indexed by timestamps, so [0] must be positional
    img_link = links_in_window.iloc[0]
    img_timestamp = links_in_window.index[0]

    # GETTING THE IMAGE FILENAME AND PATH
    # filename: camera_id and timestamp joined with a dash '-', plus the .jpg filetype
    img_filename = f"{cam_id_call}-{img_timestamp.strftime('%Y_%m_%d_%H_%M')}.jpg"

    # path: date folder, then camera_id folder, with a trailing slash
    img_path = f"{img_timestamp.strftime('%Y_%m_%d')}/{cam_id_call}/"

    return img_link, img_filename, img_path



def download_image_from_link(img_link, img_filename, img_path):
    '''
    This function downloads the image from the data.gov api based on its link and puts it in the proper filepath and filename

    Parameters:
        img_link: link without IMG_LINK_PREFIX (the prefix is added here)
        img_filename: name of the file to write
        img_path: folder, relative to IMAGES_PATH_ROOT, that receives the file

    Raises:
        requests.HTTPError: if the server answers with an error status (nothing is written in that case)
    '''

    # getting full image link
    full_link = IMG_LINK_PREFIX+img_link

    # getting full image download folder
    target_dir = IMAGES_PATH_ROOT+img_path

    # getting the file from the url
    r = requests.get(full_link, allow_redirects=True, timeout=30)
    r.raise_for_status() # do not save an error page as if it were an image

    # create folder if it doesn't exist (works with or without a trailing slash on img_path)
    os.makedirs(target_dir, exist_ok=True)

    # combining the path and filename to get the full path (no doubled separator)
    full_path = os.path.join(target_dir, img_filename)

    # writing the file to the path
    with open(full_path, 'wb') as f:
        f.write(r.content)
        
        
def image_downloader(cam_id_call,datetime_call):
    '''
    This function takes the cam_id and timestamp and attempts to download the traffic image using links from links_db and save the path to img_path_db
    The function will first check if the called datetime and cam_id is already available in the img_path_db, if so, it will skip the download

    Returns:
        the updated img_path_db dataframe, or None when the image was already downloaded

    Raises:
        ValueError: if cam_id_call is not a column of the img_path_db

    ### NOTES:
    This function loads up the entire img_path_db during its function call, a more efficient system would involve a SQL database

    Ideally, this function will only be called sequentially (i.e.: only called once every 5 minutes, and no historical calls), this is to make sure that the img_path_db is always sorted
    However, for simplicity purposes, the img_path_db dataframe is sorted at the end of the function, which is inefficient as the img_path_db gets larger
    '''
    # LOADING DATABASE
    # loads the img_path database from csv
    img_path_db_df = pd.read_csv(DATABASE_PATH_ROOT+IMG_PATH_DB_FILENAME,index_col=0)
    img_path_db_df.index = pd.to_datetime(img_path_db_df.index) # converting the index to datetime


    # CHECKING CAMERA ID
    # converts the cam_id_call to a string for indexing
    cam_id_call = str(cam_id_call)

    # checks if cam_id_call is part of the available camera
    if cam_id_call not in img_path_db_df.columns:
        raise ValueError("No such camera ID") # throws error if there is no such camera ID


    # CHECKING IF IMAGE IS ALREADY PRESENT FROM THE img_path_db
    is_img_path_absent = img_path_db_df.loc[datetime_call-timedelta(minutes=5):datetime_call,cam_id_call].dropna().empty

    # skipping out of this function if the img_path is NOT absent (i.e.: if image is already downloaded)
    if not is_img_path_absent:
        return None

    # GETTING THE IMAGE FROM THE API AND SAVING THE PATH TO img_path_db
    # getting the img link, filename, and path from the get_image_link function
    img_link, img_filename, img_path = get_image_link(cam_id_call=cam_id_call,datetime_call=datetime_call)

    # downloading the image
    download_image_from_link(img_link, img_filename, img_path)

    # getting the image timestamp from the filename ("<cam_id>-<YYYY_MM_DD_HH_MM>.jpg")
    # everything after the last dash is parsed, so the length of the camera id does not matter
    img_timestamp = dt.strptime(img_filename.rsplit('-', 1)[1], "%Y_%m_%d_%H_%M.jpg")

    # getting the image full path
    img_full_path = img_path + img_filename

    # adding the full path to the img_path_db
    img_path_db_df.loc[img_timestamp,cam_id_call] = img_full_path

    # sorting the dataframe
    ## WILL BE SKIPPED FOR WHEN USING THE SCHEDULER
    img_path_db_df = img_path_db_df.sort_index()

    # saving the img_path_db
    img_path_db_df.to_csv(DATABASE_PATH_ROOT+IMG_PATH_DB_FILENAME)

    # additionally returns the updated img_path_db_df (for when this function is called to update the database)
    return img_path_db_df

# Vehicle Detection

In [6]:
### VEHICLE COUNTER

class VehicleDetector:
    '''
    This class is used to contain the vehicle detector using the pretrained YOLOv7
    Using self.classes_allowed, the user can filter which types of objects (or vehicles) are detected
    '''

    # display names for the class ids stored by get_bounding_box (detector class id PLUS ONE)
    OBJECT_NAMES = {2:'bicycle',3:'car',4:'motorcycle',6:'bus',8:'truck'}

    def __init__(self):
        # initialize the class by loading the pre-trained model and setting the allowable classes

        # Load DNN from pre-trained model
        net = cv2.dnn.readNet(YOLO_DNN_WEIGHTS_PATH, YOLO_DNN_CFG_PATH)

        # setup model and parameters
        self.model = cv2.dnn_DetectionModel(net)
        self.model.setNmsAcrossClasses(True) # setting so that the NMS applies across different classes (if not will simul. detect car and truck)
        self.model.setInputParams(size=(832, 832), scale=1 / 255)

        # Allow classes containing Vehicles only
        self.classes_allowed = [1, 2, 3, 5, 7] # classes are same as COCO class, but SUBTRACT BY ONE,
        # i.e.: {1:'bicycle', 2:'car',3:'motorcycle', 5:'bus', 7:'truck'}

    def get_bounding_box(self, img):
        '''
        This function takes an image and returns the bounding boxes as a list of dictionaries
        with the keys 'class_id' (detector class id plus one), 'score' and 'box'
        Only objects whose class id is in self.classes_allowed are returned
        If img is None, the list ['image_error!'] is returned instead
        '''

        # a none-type image (could be because of image error) is reported with a marker value
        if img is None:
            return ['image_error!']

        # Detect Objects
        class_ids, scores, boxes = self.model.detect(img,
                                                     nmsThreshold=0.5, # NMS threshold --> higher = more close boxes together
                                                     confThreshold=0.15)

        # ids and scores are flattened so that each entry is a scalar
        # NOTE(review): some OpenCV versions appear to return them as N x 1 columns -- confirm
        class_ids = np.asarray(class_ids).reshape(-1)
        scores = np.asarray(scores).reshape(-1)

        # keep only the objects within the allowed classes
        vehicle_boxes = []
        for class_id, score, box in zip(class_ids, scores, boxes):
            if class_id in self.classes_allowed:
                vehicle_boxes.append({'class_id':int(class_id)+1,
                                      'score':float(score),
                                      'box':box})

        return vehicle_boxes


    def preprocess(self, img, mask_path=None):
        '''
        This is a helper function to preprocess the image given a mask
        In this particular instance, no further preprocessing was implemented,
        but in theory, sharpening or contrast correction could be added here to help the image detection algorithm

        If no mask is specified, an unchanged copy of the image is returned (any image size)
        Raises ValueError if the mask file cannot be loaded
        '''
        # no mask means that every pixel passes, so the result is simply a copy of the image
        if mask_path is None:
            return img.copy()

        # load the pre-defined mask from directory
        mask = cv2.imread(mask_path)
        if mask is None: # cv2.imread returns None instead of raising when the file cannot be read
            raise ValueError(f'could not load mask image {mask_path}')
        # NOTE(review): cv2.imread yields BGR channel order; the RGB2GRAY constant is kept unchanged here
        mask = cv2.cvtColor(mask,cv2.COLOR_RGB2GRAY)

        # masking image using the pre-defined mask
        return cv2.bitwise_or(img,img,mask=mask)


    def process_image(self, img, mask_path=None):
        '''
        This function returns the processed image and total vehicle count given an image and a mask_path (used for masking the camera to only the ROI)
        The returned image is a copy of the input with the bounding boxes, labels and the total count drawn on it
        If the preprocessing/masking fails, a warning is raised and the detection runs on the unmasked image

        Raises ValueError if img is None (i.e.: the image failed to load)
        '''
        # stop early if the image failed to load, there is nothing to detect or draw on
        if img is None:
            raise ValueError('error in loading image')

        # create a clean copy (without masking or preprocessing) to be outputed later with the bounding boxes
        output_img = img.copy()

        # PREPROCESSING
        # attempt to preprocess and mask the image
        try:
            img = self.preprocess(img=img,
                                  mask_path=mask_path)
        # if masking fails (due to absence of mask or other things) use the original image
        except Exception:
            img = output_img
            warnings.warn("Warning: Image Preprocessing Error")

        # DETECTING VEHICLES
        # img is never None at this point, so a list of box dictionaries is returned
        vehicle_boxes = self.get_bounding_box(img)

        # counting number of vehicles
        vehicle_count = len(vehicle_boxes)

        # DRAWING BOUNDING BOXES
        for vehicle_box in vehicle_boxes:
            x, y, w, h = (int(v) for v in vehicle_box['box'])
            label = f"{self.OBJECT_NAMES[vehicle_box['class_id']]} | {vehicle_box['score']:.2f}"

            cv2.rectangle(output_img, (x, y), (x + w, y + h), (25, 0, 180), 3)
            cv2.putText(output_img, label, (x, y + h), 0, 1, (255, 255, 255), 1)

        cv2.putText(output_img, "Vehicle count: " + str(vehicle_count), (20, 50), 0, 2, (100, 200, 0), 3)

        return output_img, vehicle_count

In [7]:
def vehicle_count(cam_id_call, datetime_call, detector=None):
    '''
    This function takes in the camera_id and datetime and runs a vehicle detection on the corresponding image
    If the image is not yet downloaded, the function will attempt to download the image
    The annotated image is written under OUTPUT_IMAGES_PATH_ROOT and the count is saved to the vehicle_count_db
    If a count is already stored for the five minutes up to datetime_call, the detection is skipped

    Parameters:
        cam_id_call: camera id (converted to a string for indexing)
        datetime_call: datetime to process
        detector: optional VehicleDetector to reuse; when omitted a new one is created on every call

    Returns:
        None (results are written to disk)

    Raises:
        ValueError: if the camera id is unknown or the image file cannot be read
        LookupError: if no image path is stored for the window even after the download attempt
    '''
    # LOADING DATABASE
    # loads the img_path database from csv
    img_path_db_df = pd.read_csv(DATABASE_PATH_ROOT+IMG_PATH_DB_FILENAME,index_col=0)
    img_path_db_df.index = pd.to_datetime(img_path_db_df.index) # converting the index to datetime

    # loads the vehicle_count database from csv
    vehicle_count_db_df = pd.read_csv(DATABASE_PATH_ROOT+VEHICLE_COUNT_DB_FILENAME,index_col=0)
    vehicle_count_db_df.index = pd.to_datetime(vehicle_count_db_df.index) # converting the index to datetime


    # CHECKING CAMERA ID
    # converts the cam_id_call to a string for indexing
    cam_id_call = str(cam_id_call)

    # checks if cam_id_call is part of the available camera
    if cam_id_call not in img_path_db_df.columns:
        raise ValueError("No such camera ID") # throws error if there is no such camera ID

    # the lookup window is computed once and reused for every query below
    window = slice(datetime_call-timedelta(minutes=5), datetime_call)


    # CHECKING IF IMAGE HAS BEEN DOWNLOADED FROM THE img_path_db
    is_img_path_absent = img_path_db_df.loc[window,cam_id_call].dropna().empty

    # downloads the image if the image is absent
    if is_img_path_absent:
        img_path_db_df = image_downloader(cam_id_call=cam_id_call, datetime_call=datetime_call)


    # CHECKING IF IMAGE HAS BEEN PROCESSED FROM THE vehicle_count_db
    is_vehicle_count_absent = vehicle_count_db_df.loc[window,cam_id_call].dropna().empty

    # skips the vehicle count if image has already been processed (i.e.: NOT absent)
    if not is_vehicle_count_absent:
        return None


    # GETTING THE IMAGE PATH FROM img_path_db
    # empty cells are dropped and .iloc is used, as the series is indexed by timestamps
    paths_in_window = img_path_db_df.loc[window,cam_id_call].dropna()
    if paths_in_window.empty:
        raise LookupError(f'no image path for camera {cam_id_call} in the 5 minutes up to {datetime_call}')
    img_path = paths_in_window.iloc[0]

    # getting the image timestamp from the filename ("<cam_id>-<YYYY_MM_DD_HH_MM>.jpg")
    # everything after the last dash is parsed, so the length of the camera id does not matter
    img_filename = os.path.basename(img_path)
    img_timestamp = dt.strptime(img_filename.rsplit('-', 1)[1], "%Y_%m_%d_%H_%M.jpg")

    # VEHICLE DETECTION
    # reuse the given detector, otherwise load the Vehicle Detector class (this loads the model files)
    vd = detector if detector is not None else VehicleDetector()

    # read the image from path
    img = cv2.imread(IMAGES_PATH_ROOT+img_path)
    if img is None: # cv2.imread returns None instead of raising when the file cannot be read
        raise ValueError(f'error in loading image {img_filename}')

    # obtain mask_path from cam_id
    mask_path = IMAGE_MASK_PATH_ROOT+cam_id_call+'.jpg'

    # getting the processed image and vehicle count from the process_image function
    output_img, n_vehicles = vd.process_image(img=img,mask_path=mask_path)

    # saving processed image
    img_path_out = OUTPUT_IMAGES_PATH_ROOT + img_path # getting the output image path
    os.makedirs(os.path.dirname(img_path_out), exist_ok=True) # create folder if doesn't exist
    cv2.imwrite(img_path_out, output_img) # writing the image to the output folder

    # SAVING VEHICLE_COUNT TO DATABASE
    # adding the vehicle count to the vehicle_count_db
    vehicle_count_db_df.loc[img_timestamp,cam_id_call] = int(n_vehicles)

    # sorting the dataframe
    ## WILL BE SKIPPED FOR WHEN USING THE SCHEDULER
    vehicle_count_db_df = vehicle_count_db_df.sort_index()

    # saving the vehicle_count_db
    vehicle_count_db_df.to_csv(DATABASE_PATH_ROOT+VEHICLE_COUNT_DB_FILENAME)
    
    # # displaying the image (only for debugging)
    # plt.figure(figsize=(20,20))
    # plt.imshow(cv2.cvtColor(output_img,cv2.COLOR_BGR2RGB))

In [8]:
# vehicle_count(cam_id_call=1702, datetime_call=dt(2022,11,12,18,20))

# Catch Up Modules

In [None]:
### CATCH UP SPECIFIC TIME
### SPECIFIC CAMERA

# cam_ids = [1702,2705,2706,3702,3793,3797,4702,4706,4708,4799,5795,6704,6708,6710,6715,7793,7794,7797,8701,8704,9706]
cam_ids = [6708,2706,6710,4702,1702,3793] # selected cameras
cam_id_list = cam_ids


dt_start = dt(2022,11,21,0,0)
dt_end = dt(2022,11,28,0,0)
# dt_end = dt.now()
dt_resolution_mins = 10
num_of_observations = round((dt_end-dt_start)/timedelta(minutes=dt_resolution_mins))
# timestamps are generated backwards from dt_end (dt_end itself is included, dt_start is not)
dt_list = [dt_end - timedelta(minutes=x*dt_resolution_mins) for x in range(num_of_observations)]

combo_list = list(itertools.product(cam_id_list,dt_list))
combo_list_pbar = tqdm(combo_list)

for cam_id, dt_call in combo_list_pbar:
    try:
        vehicle_count(cam_id_call=cam_id, datetime_call=dt_call)
    # only ordinary errors are skipped, so that interrupting the kernel still stops the loop;
    # the reason is printed so that a failure can be diagnosed afterwards
    except Exception as err:
        print(f'error in processing {cam_id} at {dt_call.strftime("%Y-%m-%d %H-%M")}: {err!r}')

  0%|          | 0/6048 [00:00<?, ?it/s]

error in processing 6708 at 2022-11-27 10-40
error in processing 6708 at 2022-11-27 01-10
error in processing 6708 at 2022-11-26 14-30
error in processing 6708 at 2022-11-26 08-30
error in processing 6708 at 2022-11-26 01-10
error in processing 6708 at 2022-11-25 01-10
error in processing 6708 at 2022-11-24 06-10
error in processing 6708 at 2022-11-24 01-10
error in processing 6708 at 2022-11-23 18-40
error in processing 6708 at 2022-11-23 01-10
error in processing 6708 at 2022-11-22 01-10
error in processing 6708 at 2022-11-21 18-50
error in processing 6708 at 2022-11-21 11-20
error in processing 6708 at 2022-11-21 01-10
error in processing 2706 at 2022-11-27 10-40
error in processing 2706 at 2022-11-27 01-10
error in processing 2706 at 2022-11-26 08-30
error in processing 2706 at 2022-11-26 01-10


In [26]:
# ### CATCH UP TODAY
# ### ALL CAMERAS


# # cam_ids = [1702,2705,2706,3702,3793,3797,4702,4706,4708,4799,5795,6704,6708,6710,6715,7793,7794,7797,8701,8704,9706]
# cam_ids = [6708,2706,6710,4702,1702,3793] # selected cameras
# cam_id_list = cam_ids


# dt_start = dt.now().replace(hour=17, minute=0,second=0)
# dt_end = dt.now()
# # dt_end = dt.now()
# dt_resolution_mins = 10
# num_of_observations = round((dt_end-dt_start)/timedelta(minutes=dt_resolution_mins))
# dt_list = [dt_end - timedelta(minutes=x*dt_resolution_mins) for x in range(num_of_observations)]

# combo_list = list(itertools.product(cam_id_list,dt_list))
# combo_list_pbar = tqdm(combo_list)

# for cam_id, dt_call in combo_list_pbar:
#     try: vehicle_count(cam_id_call=cam_id, datetime_call=dt_call)
#     except: print(f'error in processing {cam_id} at {dt_call.strftime("%Y-%m-%d %H-%M")}')

  0%|          | 0/30 [00:00<?, ?it/s]

error in processing 6708 at 2022-12-01 17-37
error in processing 6708 at 2022-12-01 17-07
error in processing 2706 at 2022-12-01 17-37
error in processing 2706 at 2022-12-01 17-07
error in processing 6710 at 2022-12-01 17-37
error in processing 6710 at 2022-12-01 17-07
error in processing 4702 at 2022-12-01 17-37
error in processing 4702 at 2022-12-01 17-07
error in processing 1702 at 2022-12-01 17-37
error in processing 1702 at 2022-12-01 17-07
error in processing 3793 at 2022-12-01 17-37
error in processing 3793 at 2022-12-01 17-07


In [10]:
# approx 4 hours of active time per day in amazon sagemaker

# Constant Update Module

In [20]:
# cam_id_list = [6708,2706,6710,4702,1702,3793]
# dt_list = [dt.now()]

# combo_list = list(itertools.product(cam_id_list,dt_list))
# combo_list_pbar = tqdm(combo_list)

# for cam_id, dt_call in combo_list_pbar:
#     try: vehicle_count(cam_id_call=cam_id, datetime_call=dt_call)
#     except: print(f'error in processing {cam_id} at {dt_call.strftime("%Y-%m-%d %H-%M")}')

  0%|          | 0/6 [00:00<?, ?it/s]

# Generate Predictions

In [None]:
def generate_predictions(cam_id, starting_datetime_for_training_week):
    '''
    Placeholder for the prediction step, not implemented yet
    Both arguments are accepted but unused; the function does nothing and returns None
    '''
    return None

In [25]:
def interpret_predictions(cam_id, week):
    '''
    Placeholder for interpreting the predictions, not implemented yet
    Both arguments are accepted but unused; the function does nothing and returns None
    '''
    return None

# Generate Traffic Metadata