# Process Overview for Downloading Traffic Images

As previously mentioned, the data.gov.sg API only returns links to the traffic images (which are stored on a static server). Therefore, the next step is to use the previously scraped image links to download the traffic images.

The following are a few examples of the traffic images:
![image](../images/notebook_images/traffic_images_samples.jpg)

In this notebook, we will be downloading the traffic images from selected cameras (22) for the entire month of October, as well as the first two weeks of November. The cameras are selected based on varying locations, spanning all across Singapore.

The total number of images downloaded for this study is 195,093 images.

# Imports

In [1]:
import datetime
import pytz
import requests
import ast
import numpy as np
import urllib 
import shutil
import matplotlib.pyplot as plt
import pandas as pd
import os
import glob

from tqdm import tqdm

# Reading links dataframe from storage

We will first load the image links from the .csv file obtained in the previous notebook

In [2]:
# load the scraped image links; the 'timestamp' column is parsed into datetimes while reading
links_csv_path = '../data/LTA_traffic_cam_20221001-20221031.csv'
df = pd.read_csv(links_csv_path, parse_dates=['timestamp'])

# remove timezone info from the timestamps
naive_timestamps = df['timestamp'].dt.tz_localize(None)
df['timestamp'] = naive_timestamps

# show the loaded dataframe as the cell output
df

Unnamed: 0,timestamp,image,camera_id,md5
0,2022-09-30 23:57:15,2022/09/ccfae87c-d330-4ccd-a1e7-a0c94062f94b.jpg,1001,6b7cf877b837a2c584d69cc989e2791c
1,2022-09-30 23:57:15,2022/09/b95185f7-69a8-4fbf-9ed9-69838b4eb101.jpg,1002,fce5e26fa7899b7ecb7757b2998baaae
2,2022-09-30 23:57:15,2022/09/22786a35-99ed-468a-9562-968cb64aecd0.jpg,1003,0e4cca557d25c51459d866ce51f958bb
3,2022-09-30 23:57:15,2022/09/c084a08b-c7c8-4eed-8adb-e982792cfdbe.jpg,1004,d4e4ea3ca7060440d154f75b33c74b6c
4,2022-09-30 23:57:15,2022/09/86850290-63ec-4b24-819a-5c8cd57497d7.jpg,1005,8e381cc6bb4eb77d7c2028796ff5efbf
...,...,...,...,...
771910,2022-10-31 23:52:25,2022/10/72b7c3a8-57d2-4895-83f6-da4bb3a16ff4.jpg,9702,eacdf5702f2e4d7ee1ef682d40a0bd92
771911,2022-10-31 23:52:25,2022/10/7017e4ee-4373-48e5-b6e5-5c470342b5a8.jpg,9703,ba47194f20c29edfe5d3fdde711710b7
771912,2022-10-31 23:52:25,2022/10/688edd73-682f-47e1-b868-17cf21e78a4b.jpg,9704,0c151fbd8dd501e408de598df9f3ec85
771913,2022-10-31 23:52:25,2022/10/563bf0b5-431d-4274-a6ab-61c4ad7fdb61.jpg,9705,2bd974a5cacbe0bdb12182362faeb5eb


# Defining Function to Download Images

First, we will need to define a couple of functions to download the images from the images.data.gov.sg static server.

In [3]:
def download_item(url, filename, path, timeout=30):
    '''
    Download a single file from a URL and save it as `filename` inside the folder `path`.

    Parameters:
        url: the URL to download the traffic image from
        filename: the name the downloaded file is saved under
        path: the folder the file is saved into (created if it doesn't exist yet)
        timeout: number of seconds to wait for the server before giving up

    Raises an exception from `requests` if the request times out, the connection fails
    or the server answers with an HTTP error status; nothing is written to disk in that case.
    '''
    # getting the file from the url
    # the timeout makes a stalled connection raise an error instead of blocking forever
    r = requests.get(url, allow_redirects=True, timeout=timeout)

    # treat HTTP error responses (4xx / 5xx) as failed downloads,
    # otherwise the body of the error response would be saved as if it were an image
    r.raise_for_status()

    # create the folder itself (not just its parent) if it doesn't exist yet
    if path:
        os.makedirs(path, exist_ok=True)

    # combining the path and filename to get the full path, using the separator of the current OS
    full_path = os.path.join(path, filename)

    # writing the file to the path
    with open(full_path, 'wb') as f:
        f.write(r.content)

In [4]:
def download_from_dataframe(df,download_path,failed_download_list=None):
    '''
    Download every traffic image whose link is listed in a dataframe.

    Parameters:
        df: dataframe with one row per image; the columns 'camera_id', 'timestamp', 'md5' and 'image' are read from each row
        download_path: the folder the images are saved into (created if it doesn't exist yet)
        failed_download_list: optional list; for every failed download a dictionary with the keys
                              'url', 'filename' and 'path' is appended to it, so the downloads can be
                              retried later with download_from_failed_download_list

    Each image is saved as '<camera_id>-<YYYY_mm_dd_HH_MM>-<md5>.jpg'.
    A failed download is reported with print and does not stop the remaining downloads.
    '''
    # the dataframe only stores the relative part of the link, this prefix completes it
    image_link_prefix = 'https://images.data.gov.sg/api/traffic-images/'

    # converting the df into list of dictionaries
    df_rows_as_dict = df.to_dict(orient="records")

    # convert the list of df_rows into a tqdm for displaying progress bar
    df_rows_as_dict_pbar = tqdm(df_rows_as_dict)

    # create download folder if doesn't exist yet
    os.makedirs(download_path, exist_ok=True)

    # iterating to each row in the dataframe
    for df_row in df_rows_as_dict_pbar:

        # get the image filename (for saving the image) from row info and join together into a string
        filename_from_row = ("-".join([str(df_row['camera_id']), # get the camera_id
                                      pd.to_datetime(df_row['timestamp']).strftime("%Y_%m_%d_%H_%M"), # get the timestamp
                                      df_row['md5']]) # include the md5 hash
                            + '.jpg') # add .jpg as file extension

        # get the full download url from row info
        url_from_row = image_link_prefix + df_row['image']

        # showing the image currently being downloaded in the tqdm progress bar
        df_rows_as_dict_pbar.set_description(f'Downloading {df_row["image"]}')

        # attempting to download the image
        try:
            download_item(url = url_from_row,
                          filename = filename_from_row,
                          path = download_path)

        # error catching if download fails
        # Exception (instead of a bare except) so that interrupting the kernel / Ctrl+C still stops the loop
        except Exception as error:
            # notify the user about failed download, including the reason
            print(f'failed to download {url_from_row} ({error!r})')

            # if a failed_download_list has been specified by the user, append the failed download info to the list
            if failed_download_list is not None:
                failed_download_list.append({'url':url_from_row,
                                             'filename':filename_from_row,
                                             'path':download_path})
                
                
                
def download_from_failed_download_list(download_list):
    '''
    Retry the downloads recorded in a failed download list.

    Every entry of download_list is a dictionary with the keys 'url', 'filename' and 'path';
    each entry is handed to download_item in the order it appears in the list.
    '''
    # wrapping the list in tqdm so the retry progress is displayed as a progress bar
    for failed_item in tqdm(download_list):
        # unpack the info that was stored for the failed download
        url = failed_item['url']
        filename = failed_item['filename']
        path = failed_item['path']

        # attempt the download again
        download_item(url=url, filename=filename, path=path)

# Filtering df to be downloaded

We then need to filter the dataframe of links to be downloaded, filtering them by the camera ID as well as the start and ending datetimes

In [5]:
def filter_df_to_download(df, camera_id_list, start_datetime, end_datetime):
    '''
    Select the rows of the links dataframe that should be downloaded.

    Only rows whose 'camera_id' is in camera_id_list and whose 'timestamp' lies between
    start_datetime and end_datetime (both ends included) are kept.
    The result is sorted by camera_id and then timestamp, its row count is printed, and it is returned.
    '''
    # keep only the rows of the requested cameras
    selected = df.loc[df['camera_id'].isin(camera_id_list)]

    # keep only the rows inside the time window (inclusive on both ends)
    in_window = selected['timestamp'].between(start_datetime, end_datetime)
    selected = selected.loc[in_window]

    # order the records per camera, oldest first
    ordered = selected.sort_values(by=['camera_id', 'timestamp'])

    print(f'No of images to be downloaded: {len(ordered)}')

    return ordered

In [15]:
# the 22 camera_ids whose images will be downloaded
camera_id_list = [
    1702,
    2705, 2706,
    3702, 3793, 3797,
    4702, 4706, 4708, 4799,
    5795,
    6704, 6708, 6710, 6714, 6715,
    7793, 7794, 7797,
    8701, 8704,
    9706,
]

# time window of the images to download (midnight of 1 Oct 2022 to midnight of 15 Nov 2022)
start_datetime = datetime.datetime(year=2022, month=10, day=1, hour=0, minute=0, second=0)
end_datetime = datetime.datetime(year=2022, month=11, day=15, hour=0, minute=0, second=0)

In [16]:
# narrow the full links dataframe down to the selected cameras and time window
df_download = filter_df_to_download(df, camera_id_list, start_datetime, end_datetime)

No of images to be downloaded: 195093


# Downloading Images

In [8]:
# collects the info of every download that fails, so that it can be retried afterwards
failed_download_list = []

# folder the images are downloaded into
download_path = 'C:/image_downloads_test/'

# download all the images listed in the filtered dataframe
download_from_dataframe(df_download, download_path, failed_download_list)

Downloading 2022/09/f36de43b-ba76-4c37-a9fb-c322c0431b1d.jpg: 100%|████████████████████| 44/44 [00:08<00:00,  5.26it/s]


## Checking for Failed Downloads

In [9]:
# report how many downloads failed in the previous step
print(f'Number of failed downloads: {len(failed_download_list)}')

# try once more to download every item that failed
download_from_failed_download_list(failed_download_list)

Number of failed downloads: 0


0it [00:00, ?it/s]


# Arranging Downloaded Images Folder

Finally, we will arrange the images in the folder according to the camera_id and image capture date.

In [17]:
def folder_arranger(folder_path):
    '''
    Arrange the traffic images in a folder into '<folder_path>/<camera_id>/<date>/' subfolders.

    The camera_id and the capture date are extracted from the filename of each image, which is
    expected to look like '<camera_id>-<YYYY_mm_dd_HH_MM>-<md5>.jpg'.
    Every .jpg found in folder_path or any of its subfolders is considered.
    Files whose name doesn't follow that format, and files that are already in the right
    subfolder, are left where they are.
    '''

    # getting a list of all the .jpg images in the folder_path (including subfolders)
    # os.path.join is used so that folder_path works with or without a trailing separator
    image_paths = glob.glob(pathname=os.path.join(folder_path, "**", "*.jpg"),
                            recursive=True)

    # convert the image_paths list into a tqdm for displaying progress bar
    # the description never changes, so it is set once instead of on every iteration
    image_paths_pbar = tqdm(image_paths, desc='Arranging folder')

    # iterating through each image in the image_paths list
    for image_path in image_paths_pbar:

        # get filename
        image_filename = os.path.basename(image_path)

        # get camera_id and date from filename
        # splitting on '-' (instead of fixed character positions) also handles camera_ids that aren't 4 characters long
        filename_parts = image_filename.split('-')
        if len(filename_parts) < 3 or len(filename_parts[1]) < 10:
            # the filename doesn't follow the expected format, so leave the file untouched
            continue
        camera_id = filename_parts[0]
        date = filename_parts[1][:10] # the 'YYYY_mm_dd' part of the timestamp

        # assigns the folder name based on the camera_id and date
        folder_path_to_put_into = os.path.join(folder_path, camera_id, date)
        destination = os.path.join(folder_path_to_put_into, image_filename)

        # nothing to do if the image has already been arranged (e.g. when re-running on the same folder)
        if (os.path.normcase(os.path.abspath(image_path))
                == os.path.normcase(os.path.abspath(destination))):
            continue

        # create folder if doesn't exist yet
        os.makedirs(folder_path_to_put_into, exist_ok=True)

        # moves the image from image_path to the new folder
        shutil.move(src=image_path,
                    dst=destination)

In [11]:
# sort the downloaded images into per-camera and per-date subfolders
folder_arranger(download_path)

Arranging folder: 100%|███████████████████████████████████████████████████████████████| 44/44 [00:00<00:00, 365.91it/s]
