In [13]:
import hashlib
import urllib
import urllib.parse

import pandas as pd

def save_df_of_redirects(file_ending, log_file=None, output_path=None):
    """
    Save df of redirects based on either of two logs: gif or svg.

    Every log line that starts with 'WARN: Possible redirect' is split on
    ' to ': the last whitespace-separated token before it is taken as the
    original link and the text after it as the redirect link. Duplicate rows
    are dropped before saving.

    The saved df has three columns: original_title, redirect_title, redirect_url.
    The redirect_url is created similarly as done in /scratch/WIT_Dataset/CollectImagesSVG.py.

    Parameters
    ----------
    file_ending : str
        'svg' or 'gif'. Selects the default log file and is used in the
        default output file name and in the printed messages.
    log_file : str, optional
        Path of the redirect log to parse. When omitted, the log under
        /scratch/WIT_Dataset/ that belongs to ``file_ending`` is used.
    output_path : str, optional
        Where to write the JSON file. Defaults to
        ``data/df_of_redirects_{file_ending}.json.bz2``.

    Returns
    -------
    None
        If ``log_file`` is omitted and ``file_ending`` has no known log, an
        error is printed and nothing is written.
    """

    if log_file is None:
        if file_ending == 'svg':
            log_file = '/scratch/WIT_Dataset/log_of_redirects.txt'
        elif file_ending == 'gif':
            log_file = '/scratch/WIT_Dataset/log_of_redirects_gif.txt'
        else:
            print(f'ERROR: File ending {file_ending} does not have a log file of redirects under /scratch/.')
            return
    if output_path is None:
        output_path = f'data/df_of_redirects_{file_ending}.json.bz2'

    columns = ['original_title', 'redirect_title', 'redirect_url']
    data = []
    malformed_lines = 0

    # Stream the log line by line instead of loading it fully into memory.
    with open(log_file, 'r') as file:
        for line in file:
            if not line.startswith('WARN: Possible redirect'):
                continue
            parts = line.split(' to ')
            if len(parts) < 2:
                # A redirect warning without a target cannot be used; count it
                # instead of failing with an IndexError.
                malformed_lines += 1
                continue
            original_link = parts[0].split()[-1]
            redirect_link = parts[1].strip()

            # The hash is computed on the unquoted name, while the URL path
            # keeps the redirect link exactly as it appears in the log.
            image_name = urllib.parse.unquote(redirect_link)
            img_hash = hashlib.md5(image_name.encode('utf-8')).hexdigest()
            path = f'{img_hash[0]}/{img_hash[:2]}/{redirect_link}'

            # Resolve backslash escapes in the title. Encoding with latin-1 +
            # backslashreplace (rather than UTF-8) keeps non-ASCII characters
            # intact through the unicode_escape round trip; a plain
            # .encode().decode('unicode_escape') would turn them into mojibake.
            original_title = (
                urllib.parse.unquote(original_link)
                .encode('latin-1', 'backslashreplace')
                .decode('unicode_escape')
            )
            data.append({'original_title': original_title,
                         'redirect_title': redirect_link,
                         'redirect_url': path})

    if malformed_lines:
        print(f'WARN: Skipped {malformed_lines} redirect line(s) without a target in {log_file}.')

    # Passing the columns explicitly keeps the schema stable for an empty log.
    df_of_redirects = pd.DataFrame(data, columns=columns)
    df_of_redirects = df_of_redirects.drop_duplicates(ignore_index=True)
    print(f'Shape of redirects df {file_ending}: {df_of_redirects.shape}')
    # Save dataframe
    df_of_redirects.to_json(output_path)

In [14]:
# Build and save the redirect table for the SVG log.
save_df_of_redirects(file_ending='svg')

Shape of redirects df svg: (5650, 3)


In [15]:
# Build and save the redirect table for the GIF log.
save_df_of_redirects(file_ending='gif')

Shape of redirects df gif: (163, 3)
