#### Install the dependencies required to run this notebook.

In [1]:
# Shell escape: install the packages pinned in Pipfile.lock using pipenv.
# NOTE(review): pipenv installs into its own virtualenv — confirm the notebook kernel runs from that environment.
!pipenv install

[39m[1mInstalling dependencies from Pipfile.lock (f5fbd1)...[39m[22m
  🐍   [32m[1m▉[39m[22m[32m[1m▉[39m[22m[32m[1m▉[39m[22m[32m[1m▉[39m[22m[32m[1m▉[39m[22m[32m[1m▉[39m[22m[32m[1m▉[39m[22m[32m[1m▉[39m[22m[32m[1m▉[39m[22m[32m[1m▉[39m[22m[32m[1m▉[39m[22m[32m[1m▉[39m[22m[32m[1m▉[39m[22m[32m[1m▉[39m[22m[32m[1m▉[39m[22m[32m[1m▉[39m[22m[32m[1m▉[39m[22m[32m[1m▉[39m[22m[32m[1m▉[39m[22m[32m[1m▉[39m[22m[32m[1m▉[39m[22m[32m[1m▉[39m[22m[32m[1m▉[39m[22m[32m[1m▉[39m[22m[32m[1m▉[39m[22m[32m[1m▉[39m[22m[32m[1m▉[39m[22m[32m[1m▉[39m[22m[32m[1m▉[39m[22m[32m[1m▉[39m[22m[32m[1m▉[39m[22m[32m[1m▉[39m[22m 0/0 — [30m[22m00:00:00[39m[22m
[0m

In [2]:
import os
from os.path import join
from os.path import join, splitext
import requests
import shutil
import argparse
import tarfile
from pathlib import Path

import pandas as pd
import numpy as np

## Download & extract data for disaster types, damage severity, humanitarian categories, and informativeness

In [3]:
# Remote location of the benchmark archive.
ccid_tarfile_link = "https://crisisnlp.qcri.org/data/crisis_image_datasets_benchmarks/crisis_vision_benchmarks.tar.gz"

# The archive file name is the last segment of the URL.
ccid_data_tar_filename = ccid_tarfile_link.rsplit('/', 1)[-1]

# Drop every suffix (".tar" + ".gz") to obtain the bare dataset name.
# NOTE(review): assumes the archive unpacks into a directory with this name — confirm.
p = Path(ccid_data_tar_filename)
extensions = "".join(p.suffixes)
ccid_data_name = p.name.replace(extensions, "")

# Dataset directory, relative to the current working directory.
full_ccid_data_dir_path = os.path.join(os.curdir, ccid_data_name)

In [4]:
print(f'Downloading & Extracting {ccid_data_tar_filename} as directory {full_ccid_data_dir_path} ...')
# Stream the HTTP body straight into tarfile so the archive is never stored on disk.
# The context managers close the connection and the tar handle even if extraction fails.
with requests.get(ccid_tarfile_link, stream = True) as response:
    # Fail early on 4xx/5xx instead of feeding an HTML error page to tarfile.
    response.raise_for_status()
    with tarfile.open(fileobj = response.raw, mode = "r|gz") as archive:
        if hasattr(tarfile, 'data_filter'):
            # The archive comes from the network: refuse members that would escape
            # the destination directory (absolute paths, "..", unsafe links).
            archive.extractall(path = '.', filter = 'data')
        else:
            # Older Python without extraction filters: plain extraction.
            archive.extractall(path = '.')
print(f'Extracted {ccid_data_tar_filename} as directory {full_ccid_data_dir_path}')

Downloading & Extracting crisis_vision_benchmarks.tar.gz as directory ./crisis_vision_benchmarks ...
Extracted crisis_vision_benchmarks.tar.gz as directory ./crisis_vision_benchmarks


In [5]:
# 'tasks' sub-directory of the extracted dataset; the per-task folders are looked up beneath it.
tasks_path = join(full_ccid_data_dir_path, 'tasks')

In [6]:
# Maps the task name used for the output folders to that task's folder under `tasks_path`.
# NOTE(review): the section heading also mentions "disaster types", which has no entry
# here — confirm whether that omission is intentional.
paths_to_tsvs = {
    task: join(tasks_path, subfolder)
    for task, subfolder in (
        ('damage_severity', 'damage_severity'),
        ('humanitarian_categories', 'humanitarian'),
        ('informativeness', 'informative'),
    )
}

In [7]:
# Dataset partitions; each one becomes a sub-folder and a CSV file in the output.
SPLITS = ['train', 'dev', 'test']

# Columns read from the source TSV files.
EVENT_NAME_COL_NAME = 'event_name'
CLASS_LABEL_COL_NAME = 'class_label'
IMAGE_PATH_COL_NAME = 'image_path'
IMAGE_ID_COL_NAME = 'image_id'

# Columns added to the exported CSV files.
OLD_IMG_PATH_COL_NAME = 'old_path'
NEW_IMG_PATH_COL_NAME = 'new_path'

In [8]:
def _unique_image_name(image_name, name_counts):
    '''
    Returns a file name that has not been handed out yet within the current split.

    The first occurrence of a name is returned unchanged. Later occurrences get a
    numeric suffix before the extension ("img.jpg" -> "img_2.jpg"). The counter keeps
    advancing while the suffixed candidate is itself already taken, so two images
    never receive the same name.

            Parameters:
                    image_name (string): original file name of the image
                    name_counts (dict): names already used in this split (updated in place)
    '''
    if image_name not in name_counts:
        name_counts[image_name] = 1
        return image_name
    stem, ext = splitext(image_name)
    while True:
        name_counts[image_name] += 1
        candidate = stem + '_' + str(name_counts[image_name]) + ext
        if candidate not in name_counts:
            name_counts[candidate] = 1
            return candidate


def construct_labeled_data_folder(parent_path, data_parent_path, task_folder_path, folder_name):
    '''
    Constructs an image data folder that is separated into train, dev, and test split folders
    which are then separated into one folder per class label found in the train split.
    Every labeled image is copied into <parent_path>/<folder_name>/<split>/<label>/ and a
    <split>.csv listing the old and new path of each image is written next to the split folders.

    Existing folders are reused, so the function can be run again on the same destination.
    Rows whose label does not occur in the train split are not copied; they stay in the CSV
    with an empty new path.

            Parameters:
                    parent_path (string): path of the parent directory for where we will create this folder
                    data_parent_path (string): path to the directory containing the folder which contains the source images
                    task_folder_path (string): path of the task folder we would like to extract labeled images from
                    folder_name(string): name of the folder we wish to create with this function

    '''
    # normpath drops a trailing separator so the task name is never empty.
    task_name = os.path.basename(os.path.normpath(task_folder_path))

    task_tsvs_path = join(task_folder_path, 'consolidated')
    # Sorted so that the chosen file does not depend on the directory listing order.
    tsv_names = sorted(os.listdir(task_tsvs_path))
    split_dfs = {}
    for split in SPLITS:
        # The TSV of a split is the first file whose name contains the split name.
        split_tsv_name = [name for name in tsv_names if split in name][0]
        split_dfs[split] = pd.read_csv(join(task_tsvs_path, split_tsv_name), sep='\t')

    class_labels = split_dfs['train'][CLASS_LABEL_COL_NAME].unique()
    folder_path = join(parent_path, folder_name)
    os.makedirs(folder_path, exist_ok=True)

    # Make train/dev/test directories and class labeled directories if they don't already exist
    for split in SPLITS:
        name_counts = {}  # To help with duplicate image filenames
        source_df = split_dfs[split]
        # Build the output frame without touching the frame that was read from disk.
        split_df = pd.DataFrame({
            EVENT_NAME_COL_NAME: source_df[EVENT_NAME_COL_NAME],
            IMAGE_ID_COL_NAME: source_df[IMAGE_ID_COL_NAME],
            OLD_IMG_PATH_COL_NAME: source_df[IMAGE_PATH_COL_NAME],
            NEW_IMG_PATH_COL_NAME: np.nan,
            CLASS_LABEL_COL_NAME: source_df[CLASS_LABEL_COL_NAME],
        })
        split_path = join(folder_path, split)
        os.makedirs(split_path, exist_ok=True)

        # Collected per row index and written back in one go after the copy loop;
        # row-by-row .loc assignment is slow and forces dtype upcasts on every write.
        new_name_by_index = {}
        new_path_by_index = {}
        for label in class_labels:
            label_path = join(split_path, label)
            os.makedirs(label_path, exist_ok=True)
            label_rows = split_df.loc[split_df[CLASS_LABEL_COL_NAME] == label, OLD_IMG_PATH_COL_NAME]
            for index, abs_img_path in label_rows.items():
                rel_img_path = join(data_parent_path, abs_img_path)
                image_name = _unique_image_name(os.path.basename(abs_img_path), name_counts)
                final_img_path = join(label_path, image_name)
                # Recorded path is the destination without its first component
                # (the leading '.' when parent_path starts with './').
                abs_final_img_path = "/".join(final_img_path.split('/')[1:])
                new_name_by_index[index] = image_name
                new_path_by_index[index] = abs_final_img_path
                shutil.copy(rel_img_path, final_img_path)

        # The image id of every copied row becomes the (possibly renamed) file name.
        split_df[IMAGE_ID_COL_NAME] = [
            new_name_by_index.get(index, old_id)
            for index, old_id in split_df[IMAGE_ID_COL_NAME].items()
        ]
        split_df[NEW_IMG_PATH_COL_NAME] = [
            new_path_by_index.get(index, np.nan) for index in split_df.index
        ]

        split_filename = split + '.csv'
        split_df.to_csv(join(folder_path, split_filename), index = False)
        print(f"Number of samples in the {split} set for the {task_name} task: {len(split_df)}")

In [9]:
# Root folder that receives one sub-folder per task.
ccid_splits_folder = join('.', 'CCID Splits Data')
# exist_ok keeps the cell re-runnable: a plain os.mkdir raises FileExistsError
# as soon as the folder is left over from an earlier run.
os.makedirs(ccid_splits_folder, exist_ok=True)

In [11]:
# Build the train/dev/test image folders for every configured task, reporting progress per task.
for task_name, task_tsv_path in paths_to_tsvs.items():
    task_output_dir = join(ccid_splits_folder, task_name)
    print(f'Creating Train/Dev/Test Image Folders for {task_name} task located at {task_output_dir}')
    construct_labeled_data_folder(
        ccid_splits_folder,
        full_ccid_data_dir_path,
        task_tsv_path,
        task_name,
    )
    print(f'Completed creating Train/Dev/Test Image Folders for {task_name} task located at {task_output_dir}.')
    print()

Creating Train/Dev/Test Image Folders for damage_severity task located at ./CCID Splits Data/damage_severity
Number of samples in the train set for the damage_severity task: 28319
Number of samples in the dev set for the damage_severity task: 2712
Number of samples in the test set for the damage_severity task: 3865
Completed creating Train/Dev/Test Image Folders for damage_severity task located at ./CCID Splits Data/damage_severity.

Creating Train/Dev/Test Image Folders for humanitarian_categories task located at ./CCID Splits Data/humanitarian_categories
Number of samples in the train set for the humanitarian task: 12618
Number of samples in the dev set for the humanitarian task: 1229
Number of samples in the test set for the humanitarian task: 2922
Completed creating Train/Dev/Test Image Folders for humanitarian_categories task located at ./CCID Splits Data/humanitarian_categories.

Creating Train/Dev/Test Image Folders for informativeness task located at ./CCID Splits Data/informat