### Load, split and save train and test dataframes


In [1]:
import pandas as pd
import urllib.parse
import os
from sklearn.model_selection import train_test_split
import numpy as np
from matplotlib import pyplot as plt
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from help_functions import get_y_true

In [2]:
MINIMAL_NR_IMAGES = 1_000

Load naive labels, and pre-process urls.

In [38]:
# INPUT FILES
HEURISTICS_LABELS_PATH = 'data/commonswiki-20220914-files-heuristic-labels.json.bz2'
NAIVE_LABELS_PATH = 'data/commonswiki-20220220-files-naive-labels.json.bz2'

# OUTPUT FILES
SPLIT_DATA_PATH = 'data/split_dataframes_heuristic_labels_20220914' #/train_df.json.bz2 and /test_df.json.bz2 

In [5]:
labels = pd.read_json(HEURISTICS_LABELS_PATH)

# With this encoded url, only 190k images aren't found, while with "url" 790k aren't found
labels['url'] = labels['url'].apply(lambda encoded_filename : urllib.parse.unquote(encoded_filename).encode().decode('unicode-escape'))
labels['can_be_opened'] = labels['url'].apply(lambda url : os.path.isfile('/scratch/WIT_Dataset/images/' + url))
print(f'Total number of files: {labels.shape[0]}.')

labels = labels.loc[labels.can_be_opened == True].reset_index(drop=True)

# labels.rename(columns={'new_labels': 'labels'}, inplace=True)
print(f'Total number of files that can be opened: {labels.shape[0]}.')
print(labels.shape)
labels.head()

Total number of files: 3891446.
Total number of files that can be opened: 3516309.
(3516309, 6)


Unnamed: 0,id,title,url,categories,labels,can_be_opened
0,53631642,"""Akdeniz"" - Aliağa, 2016.jpg","f/f8/""Akdeniz""_-_AliaÄa,_2016.jpg","[Akdeniz (ship, 1955), Aliağa ship-breaking yard]",[],True
1,74785910,"""Carnevale di Torino"". Felice Cerruti - f.lli ...","1/13/""Carnevale_di_Torino""._Felice_Cerruti_-_f...","[Felice Cerruti Bauduc, Victor Emmanuel II of ...","[Places, Art, People, Culture]",True
2,14890941,"""Escena de playa con figura"".jpg","c/c2/""Escena_de_playa_con_figura"".jpg","[Pedro Lira, People with dogs in art]",[People],True
3,17828296,"""Florero con plátanos, limones y libros"", Juan...","d/da/""Florero_con_plÃ¡tanos,_limones_y_libros""...","[Works by Juan de Echevarría, Still-life paint...","[Art, Culture]",True
4,33628123,"""Keep FDA Off The Farm"" (FDA 140) (8205957369)...","b/bc/""Keep_FDA_Off_The_Farm""_(FDA_140)_(820595...","[Food and Drug Administration (United States),...","[Nature, Culture]",True


In [7]:
HEURISTICS_LABELS_CAN_OPEN_PATH = 'data/commonswiki-20220914-files-heuristic-labels-can-be-opened.json.bz2'
labels.to_json(HEURISTICS_LABELS_CAN_OPEN_PATH)

#### Extra cleaning 

When running `model.evaluate()`, it fails with the `PIL.UnidentifiedImageError` exception error. It's mostly images from format .webp, but there are also some .png images. Remove them from the dataset. (Example: `!ls /scratch/WIT_Dataset/images/d/d7/World_Scout_Jamboree_-_Punch_cartoon_-_Project_Gutenberg_eText_16628.png` 

`PIL.Image.open('/scratch/WIT_Dataset/images/d/d7/World_Scout_Jamboree_-_Punch_cartoon_-_Project_Gutenberg_eText_16628.png')`
)

In [32]:
# import PIL
# from pathlib import Path
# from PIL import Image, UnidentifiedImageError
# from tqdm import tqdm
# for index, row in tqdm(labels.iterrows(), total=labels.shape[0]):
#     try:
#         img = Image.open('/scratch/WIT_Dataset/images/' + row.url)
#     except PIL.UnidentifiedImageError:
#         print(row.url)
#         labels.at[index, 'can_be_opened'] = False
#     except Exception as e:
#         print(e)
#         labels.at[index, 'can_be_opened'] = False

# Work around to avoid 11 hours runtime: use dataset whose images have already been asserted
naive_labels = pd.read_json('data/commonswiki-20220601-files-naive-labels.json.bz2')
openable_labels = labels.merge(naive_labels, on=['id', 'title', 'url'])
openable_labels.drop(labels=['can_be_opened_x', 'can_be_opened_y'], axis=1, inplace=True)
# The categories are the same, assert that, and remove one of them
assert (openable_labels.categories_x == openable_labels.categories_y).sum() == len(openable_labels)
openable_labels.drop(labels='categories_y', axis=1, inplace=True)
openable_labels.rename({'categories_x': 'categories', 
                        'labels_x': 'labels', 
                        'labels_y': 'naive_labels'}, axis=1, inplace=True)

In [33]:
print(openable_labels.shape)
openable_labels.head()

(1817902, 6)


Unnamed: 0,id,title,url,categories,heuristic_labels,naive_labels
0,74785910,"""Carnevale di Torino"". Felice Cerruti - f.lli ...","1/13/""Carnevale_di_Torino""._Felice_Cerruti_-_f...","[Felice Cerruti Bauduc, Victor Emmanuel II of ...","[Places, Art, People, Culture]","[Places, Society]"
1,14890941,"""Escena de playa con figura"".jpg","c/c2/""Escena_de_playa_con_figura"".jpg","[Pedro Lira, People with dogs in art]",[People],"[Culture, Entertainment]"
2,17073242,"""Mary Holiday"" - NARA - 559051.jpg","0/02/""Mary_Holiday""_-_NARA_-_559051.jpg","[Art of the United States, Harmon Foundation, ...","[People, Society]","[Culture, History, Society]"
3,5259480,"""The End of the 'Forty Five' Rebellion"".jpg","c/c7/""The_End_of_the_'Forty_Five'_Rebellion"".jpg","[Battle of Culloden, 1882 engravings, Wounded ...","[History, People, Culture, Society]","[Culture, History]"
4,67898619,"""Woman are too sentimental for jury duty"" -Ant...","f/f0/""Woman_are_too_sentimental_for_jury_duty""...","[Pro-suffrage cartoons, Kenneth R. Chamberlain...","[Politics, Culture, Society]","[Culture, Politics, Society]"


Only keep .jpg files

In [35]:
# print(f"{naive_labels.loc[naive_labels.url.str.contains('.jpeg', case=False)].shape[0]} images of type .jpeg")
labels_jpg = openable_labels.loc[openable_labels.url.str.contains('.jpg', case=False)]
print(f'{labels_jpg.shape[0]} images of type .jpg')

1596269 images of type .jpg


Split dataset and save so the same training and test sets are used throughout.

In [37]:
# Sample images
samples = labels_jpg.sample(n=800_000, random_state=7)

In [39]:
train_df, test_df = train_test_split(samples, test_size=0.05, random_state=7)

print(f'Train set: {train_df.shape[0]} images ({train_df.shape[0] / samples.shape[0]:.2f}%)')
print(f'Test set:  {test_df.shape[0]}  images ({test_df.shape[0] / samples.shape[0]:.2f}%)')

os.mkdir(SPLIT_DATA_PATH)

train_df.to_json(f'{SPLIT_DATA_PATH}/train_df.json.bz2', compression='bz2')
test_df.to_json(f'{SPLIT_DATA_PATH}/test_df.json.bz2', compression='bz2')

Train set: 760000 images (0.95%)
Test set:  40000  images (0.05%)
