## Get dataframe with images and labels

In [4]:
import pandas as pd
import urllib

In [3]:
# Adapted from https://github.com/epfl-dlab/WikiPDA/blob/master/PaperAndCode/TopicsExtractionPipeline/GenerateDataframes.py
def normalize_title(title, dumps=True):
    """Normalize a MediaWiki-style page or file title.

    The title is percent-decoded, optionally stripped of its namespace prefix,
    has underscores replaced by spaces, loses any ``#anchor`` suffix, is
    trimmed of surrounding whitespace and finally gets its first character
    upper-cased.

    Args:
        title: Raw title string, possibly percent-encoded.
        dumps: When True, everything up to and including the first ``:`` is
            dropped as the namespace prefix.

    Returns:
        The normalized title. An empty string is returned when ``dumps`` is
        True and the decoded title contains no ``:``.
    """
    # Imported locally: a bare `import urllib` does not guarantee that the
    # `urllib.parse` submodule has been loaded.
    from urllib.parse import unquote

    title = unquote(title)
    if dumps:
        _, colon, title = title.partition(':')
        if not colon:
            # Currently happens only for broken cross-namespace redirects
            return ''
    # Convert underscores and cut the anchor *before* trimming, so that
    # leading/trailing underscores (or a space left in front of '#') cannot
    # survive as stray whitespace or block the capitalisation below.
    title = title.replace('_', ' ').split('#', 1)[0].strip()
    # Slicing keeps this safe for the empty string.
    return title[:1].upper() + title[1:]

In [13]:
CATEGORIES_PATH = '/scratch/WikipediaImagesTaxonomy/commonswiki-20220220-category-network.parquet'
FILES_PATH = '/scratch/WikipediaImagesTaxonomy/commonswiki-20220220-files.parquet'

# `queryLabel` is imported from the sibling ../taxonomy directory, so that
# directory has to be on the import path. The membership check keeps
# re-running this cell from adding the same entry again.
import sys
if '../taxonomy' not in sys.path:
    sys.path.append('../taxonomy')
from queryLabel import Taxonomy

# Load the category network and select the taxonomy mapping used for labelling.
taxonomy = Taxonomy()
taxonomy.load_categories(CATEGORIES_PATH)
taxonomy.set_taxonomy(mapping='content_extended')

# To get a label for category 'Comedy films of the United States'.
# Only the last expression of a cell is displayed automatically, so the
# example result is printed explicitly instead of being silently discarded.
example_label = taxonomy.get_label('Comedy films of the United States', how='naive')
print(example_label)

# To load the files dataframe
files = pd.read_parquet(FILES_PATH)
files.head()

Unnamed: 0,id,title,categories
0,80902489,"""A Gathering of Court Women"", Folio from the D...","[Department of Islamic Art, Metropolitan Museu..."
1,53631642,"""Akdeniz"" - Aliağa, 2016.jpg","[Akdeniz (ship, 1955), Aliağa ship-breaking yard]"
2,74785910,"""Carnevale di Torino"". Felice Cerruti - f.lli ...","[Felice Cerruti Bauduc, Victor Emmanuel II of ..."
3,14890941,"""Escena de playa con figura"".jpg","[Pedro Lira, People with dogs in art]"
4,17828296,"""Florero con plátanos, limones y libros"", Juan...","[Works by Juan de Echevarría, Still-life paint..."


In [15]:
# Index of the WIT training shard to load; it is appended to a fixed '0000'
# prefix in the file name below.
segment_nr = 0
segment_path = '/scratch/WIT_Dataset/wit_v1.train.all-0000' + str(segment_nr) + '-of-00010.tsv.gz'
segment_frame = pd.read_csv(segment_path, compression='gzip', sep='\t')

# Keep only the rows whose `language` column is 'en'.
is_english = segment_frame.language == 'en'
segment_frame_en = segment_frame[is_english]
print(segment_frame_en.shape)
segment_frame_en.head()

(542593, 17)


Unnamed: 0,language,page_url,image_url,page_title,section_title,hierarchical_section_title,caption_reference_description,caption_attribution_description,caption_alt_text_description,mime_type,original_height,original_width,is_main_image,attribution_passes_lang_id,page_changed_recently,context_page_description,context_section_description
0,en,https://en.wikipedia.org/wiki/Oxydactylus,https://upload.wikimedia.org/wikipedia/commons...,Oxydactylus,,Oxydactylus,,English: Mounted skeleton of Oxydactylus longi...,,image/jpeg,3564,2748,True,True,False,Oxydactylus is an extinct genus of camelid end...,Oxydactylus is an extinct genus of camelid end...
5,en,https://en.wikipedia.org/wiki/Maine%27s_3rd_co...,https://upload.wikimedia.org/wikipedia/commons...,Maine's 3rd congressional district,List of members representing the district,Maine's 3rd congressional district / List of m...,,"English: Samuel W. Gould, US Representative fr...",,image/jpeg,1870,1421,False,True,False,Maine's 3rd congressional district is an obsol...,
7,en,https://en.wikipedia.org/wiki/Cheraw,https://upload.wikimedia.org/wikipedia/commons...,Cheraw,18th century,Cheraw / History / 18th century,A c. 1724 English copy of a deerskin Catawba m...,"English: ""Map of the Several Nations of Indian...",,image/jpeg,1217,1770,False,True,False,"The Cheraw people, also known as the Saraw or ...","In 1710, due to attacks by the Seneca of the I..."
13,en,https://en.wikipedia.org/wiki/Romanian_Front,https://upload.wikimedia.org/wikipedia/commons...,Romanian Front,Stagnation,Romanian Front / History / Stagnation,"Nameplate of Gazeta Transilvaniei on June 14, ...",English: Nameplate of the Romanian nationalist...,,image/png,306,960,False,True,True,The Romanian Front was a moderate fascist part...,A reshuffled Tătărescu government took over in...
18,en,https://en.wikipedia.org/wiki/%C3%81d%C3%A1m_K...,https://upload.wikimedia.org/wikipedia/commons...,Ádám Kósa,,Ádám Kósa,,English: Hungarian MEP Ádám Kósa,,image/jpeg,4928,3264,True,False,True,Ádám Kósa is a Hungarian politician and Member...,Ádám Kósa (born 1 July 1975) is a Hungarian po...


In [18]:
# `page_title` in the WIT rows is the Wikipedia article name, whereas
# `files.title` holds Commons file names (e.g. '"Escena de playa con figura".jpg'),
# so joining on `page_title` pairs unrelated entities. The join key is instead
# derived from the last path component of `image_url`, normalized the same way
# as other titles (no namespace prefix to drop, hence dumps=False).
# NOTE(review): assumes `files.title` stores file names with spaces and without
# the 'File:' prefix, as the preview of `files` suggests — confirm.
image_titles = segment_frame_en.image_url.map(
    lambda url: normalize_title(url.rsplit('/', 1)[-1], dumps=False),
    na_action='ignore',  # leave missing URLs as NaN instead of failing on them
)
image_labels = segment_frame_en.assign(image_title=image_titles).merge(
    files, left_on=['image_title'], right_on=['title']
)
print(image_labels.shape)
image_labels.head()