# Create WebDataset from im2gps test set images

Recover each image's latitude/longitude from the comment fields embedded in the test set JPEG files, then package the images and their metadata into a WebDataset.

In [16]:
import time
import datetime
import json
import subprocess
import base64

from pathlib import Path
from typing import NamedTuple
import xml.etree.ElementTree as ET

import numpy as np
import pandas
import tqdm
import webdataset

DATASET_ROOT = Path.home() / "datasets" / "im2gps3ktest"

def parse_im2gps_filename(path: Path):
    """Split an im2gps image filename into its underscore-separated fields.

    The stem (filename without its extension) is expected to look like
    ``<id>_<secret>_<server>_<owner>``, for example
    ``995764493_f8128f77c1_1226_98545448@N00``.

    Args:
        path: Path to the image; only ``path.stem`` is inspected.

    Returns:
        A dict with the string values "id", "secret", "server" and "owner",
        taken from the first four pieces of the stem. Extra pieces are ignored.

    Raises:
        IndexError: If the stem has fewer than four pieces.
    """
    field_names = ("id", "secret", "server", "owner")
    pieces = path.stem.split("_")
    return {name: pieces[position] for position, name in enumerate(field_names)}

In [3]:
"""
XML returned looks like this:

<?xml version='1.0' encoding='UTF-8'?>
<rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#'>

<rdf:Description rdf:about='/home/fyhuang/datasets/im2gps3ktest/995764493_f8128f77c1_1226_98545448@N00.jpg'
  xmlns:et='http://ns.exiftool.org/1.0/' et:toolkit='Image::ExifTool 12.40'
  xmlns:ExifTool='http://ns.exiftool.org/ExifTool/1.0/'
  xmlns:System='http://ns.exiftool.org/File/System/1.0/'
  xmlns:File='http://ns.exiftool.org/File/1.0/'
  xmlns:JFIF='http://ns.exiftool.org/JFIF/JFIF/1.0/'
  xmlns:Composite='http://ns.exiftool.org/Composite/1.0/'>
 <ExifTool:ExifToolVersion>12.40</ExifTool:ExifToolVersion>
 <System:FileName>995764493_f8128f77c1_1226_98545448@N00.jpg</System:FileName>
 <System:Directory>/home/fyhuang/datasets/im2gps3ktest</System:Directory>
 <System:FileSize>63 KiB</System:FileSize>
 <System:FileModifyDate>2017:09:14 15:51:56-07:00</System:FileModifyDate>
 <System:FileAccessDate>2024:04:08 16:58:14-07:00</System:FileAccessDate>
 <System:FileInodeChangeDate>2024:03:29 15:25:24-07:00</System:FileInodeChangeDate>
 <System:FilePermissions>-rw-------</System:FilePermissions>
 <File:FileType>JPEG</File:FileType>
 <File:FileTypeExtension>jpg</File:FileTypeExtension>
 <File:MIMEType>image/jpeg</File:MIMEType>
 <File:Comment rdf:datatype='http://www.w3.org/2001/XMLSchema#base64Binary'>
cGhvdG86IDk5NTc2NDQ5MyBmODEyOGY3N2MxIDEyMjYAZSBncmVlbiBncmFz
cyBsZWF2ZXMgY2x1YiBkNTAgZ28=
</File:Comment>
 <File:Comment rdf:datatype='http://www.w3.org/2001/XMLSchema#base64Binary'>
b3duZXI6IDk4NTQ1NDQ4QE4wMAAyOGY3N2MxIDEyMjYAZSBncmVl
</File:Comment>
</rdf:Description>
</rdf:RDF>
"""

def decode_comment(comment):
    """Return the payload of a ``File:Comment`` XML element as bytes.

    exiftool marks binary comments with an ``rdf:datatype`` attribute of
    ``base64Binary``; those are base64-decoded and truncated at the first NUL
    byte. Any other comment is returned as its UTF-8 encoded text, so the
    result is always ``bytes`` and can be passed straight to ``decode_tag``
    (which partitions on ``b":"``).

    Args:
        comment: An ``xml.etree.ElementTree.Element`` for one comment.

    Returns:
        The comment payload as ``bytes``; ``b""`` when the element has no text.
    """
    # An empty element has text None; treat it as an empty payload.
    raw_text = comment.text or ""
    datatype = comment.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}datatype")
    if datatype == "http://www.w3.org/2001/XMLSchema#base64Binary":
        decoded = base64.b64decode(raw_text)
        # Only return before the first NULL byte
        return decoded.partition(b"\x00")[0]
    # Plain-text comments used to be returned as str, which decode_tag cannot
    # partition with a bytes separator; encode so both branches agree.
    return raw_text.encode("utf-8")

def decode_tag(comment_text):
    """Split a raw ``key: value`` comment into a ``(key, value)`` pair of strings.

    Args:
        comment_text: The comment payload as ``bytes``.

    Returns:
        A 2-tuple of ``str``: the text before the first colon and the text
        after it, each stripped of surrounding whitespace and decoded as UTF-8.

    Raises:
        AssertionError: If the payload contains no colon.
    """
    name, colon, rest = comment_text.partition(b":")
    assert colon == b":"
    tag_name = name.strip().decode("utf-8")
    tag_value = rest.strip().decode("utf-8")
    return (tag_name, tag_value)

def parse_image_comment_tags(path: Path):
    """Read the ``key: value`` comments embedded in an image into a dict.

    Runs ``exiftool -a -X`` on the file, parses the XML it prints, and decodes
    every ``File:Comment`` child of the first element under the XML root.

    Args:
        path: Path to the image file.

    Returns:
        A dict mapping each comment key to its value, both ``str``. When a key
        occurs more than once, the last occurrence wins.
    """
    # -X selects XML output; -a asks exiftool to keep duplicate tags, which
    # matters because one file carries several Comment entries.
    exiftool_cmd = ["exiftool", "-a", "-X", str(path)]
    completed = subprocess.run(exiftool_cmd, capture_output=True)
    description = ET.fromstring(completed.stdout)[0]
    comment_nodes = description.findall("./{http://ns.exiftool.org/File/1.0/}Comment")
    return dict(decode_tag(decode_comment(node)) for node in comment_nodes)

# Smoke-test the parser on one known image; the cell displays the parsed tag dict.
# NOTE(review): later cells read images from DATASET_ROOT / "img" -- confirm this
# file still lives directly under DATASET_ROOT.
test_file = DATASET_ROOT.joinpath("995764493_f8128f77c1_1226_98545448@N00.jpg")
parse_image_comment_tags(test_file)

{'photo': '995764493 f8128f77c1 1226',
 'owner': '98545448@N00',
 'title': 'DSC_1513',
 'originalsecret': 'c8d68f26c1',
 'originalformat': 'jpg',
 'datetaken': '2007-07-16 10:18:54',
 'tags': 'uk england tattoo unitedkingdom aircraft air royal airshow international spotting fairford royalinternationalairtattoo2007fairford 61kmneofbristol riat200',
 'license': '0',
 'latitude': '51.682462',
 'longitude': '-1.78633',
 'accuracy': '16',
 'interestingness': '31 out of 393'}

In [15]:
# Construct a dataframe with metadata from all test set images
# Columns:
# - id
# - owner
# - secret
# - server
# - (don't need farm)
# - title
# - (don't need ispublic)
# - (don't need isfriend)
# - (don't need isfamily)
# - (don't need dateupload)
# - latitude
# - longitude
# - accuracy
# - (don't need context)
# - (don't need place_id)
# - (don't need woeid)
# - (don't need geo_is_public)
# - (don't need geo_is_contact)
# - (don't need geo_is_friend)
# - (don't need geo_is_family)
# - (don't need interestingness)
# - (don't need tag)
# - split

# One list per output column, in the column order of the final table.
columns = {
    name: []
    for name in (
        "id",
        "owner",
        "secret",
        "server",
        "title",
        "latitude",
        "longitude",
        "accuracy",
        "split",
    )
}

all_images = list(DATASET_ROOT.glob("img/*.jpg"))
for test_image in tqdm.tqdm(all_images):
    # Fields encoded in the filename itself
    fninfo = parse_im2gps_filename(test_image)
    for field in ("id", "owner", "secret", "server"):
        columns[field].append(fninfo[field])

    # Fields stored in the image's embedded comments
    comment_tags = parse_image_comment_tags(test_image)
    accuracy = comment_tags.get("accuracy")
    columns["title"].append(comment_tags.get("title", ""))
    # A missing coordinate becomes NaN so the row can be dropped later
    for axis in ("latitude", "longitude"):
        columns[axis].append(float(comment_tags.get(axis, "NaN")))
    columns["accuracy"].append(None if accuracy is None else int(accuracy))
    columns["split"].append("test")

df = pandas.DataFrame(columns)
df

100%|██████████| 3000/3000 [05:18<00:00,  9.41it/s]


Unnamed: 0,id,owner,secret,server,title,latitude,longitude,accuracy,split
0,612790440,82927779@N00,539331ffff,1089,Running for the water,-33.940916,18.374647,16.0,test
1,203372498,40829484@N00,c307077bc6,72,IMG_5299,50.484599,5.890045,9.0,test
2,369070502,13527886@N00,bf3f633dbd,136,img_1883,34.022502,77.603302,9.0,test
3,350755394,51162504@N00,993aec6f8d,156,"Golden Gate Bridge, San Francisco 050",37.807359,-122.469470,15.0,test
4,118511568,41894197861@N01,34404be213,49,DSC_0140,51.499473,-0.119862,14.0,test
...,...,...,...,...,...,...,...,...,...
2995,1107858999,82927779@N00,36eae1fedb,1208,Large school of fish near surface-1,-13.311708,48.115938,16.0,test
2996,488352231,26519935@N00,861d4e0345,229,Lushun_088,38.785535,121.141777,12.0,test
2997,236303536,40829484@N00,53fd5d3d53,95,PICT0176,51.121411,2.622985,11.0,test
2998,513229443,78221228@N00,567342115b,197,"The Eureka tower, Melbourne",-37.814666,144.954986,11.0,test


In [18]:
# Keep only rows that have both coordinates, then keep one row per photo id,
# and save the result for the shard-writing cell.
df = df.dropna(subset=["latitude", "longitude"]).drop_duplicates(subset=["id"])
df.to_pickle(DATASET_ROOT / "im2gps3ktest.pkl")

In [22]:
# Create a webdataset from images and metadata (as json)

def row_filename_stem(row):
    """Rebuild the image filename stem ``<id>_<secret>_<server>_<owner>`` for a row.

    Args:
        row: Any object with ``id``, ``secret``, ``server`` and ``owner`` attributes.

    Returns:
        The four attributes formatted and joined with underscores, as a ``str``.
    """
    return "{}_{}_{}_{}".format(row.id, row.secret, row.server, row.owner)
def row_image_path(row):
    """Return the path of the row's JPEG under ``DATASET_ROOT / "img"``.

    The filename is the stem from ``row_filename_stem(row)`` plus ``.jpg``.
    """
    jpg_name = f"{row_filename_stem(row)}.jpg"
    return DATASET_ROOT.joinpath("img", jpg_name)

def _json_safe(value):
    """Map non-finite floats (NaN, +/-inf) to None so they serialize as JSON null."""
    if isinstance(value, float) and (value != value or value in (float("inf"), float("-inf"))):
        return None
    return value

def write_wds_row(row, sink, split):
    """Write one dataframe row as a sample (image bytes plus JSON metadata) to ``sink``.

    Rows whose ``split`` differs from the requested one, and rows whose image
    file does not exist on disk, are skipped without writing anything.

    Args:
        row: A named tuple such as those yielded by ``DataFrame.itertuples()``;
            must expose ``split`` and ``_asdict()``, plus the attributes that
            ``row_filename_stem`` reads.
        sink: Object with a ``write(dict)`` method.
        split: Only rows with ``row.split == split`` are written.

    Returns:
        None.
    """
    if row.split != split:
        return

    img_path = row_image_path(row)
    if not img_path.exists():
        return

    # A missing value (e.g. accuracy) shows up as float NaN. json.dumps would
    # emit the bare token NaN, which is not valid JSON, so write null instead.
    metadata = {key: _json_safe(value) for key, value in row._asdict().items()}

    wds_object = {
        "__key__": row_filename_stem(row),
        "jpg": img_path.read_bytes(),
        # allow_nan=False makes any non-finite float that slips through fail
        # loudly instead of producing an unparseable sample.
        "json": json.dumps(metadata, allow_nan=False).encode("utf-8"),
    }
    sink.write(wds_object)

# Reload the filtered metadata saved by the previous cell so this cell can be
# re-run on its own.
dataset_df = pandas.read_pickle(DATASET_ROOT / "im2gps3ktest.pkl")

# Make sure the output directory exists before the first shard file is opened;
# on a fresh checkout of the dataset there is no "wds" folder yet.
wds_dir = DATASET_ROOT / "wds"
wds_dir.mkdir(parents=True, exist_ok=True)

# encoder=False: samples are written as given, so write_wds_row supplies raw bytes.
with webdataset.ShardWriter(str(wds_dir / "im2gps3ktest_%03d.tar"), encoder=False) as sink:
    for row in tqdm.tqdm(dataset_df.itertuples(), total=len(dataset_df.index)):
        write_wds_row(row, sink, "test")

# writing /home/fyhuang/datasets/im2gps3ktest/wds/im2gps3ktest_000.tar 0 0.0 GB 0


100%|██████████| 2997/2997 [00:05<00:00, 588.90it/s]
