In [2]:
# Render inline figures in the high-resolution 'retina' format.
%config InlineBackend.figure_format = 'retina'
# Load (or reload) the autoreload extension, then select mode 2 so edited
# modules are re-imported before each cell runs.
%reload_ext autoreload
%autoreload 2
# Show matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
from fastai.vision import *
import pandas as pd
import bq_helper
from bq_helper import BigQueryHelper
import re
import torch
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# When That Photo?
The goal of this project is to train a machine-learning network that estimates when a photo was taken, where the estimate is a range of years (e.g., 1890-1900).

# Set Up Access via BigQuery

In [4]:
# Query helper bound to the public Met Museum dataset; the query cells below
# run their SQL through this object.
met = BigQueryHelper(
    active_project="bigquery-public-data",
    dataset_name="the_met",
)

In [5]:
# Helper used for metadata look-ups (table list, table schemas) on the same
# project and dataset.
bq_assistant = BigQueryHelper(
    active_project="bigquery-public-data",
    dataset_name="the_met",
)
# Show which tables the dataset exposes.
bq_assistant.list_tables()

['images', 'objects', 'vision_api_data']

In [101]:
# Raise pandas' row display limit to 150 so long tables are less likely to be truncated.
pd.options.display.max_rows = 150
# Column names, types and modes of the `objects` table.
bq_assistant.table_schema('objects')

Unnamed: 0,name,type,mode,description
0,object_number,STRING,NULLABLE,
1,is_highlight,BOOLEAN,NULLABLE,
2,is_public_domain,BOOLEAN,NULLABLE,
3,object_id,INTEGER,NULLABLE,
4,department,STRING,NULLABLE,
5,object_name,STRING,NULLABLE,
6,title,STRING,NULLABLE,
7,culture,STRING,NULLABLE,
8,period,STRING,NULLABLE,
9,dynasty,STRING,NULLABLE,


## Exploratory Queries for Photos

In [102]:
# Exploratory query: catalogue metadata from `objects` joined to the Vision API
# crop hints in `vision_api_data`. The UNNEST yields one row per crop hint, so
# an object with several hints appears several times.
#
# AND binds tighter than OR in BigQuery, so the filter groups as
#     (department AND classification IN (...)) OR classification LIKE ...
# The parentheses below write that grouping out explicitly.
# NOTE(review): the LIKE branch is therefore NOT limited to the "Photographs"
# department -- confirm that including other departments is intended.
images_url_query = """SELECT
a.object_id,
a.object_number,
a.title,
a.artist_display_name,
a.object_name,
a.object_date,
a.object_begin_date,
a.object_end_date,
a.medium,
a.dimensions,
a.classification,
b.cropImportanceFraction,
b.cropConfidence,
b.cropBB,
b.cropHintsAnnotation
FROM `bigquery-public-data.the_met.objects` a
JOIN (
  SELECT object_id, cropHintsAnnotation,
  cropHints.importanceFraction as cropImportanceFraction,
  cropHints.confidence as cropConfidence,
  cropHints.boundingPoly.vertices as cropBB
  FROM `bigquery-public-data.the_met.vision_api_data`,
  UNNEST(cropHintsAnnotation.cropHints) cropHints
) b
ON a.object_id = b.object_id
WHERE (
  a.department = "Photographs" AND
  a.classification IN ("Transparencies", "Photographs", "Negatives")
) OR
a.classification LIKE "Photographs%"
        """
# max_gb_scanned is the scan-size limit (in GB) passed to the helper for this query.
images_url_response = met.query_to_pandas_safe(images_url_query, max_gb_scanned=20)

In [103]:
# Preview the first 100 rows of the joined metadata / crop-hint result.
images_url_response.head(n=100)

Unnamed: 0,object_id,object_number,title,artist_display_name,object_name,object_date,object_begin_date,object_end_date,medium,dimensions,classification,cropImportanceFraction,cropConfidence,cropBB,cropHintsAnnotation
0,261621,1975.633.1,Jean Baptiste Camille Corot,Étienne Carjat,Photograph,1870,1870,1870,Albumen silver print,,Photographs,1.0,0.8,"[{'y': 0, 'x': 0}, {'y': 0, 'x': 2964}, {'y': ...","{'cropHints': [{'importanceFraction': 1.0, 'co..."
1,263236,1981.1229.6.6,Thebes. Médinet-Habou. Partie orientale du Pér...,Maxime Du Camp|Imprimerie photographique de Bl...,Photograph,1850,1850,1850,Salted paper print (Blanquart-Évrard process) ...,,Photographs,1.0,0.8,"[{'y': 0, 'x': 0}, {'y': 0, 'x': 2838}, {'y': ...","{'cropHints': [{'importanceFraction': 1.0, 'co..."
2,268216,33.65.403,"Chattanooga, Tennessee. Government Stable",George N. Barnard|Mathew B. Brady,Photograph,ca. 1864,1862,1866,Albumen silver print from glass negative,,Photographs,1.0,0.8,"[{'y': 0, 'x': 0}, {'y': 0, 'x': 1042}, {'y': ...","{'cropHints': [{'importanceFraction': 1.0, 'co..."
3,260305,1972.633.91,Miss L.L.L.,Frank Eugene,Photograph,1900s,1900,1909,Platinum print,,Photographs,1.37,1.0,"[{'y': 0, 'x': 0}, {'y': 0, 'x': 1454}, {'y': ...",{'cropHints': [{'importanceFraction': 1.370000...
4,260173,1972.633.162,Anne Köninger and Frederick L. Smith,Frank Eugene,Photograph,1912,1912,1912,Platinum print,,Photographs,0.48,1.0,"[{'y': 0, 'x': 0}, {'y': 0, 'x': 1497}, {'y': ...",{'cropHints': [{'importanceFraction': 0.479999...
5,289262,17.3.3581.1,[Lacock Abbey],William Henry Fox Talbot,Photograph,1839–40,1839,1840,Photogenic drawing from paper negative,Image: 19.6 x 25 cm (7 11/16 x 9 13/16 in.) Sh...,Photographs,1.0,0.8,"[{'y': 0, 'x': 0}, {'y': 0, 'x': 3437}, {'y': ...","{'cropHints': [{'importanceFraction': 1.0, 'co..."
6,289196,36.37 (7),Villaggio,William Henry Fox Talbot,Photograph,1839,1839,1839,Photogenic drawing from hand-drawn glass negat...,15.2 x 10.1 cm (6 x 4 in.),Photographs,1.0,0.8,"[{'y': 0, 'x': 0}, {'y': 0, 'x': 2822}, {'y': ...","{'cropHints': [{'importanceFraction': 1.0, 'co..."
7,286364,2005.100.982–.988,"[Group of 7 Thoughtographs, or Psychic Photogr...",Charles Lacey,Photographs,1894–98,1894,1898,Albumen and gelatin silver prints,Each approx. 4 x 3,Photographs,1.0,0.8,"[{'y': 0, 'x': 0}, {'y': 0, 'x': 3009}, {'y': ...","{'cropHints': [{'importanceFraction': 1.0, 'co..."
8,282180,1995.96.28,"Abbaye aux Dames et Hospice, Caen",Edmond Bacot,Photograph,1852–54,1852,1854,Salted paper print from glass negative,25.8 x 34.4 cm (10 3/16 x 13 9/16 in. ),Photographs,0.99,0.8,"[{'y': 0, 'x': 0}, {'y': 0, 'x': 3932}, {'y': ...",{'cropHints': [{'importanceFraction': 0.989999...
9,669909,2014.715.6,"No. 90. From bluffs at Trempealueau, Wisconsin...",Henry P. Bosse,Photograph,1885,1885,1885,Cyanotype,Sheet: 14 1/2 × 17 3/16 in. (36.8 × 43.7 cm),Photographs,1.0,0.8,"[{'y': 0, 'x': 0}, {'y': 0, 'x': 3898}, {'y': ...","{'cropHints': [{'importanceFraction': 1.0, 'co..."


In [104]:
# Snapshot the exploratory result to disk, without the DataFrame index.
# Write mode 'w' replaces any existing file, so re-running this cell cannot
# append a second copy of the rows (and a second header line) to the CSV.
images_url_response.to_csv(
    r'/data/kaggle/met/photographs-02-08-19.csv',
    index=False,
    sep=',',
    mode='w',
)

## Photographs Query
Extract some basic info about the object from the `objects` table, and `JOIN` with the `images` table to get the relevant GCloud Storage pointer. 

Export a subset of this data to a CSV to be used by a bash script that runs `gsutil cp` to download each image.

In [70]:
# Basic object metadata joined to the `images` table to obtain each object's
# Cloud Storage URL. Only URLs whose lower-cased form ends in '/0.jpg' are kept.
#
# AND binds tighter than OR in BigQuery, so the classification filter groups as
#     (department AND classification IN (...)) OR classification LIKE ...
# The inner parentheses below write that grouping out explicitly.
# NOTE(review): the LIKE branch is therefore NOT limited to the "Photographs"
# department -- confirm that including other departments is intended.
gcs_url_query = """
SELECT a.object_id, a.object_name, a.title, a.object_date, b.gcs_url
FROM `bigquery-public-data.the_met.objects` a
JOIN (
  SELECT object_id, gcs_url
  FROM `bigquery-public-data.the_met.images`
) b
ON a.object_id = b.object_id
WHERE (
  (
    a.department = "Photographs" AND
    a.classification IN ("Transparencies", "Photographs", "Negatives")
  ) OR
  a.classification LIKE "Photographs%"
)
AND ends_with(lower(b.gcs_url), '/0.jpg')
    """
# max_gb_scanned is the scan-size limit (in GB) passed to the helper for this query.
gcs_url_response = met.query_to_pandas_safe(gcs_url_query, max_gb_scanned=50)
gcs_url_response.head(10)

Unnamed: 0,object_id,object_name,title,object_date,gcs_url
0,45195,Photograph,,early 20th century,gs://gcs-public-data--met/45195/0.jpg
1,267891,Panorama,"Ruins of Gallego Flour Mills, Richmond",1865,gs://gcs-public-data--met/267891/0.jpg
2,283176,Photograph,[Hutchinson Family Singers],1845,gs://gcs-public-data--met/283176/0.jpg
3,689997,Panorama,"View from the Sentinel Dome, Yosemite",1865–66,gs://gcs-public-data--met/689997/0.jpg
4,260977,Photograph,LeRoy Beaulieu,ca. 1901,gs://gcs-public-data--met/260977/0.jpg
5,260981,Photograph,Josephine (Portrait of Miss B.),1903,gs://gcs-public-data--met/260981/0.jpg
6,260983,Photograph,Mrs. F. H. Evans,ca. 1901,gs://gcs-public-data--met/260983/0.jpg
7,260984,Photograph,Mrs. F. H. Evans,ca. 1901,gs://gcs-public-data--met/260984/0.jpg
8,260985,Photograph,Mrs. F. H. Evans,ca. 1901,gs://gcs-public-data--met/260985/0.jpg
9,260986,Photograph,Frederick H. Evans,ca. 1901,gs://gcs-public-data--met/260986/0.jpg


In [71]:
# Number of rows returned by the GCS URL query.
len(gcs_url_response.index)

9748

Grab the `object_id` and `gcs_url` columns to be used by the bash download script

In [73]:
# Write headerless "object_id,gcs_url" rows for the bash download script.
# Write mode 'w' replaces any existing file, so re-running this cell cannot
# append duplicate rows that the script would then download a second time.
gcs_url_response.loc[:, ['object_id', 'gcs_url']].to_csv(
    r'/data/kaggle/met/images.csv',
    header=False,
    index=False,
    sep=',',
    mode='w',
)

Piping a list of URLs into `gsutil cp` ignores the source directory structure and places every download in the same destination folder, so files that share a filename (here every image is named `0.jpg`) overwrite each other. The bash download script below works around this by copying each image separately to a destination named after its `object_id`.

## `gsutil` Download Script

```
#!/bin/bash
# Reads "object_id,gcs_url" rows from images.csv and copies each image to a
# destination path named after its object_id.
while IFS=, read -r object_id gcs_url
do
    # Quote both expansions so each value reaches gsutil as a single argument,
    # with no word splitting or glob expansion by the shell.
    gsutil -m cp -r "$gcs_url" "/data/kaggle/met/images/$object_id"
done < /data/kaggle/met/images.csv
```