In [9]:
from ppi.image_metadata_crawler import LibraryOfCongressCrawler
from ppi.medium_mapper import MediumMapper
from ppi.database import Database
from ppi.image_downloader   import ImageDownloader
import yaml

# Load the project settings. The encoding is given explicitly so that parsing
# does not depend on the platform's default locale encoding.
with open("config.yaml", "r", encoding="utf-8") as yamlfile:
    # NOTE(review): FullLoader resolves more tags than SafeLoader. Keep
    # config.yaml a trusted, local file, or switch to yaml.safe_load if the
    # file only contains plain YAML.
    config = yaml.load(yamlfile, Loader=yaml.FullLoader)

# Database handle passed to the crawler, mapper and downloader cells below.
database = Database(db_name=config["db_name"])

# Library of Congress search URL for "Cyanotypes."; it ends in "sp=" and is
# passed to the crawler as `prefix_url_search` (the logged page URLs end in
# "sp=1", so the page number is presumably appended to it).
LIBRARY_OF_CONGRESS_URL_PREFIX = "https://www.loc.gov/pictures/search/?va=exact&q=Cyanotypes.&fa=displayed%3Aanywhere&fi=format&sg=true&op=EQUAL&sp="

## Download images metadata

Pass the crawler a URL to retrieve the metadata of all images contained in that page. The metadata is stored in a SQLite DB.

The following Crawler is used in this example:
- [Library of Congress](https://www.loc.gov/)

Note: the following Crawlers are deprecated (due to underlying page format changes):
- [Getty Search Gateway](https://search.getty.edu/gateway/landing)
- [Cornell University Digital Library](https://digital.library.cornell.edu/)
- [Eastman Museum](https://collections.eastman.org/collections)

You need to implement them yourself if needed.

In [10]:
# Crawl the Library of Congress search results (first_page=1 to last_page=3)
# and record the metadata of each image found in the database.
congress_crawler = LibraryOfCongressCrawler(database=database, config=config)
congress_crawler.save_pages_img_url_metadata(
    first_page=1,
    last_page=3,
    prefix_url_search=LIBRARY_OF_CONGRESS_URL_PREFIX,
)

[32m2023-10-05 23:05:20.411[0m | [1mINFO    [0m | [36mppi.image_metadata_crawler[0m:[36msave_pages_img_url_metadata[0m:[36m90[0m - [1mProcessing page https://www.loc.gov/pictures/search/?va=exact&q=Cyanotypes.&fa=displayed%3Aanywhere&fi=format&sg=true&op=EQUAL&sp=1[0m
[32m2023-10-05 23:05:22.881[0m | [1mINFO    [0m | [36mppi.image_metadata_crawler[0m:[36msave_pages_img_url_metadata[0m:[36m106[0m - [1mAdded image metadata: https://www.loc.gov/pictures/item/2007662712/[0m
[32m2023-10-05 23:05:24.216[0m | [1mINFO    [0m | [36mppi.image_metadata_crawler[0m:[36msave_pages_img_url_metadata[0m:[36m106[0m - [1mAdded image metadata: https://www.loc.gov/pictures/item/2007664012/[0m
[32m2023-10-05 23:05:25.527[0m | [1mINFO    [0m | [36mppi.image_metadata_crawler[0m:[36msave_pages_img_url_metadata[0m:[36m106[0m - [1mAdded image metadata: https://www.loc.gov/pictures/item/2007664026/[0m
[32m2023-10-05 23:05:26.841[0m | [1mINFO    [0m | [36mppi.im

## Standardize Photographic Processes descriptions

There is code in ``medium_mapper.py`` to map the source descriptions to the predefined descriptions in ``config.yaml``.

In [11]:
# Build the mapper; the output below shows that this also logs the current
# per-medium stats.
mapper = MediumMapper(
    database=database,
    config=config,
)

[32m2023-10-05 23:06:48.415[0m | [1mINFO    [0m | [36mppi.medium_mapper[0m:[36mshow_stats[0m:[36m128[0m - [1mCurrent stats: 
   new_medium  count
0  CYANOTYPE     60[0m


Run the cell below. If the results are OK, move on to the next cell. Otherwise, adjust the ```propose_mapping``` method in ```medium_mapper.py```.

In [12]:
# Report source medium descriptions that still lack a mapping; when there are
# none, the log reads "No undefined medium descriptions."
mapper.show_undefined_mappings()

[32m2023-10-05 23:06:48.419[0m | [1mINFO    [0m | [36mppi.medium_mapper[0m:[36mshow_undefined_mappings[0m:[36m150[0m - [1mNo undefined medium descriptions.[0m


In [13]:
# Write the standardized medium values back to the database.
mapper.update_mediums()

## Download images to disk

In [14]:
# Download the images whose metadata is in the database. The log shows the
# downloader working through one medium at a time.
# NOTE(review): max_number_downloads presumably caps downloads per medium —
# confirm against ImageDownloader.download_images.
image_download = ImageDownloader(database=database, config=config)
image_download.download_images(
    max_number_downloads=10,
)

[32m2023-10-05 23:06:48.450[0m | [1mINFO    [0m | [36mppi.image_downloader[0m:[36mdownload_images[0m:[36m46[0m - [1mDownloading images for medium ALBUMEN_PRINT[0m
[32m2023-10-05 23:06:48.452[0m | [1mINFO    [0m | [36mppi.image_downloader[0m:[36mdownload_images[0m:[36m72[0m - [1mNo more images to download[0m
[32m2023-10-05 23:06:48.452[0m | [1mINFO    [0m | [36mppi.image_downloader[0m:[36mdownload_images[0m:[36m46[0m - [1mDownloading images for medium AMBROTYPE_TINTYPE_FERROTYPE[0m
[32m2023-10-05 23:06:48.453[0m | [1mINFO    [0m | [36mppi.image_downloader[0m:[36mdownload_images[0m:[36m72[0m - [1mNo more images to download[0m
[32m2023-10-05 23:06:48.454[0m | [1mINFO    [0m | [36mppi.image_downloader[0m:[36mdownload_images[0m:[36m46[0m - [1mDownloading images for medium CARBON_PRINT[0m
[32m2023-10-05 23:06:48.455[0m | [1mINFO    [0m | [36mppi.image_downloader[0m:[36mdownload_images[0m:[36m72[0m - [1mNo more images to d

## Prepare images for Deep Learning

Images must be MANUALLY cropped, as exemplified below (we only want to keep "relevant" information):

<img src = "ppi/images/GettyCrawler_49753.jpg" width="180" height="180">
<img src = "ppi/images/GettyCrawler_49753_crop.jpg" width="180" height="180">
