In [2]:
import logging
import os
from io import BytesIO

import cv2
import numpy as np
from joblib import Parallel, delayed
from matplotlib import pyplot as plt
import pandas as pd
from PIL import Image
import requests
from tqdm.auto import tqdm
import urllib3

# Load data

In [9]:
# Read the catalogue CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('output/cdli_catalogue_data.csv')

In [None]:
# Drop rows without an era label, then shuffle every remaining row
# with a fixed seed so the ordering is reproducible.
df = df.loc[df['era'].notna()].copy()
df = df.sample(n=len(df), random_state=0)

# Look at the dataset - Minor EDA

In [3]:
# Show the frame (pandas truncates the rendering to the first and last rows).
df

Unnamed: 0,genre,id,id_text,period,photo_up,subgenre,period_normed,era,provenience_normed,height,width,thickness,year_range,provenience
69980,,230251,395520,Neo-Assyrian (ca. 911-612 BC),600ppi 20160630,,Neo-Assyrian,iron,Nineveh,,,,911-612 BC,Nineveh (mod. Kuyunjik)
8270,Administrative,22387,113693,Ur III (ca. 2100-2000 BC),600ppi 20160630,,Ur III,early_bronze,Puzris-Dagan,?,?,?,2100-2000 BC,Puzriš-Dagan (mod. Drehem)
75496,,235802,401195,Neo-Assyrian (ca. 911-612 BC),600ppi 20160630,,Neo-Assyrian,iron,Nineveh,,,,911-612 BC,Nineveh (mod. Kuyunjik)
63859,,208773,373215,Neo-Babylonian (ca. 626-539 BC),600ppi 20160630,,Neo-Babylonian,iron,Uruk,,,,626-539 BC,Uruk (mod. Warka)
36855,Administrative,104985,255428,Middle Babylonian (ca. 1400-1100 BC),600ppi 20160630,,Middle Babylonian,mid_late_bronze,Nippur,?,?,?,1400-1100 BC,Nippur (mod. Nuffar)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21268,Administrative,66459,215261,Old Akkadian (ca. 2340-2200 BC),600ppi 20160913,,Old Akkadian,early_bronze,Umma,?,?,?,2340-2200 BC,Umma (mod. Tell Jokha)
46062,Royal/Monumental,117042,268908,Early Old Babylonian (ca. 2000-1900 BC),600ppi 20160913,witness,Early Old Babylonian,mid_late_bronze,Nippur,?,?,?,2000-1900 BC,Nippur (mod. Nuffar)
42784,Literary,112039,262869,Old Babylonian (ca. 1900-1600 BC),600ppi 20160630,ETCSL 4.80.02 Kesh Temple Hymn ('Decad no. 06'...,Old Babylonian,mid_late_bronze,Nippur,?,?,?,1900-1600 BC,Nippur (mod. Nuffar)
43738,Royal/Monumental,113774,264941,ED IIIb (ca. 2500-2340 BC),600ppi 20160913,witness,ED IIIb,early_bronze,Nippur,?,?,?,2500-2340 BC,Nippur (mod. Nuffar)


In [8]:
# Per-column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 97123 entries, 69980 to 68586
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   genre               75762 non-null  object
 1   id                  97123 non-null  int64 
 2   id_text             97123 non-null  int64 
 3   period              97123 non-null  object
 4   photo_up            97123 non-null  object
 5   subgenre            16060 non-null  object
 6   period_normed       97123 non-null  object
 7   era                 97123 non-null  object
 8   provenience_normed  97123 non-null  object
 9   height              60451 non-null  object
 10  width               60449 non-null  object
 11  thickness           59416 non-null  object
 12  year_range          94595 non-null  object
 13  provenience         84347 non-null  object
dtypes: int64(2), object(12)
memory usage: 11.1+ MB


In [12]:
# Number of rows per era label.
df['era'].value_counts()

era
early_bronze       36634
iron               30800
mid_late_bronze    29689
Name: count, dtype: int64

In [13]:
# Peek at five randomly chosen rows (unseeded, so the rows differ between runs).
df.sample(n=5)

Unnamed: 0,genre,id,id_text,period,photo_up,subgenre,period_normed,era,provenience_normed,height,width,thickness,year_range,provenience
48047,Administrative,122988,275537,Old Akkadian (ca. 2340-2200 BC),600ppi 20160630,,Old Akkadian,early_bronze,Nippur,?,?,?,2340-2200 BC,Nippur (mod. Nuffar)
40786,,109256,259971,Achaemenid (547-331 BC),600ppi 20160630,,Achaemenid,iron,unknown,,,,,
48048,Administrative,122989,275538,Old Akkadian (ca. 2340-2200 BC),600ppi 20160630,,Old Akkadian,early_bronze,Nippur,?,?,?,2340-2200 BC,Nippur (mod. Nuffar)
77474,,237797,403203,Neo-Assyrian (ca. 911-612 BC),600ppi 20160630,,Neo-Assyrian,iron,Nineveh,,,,911-612 BC,Nineveh (mod. Kuyunjik)
72681,,232967,398314,Neo-Assyrian (ca. 911-612 BC),600ppi 20160630,,Neo-Assyrian,iron,Nineveh,,,,911-612 BC,Nineveh (mod. Kuyunjik)


# Use the CDLI API to scrape the images of the tablets from the dataset

##### Each image is converted to greyscale, scaled to fit within 512×512 (preserving its aspect ratio), and centred on a black 512×512 canvas

In [6]:
def get_photo_url(ID, suffix=''):
    """Build the CDLI download URL for the photo with identifier ``ID``.

    ``suffix`` is inserted between the identifier and the ``.jpg`` extension.
    """
    filename = 'P{}{}.jpg'.format(ID, suffix)
    return 'https://cdli.ucla.edu/dl/photo/' + filename

In [7]:
def get_photo(ID, suffix='', timeout=30):
    """Download a photo from CDLI and return it as a greyscale array.

    The URL built from ``ID`` and ``suffix`` is requested first. If the
    response is not OK, the ``'_d'`` variant and then the ``'_e'`` variant
    are tried in turn.

    Parameters
    ----------
    ID : str or int
        Identifier inserted into the photo URL.
    suffix : str, optional
        Filename suffix to try first (default: no suffix).
    timeout : float, optional
        Seconds to wait on the server before ``requests`` raises, so a
        stalled connection cannot block a worker forever.

    Returns
    -------
    numpy.ndarray or None
        2-D greyscale image, or ``None`` (after logging a warning) when no
        variant could be downloaded.

    Raises
    ------
    requests.RequestException
        On connection errors or timeouts.
    """
    URL = get_photo_url(ID, suffix=suffix)
    # Certificate verification is deliberately left disabled, as before.
    res = requests.get(URL, verify=False, timeout=timeout)

    if res.ok:
        # Normalise to 3-channel RGB first: PIL may decode a file as
        # single-channel, palette or CMYK, and COLOR_RGB2GRAY rejects
        # arrays that are not 3- or 4-channel.
        with Image.open(BytesIO(res.content)) as img:
            photo = np.asarray(img.convert('RGB'))
        return cv2.cvtColor(photo, cv2.COLOR_RGB2GRAY)

    # Fallback chain of filename variants: '' -> '_d' -> '_e'.
    next_suffix = {'': '_d', '_d': '_e'}
    if suffix in next_suffix:
        return get_photo(ID, suffix=next_suffix[suffix], timeout=timeout)

    logging.warning(f'HTTP {res.status_code} on ID {ID}')
    return None

In [8]:
# Target image size in pixels; same value as photo_scrape's default resize_shape.
# NOTE(review): presumably (width, height), matching how photo_scrape reads
# resize_shape; confirm before using a non-square value.
RESIZE_SHAPE = (512, 512)

In [9]:
def photo_scrape(ID, resize_shape=(512,512)):
    """Download one photo, letterbox it to ``resize_shape`` and save it as PNG.

    The image is scaled to fit inside ``resize_shape`` while preserving its
    aspect ratio, centred on a black canvas of exactly that size, and written
    to ``output/images_new/<ID>.png``. IDs whose output file already exists
    are skipped, so the scrape can be resumed.

    Parameters
    ----------
    ID : int or str
        Identifier; left-padded with zeros to six characters.
    resize_shape : tuple of int, optional
        Target ``(width, height)`` in pixels.

    Returns
    -------
    None
        Failures are logged rather than raised, so one bad ID does not abort
        a batch run.
    """
    ID = str(ID).zfill(6)
    OUT_FN = f'output/images_new/{ID}.png'
    if os.path.exists(OUT_FN):
        return

    try:
        P = get_photo(ID)
        if P is None:
            return

        # Defensive: collapse a 3-channel array to greyscale. get_photo builds
        # its arrays from PIL, whose channel order is RGB (not OpenCV's BGR).
        if P.ndim == 3 and P.shape[2] == 3:
            P = cv2.cvtColor(P, cv2.COLOR_RGB2GRAY)

        # Largest scale at which the image still fits inside the target.
        target_w, target_h = resize_shape
        h, w = P.shape[:2]
        scale = min(target_h / h, target_w / w)
        # Clamp to 1 px so extreme aspect ratios cannot yield a zero-sized
        # dimension, which cv2.resize rejects.
        new_w = max(1, int(w * scale))
        new_h = max(1, int(h * scale))

        P_resized = cv2.resize(P, (new_w, new_h), interpolation=cv2.INTER_AREA)

        # Black canvas; numpy arrays are indexed (height, width).
        new_image = np.zeros((target_h, target_w), dtype=np.uint8)

        # Top-left corner that centres the resized image on the canvas.
        top_left_x = (target_w - new_w) // 2
        top_left_y = (target_h - new_h) // 2
        new_image[top_left_y:top_left_y + new_h,
                  top_left_x:top_left_x + new_w] = P_resized

        # Create the output folder on first use; exist_ok tolerates several
        # workers attempting this at the same time.
        os.makedirs(os.path.dirname(OUT_FN), exist_ok=True)
        Image.fromarray(new_image).save(OUT_FN)
    except Exception as e:
        # Best effort: record which ID failed and carry on.
        logging.warning(f'Failed to scrape ID {ID}: {e}')
    except KeyboardInterrupt:
        # Kept from the original: an interrupt ends this call quietly.
        return
    return


In [10]:
# get_photo calls requests.get with verify=False; silence the InsecureRequestWarning
# that urllib3 would otherwise emit for those unverified HTTPS requests.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [None]:
# Scrape every catalogue entry with 10 parallel workers. photo_scrape skips
# files that already exist, so this cell can be re-run to resume.
# The trailing ';' stops the notebook from echoing the returned list of Nones.
Parallel(n_jobs=10)(
    delayed(photo_scrape)(id_text, RESIZE_SHAPE)
    for id_text in tqdm(df['id_text'], total=len(df))
);

  0%|          | 0/97716 [00:00<?, ?it/s]

