# ILLUSTRATION EXTRACTION

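This notebook reads the results of the segmentation model (one JSON prediction file per page image), crops every predicted illustration out of its page image, and saves each crop as a separate image file. It also records, for each illustration, the fraction of the page area it covers.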


In [1]:
# Some imports
import pandas as pd
from PIL import Image
import os
import glob
import json
import cv2
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from tqdm import tqdm

In [3]:
# Mounting Google Drive content
# `google.colab` is only available inside a Colab runtime.
from google.colab import drive
# force_remount=True asks Colab to remount even when the drive is already mounted,
# so this cell can be re-run within the same session.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [4]:
# Go into the directory where your notebook is
%cd gdrive/My Drive/newspaper-navigator/newspaper-navigator/notebooks

/content/gdrive/My Drive/newspaper-navigator/newspaper-navigator/notebooks


In [5]:
# Defining target directories
# NOTE: IMAGES_DIR is resolved from the notebooks directory. The notebook then calls
# os.chdir(IMAGES_DIR) before using the other paths, so PREDICTIONS_DIR, ILLUS_DIR and
# ILLUS_RATIO are resolved relative to the images directory, not the notebooks directory.
IMAGES_DIR = '../DFKV_data/DFKV/' # where the images are
PREDICTIONS_DIR = '../../DFKV_output/DFKV/' # where the segmentation outputs are (json files)
ILLUS_DIR = '../../DFKV_illustrations/DFKV/' # where we will save the illustrations
ILLUS_RATIO = '../../DFKV_output/DFKV/illu_ratios.json' # where we will save the illustrations ratios
ILLU_THRESHOLD = 0.99 # a predicted box is kept only if its score is strictly greater than this

In [None]:
# Navigate (from the /notebook directory) to the directory of the images
# and find the list of all the images.
# NOTE: IMAGES_DIR is a relative path, so this cell must be run exactly once per
# session, while the working directory is still the notebooks directory.
os.chdir(IMAGES_DIR)
# glob returns matches in arbitrary order; sort them so that every run processes
# the pages in the same, reproducible order.
all_images = sorted(glob.glob("*.jpg"))

In [None]:
# Where we save the percentage of the illustration's place on the page
# (maps the illustration name to area_illustration / area_whole_page)
im_ratio = dict()

# Make sure the output directory exists before saving the first crop
os.makedirs(ILLUS_DIR, exist_ok=True)

# Go through all the images
for fn in tqdm(all_images):
    # Load the predictions of this page. Only the extension is swapped (a plain
    # str.replace would also alter a '.jpg' appearing earlier in the file name),
    # and the context manager closes the file as soon as it has been parsed.
    pred_path = PREDICTIONS_DIR + os.path.splitext(fn)[0] + '.json'
    with open(pred_path) as f:
        preds = json.load(f)

    # Open the page image; it is closed once all of its illustrations are saved,
    # so file handles do not accumulate over the whole collection.
    with Image.open(fn) as image:
        # PIL reports the size as (width, height)
        im_w, im_h = image.size
        page_area = im_w * im_h

        # go through all predicted illustrations
        for i, (box, score) in enumerate(zip(preds['boxes'], preds['scores'])):
            # keep only confident predictions
            if not score > ILLU_THRESHOLD:
                continue

            # Scale the box to pixel coordinates.
            # NOTE(review): boxes are assumed to be (x1, y1, x2, y2) expressed as
            # fractions of the page width/height -- confirm against the code that
            # writes the prediction files.
            left, top = im_w * box[0], im_h * box[1]
            right, bottom = im_w * box[2], im_h * box[3]

            illu_name = '_'.join(['ILLU', preds['doc_id'], preds['page_number'], str(i)])

            # crop and save illustration
            illu = image.crop((left, top, right, bottom))
            illu.save(ILLUS_DIR + illu_name + '.jpg')

            # compute ratio of place of the illustration : area_illustration/area_whole_page
            im_ratio[illu_name] = (right - left) * (bottom - top) / page_area

# Persist all the ratios in a single json file
with open(ILLUS_RATIO, 'w') as outfile:
    json.dump(im_ratio, outfile)

In [10]:
# Reload the illustration ratios. ILLUS_RATIO is the path the extraction cell writes to,
# so reading through the same constant guarantees the file that was just saved is the
# one that gets loaded (both are resolved from the current working directory).
with open(ILLUS_RATIO, 'r') as f:
    data = json.load(f)