# Scrape seals

### Operates on www.cidianwang.com/shuowenjiezi

Finds images for characters listed in missing_chars.csv (characters without images).
When an image for a character cannot be found on zdic.net (scrape_seals.ipynb), this script is used instead: cidianwang.com has a larger collection of images, but scraping it is slower because of the website's structure.

To scrape a manually entered list instead, set *manual* to True in the first cell and update the *manual_url* array. The array is structured as [[character (string), url (string), character index (int)], ...]

Error messages (e.g. cannot find image on website) are logged to **log_2.txt**


In [8]:
import csv
import os
import numpy as np
import requests
from bs4 import BeautifulSoup, SoupStrainer
import re
import cssutils
from PIL import Image
import urllib
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import io
from wand.api import library
import wand.color
import wand.image
import time
import sys
import httplib2
from scipy.ndimage import zoom

# Root folder: holds the character CSV, one sub-folder per character index
# (scrape_image saves into data_dir/<index>/) and the error log.
data_dir = './source'
# scrape_url_base = 'https://hanziyuan.net/#'

# Index page of the site being scraped; get_page_urls appends page names to it.
scrape_url_base = 'https://www.cidianwang.com/shuowenjiezi/'
# NOTE(review): new_image_filetype and retry are not referenced by the code
# visible in this notebook -- confirm whether they are still needed.
new_image_filetype = 'png'
# File NAME of the error log inside data_dir. It must stay a string because
# log_error() joins it onto data_dir each time it appends a message.
log = 'log_2.txt'
retry = True

# When True, main() scrapes the hand-written manual_url list instead of the
# characters found via missing_chars.csv.
manual = False

# Make sure the log file exists without rebinding `log` to a file object
# (which would break log_error) and without leaving a handle open.
# Append mode creates the file when missing and never truncates an existing log.
os.makedirs(data_dir, exist_ok=True)
with open(os.path.join(data_dir, log), 'a', encoding='utf8'):
    pass

In [72]:
# Hand-collected work list used by main() when `manual` is True.
# Each entry is [character, character page URL, character index]; main() passes
# these to scrape_image as (url, char, index).
# Entries with an empty URL are presumably characters for which no page was
# found -- TODO confirm. scrape_image is still called for them and is expected
# to log a failure rather than download anything.
manual_url = [["系", "https://www.cidianwang.com/shuowenjiezi/xi2817.htm", 10],
             ["狗","https://www.cidianwang.com/shuowenjiezi/gou2295.htm",100],
             ["饼","https://www.cidianwang.com/shuowenjiezi/bing4344.htm", 1013],
             ["识", "https://www.cidianwang.com/shuowenjiezi/shi3568.htm", 102],
             ["歉", "https://www.cidianwang.com/shuowenjiezi/qian1873.htm", 1029],
             ["排", "https://www.cidianwang.com/shuowenjiezi/pai1474.htm", 1046],
             ["材", "https://www.cidianwang.com/shuowenjiezi/cai1677.htm", 1065],
             ["骄", "https://www.cidianwang.com/shuowenjiezi/jiao4375.htm", 1068],
             ["阅", "https://www.cidianwang.com/shuowenjiezi/yue4154.htm", 1075],
             ["再", "https://www.cidianwang.com/shuowenjiezi/zai319.htm", 122],
             ["点", "https://www.cidianwang.com/shuowenjiezi/dian2212.htm", 177],
             ["丈", "https://www.cidianwang.com/shuowenjiezi/zhang31.htm", 188],
             ["汽", "https://www.cidianwang.com/shuowenjiezi/qi1945.htm", 210],
             ["踢", "", 215],
             ["洗", "https://www.cidianwang.com/shuowenjiezi/xi2011.htm", 244],
             ["唱", "https://www.cidianwang.com/shuowenjiezi/chang624.htm", 293],
             ["报", "https://www.cidianwang.com/shuowenjiezi/bao1406.htm", 310],
             ["睛", "", 312],
             ["糖", "https://www.cidianwang.com/shuowenjiezi/tang2812.htm", 366],
             ["根", "https://www.cidianwang.com/shuowenjiezi/gen1760.htm", 418],
             ["参", "", 424],
             ["选", "https://www.cidianwang.com/shuowenjiezi/xuan3876.htm", 460],
             ["辆", "", 494],
             ["邮", "https://www.cidianwang.com/shuowenjiezi/you3936.htm", 495],
             ["调", "https://www.cidianwang.com/shuowenjiezi/tiao3620.htm", 537],
             ["裤", "", 604],
             ["孤", "https://www.cidianwang.com/shuowenjiezi/gu921.htm", 672],
             ["乓", "", 685],
             ["熟", "", 715],
             ["忆", "", 771],
             ["倍", "https://www.cidianwang.com/shuowenjiezi/bei234.htm", 772],
             ["擦", "", 775],
             ["龄", "https://www.cidianwang.com/shuowenjiezi/ling4540.htm", 776],
             ["尊", "https://www.cidianwang.com/shuowenjiezi/zun988.htm", 789],
             ["速", "https://www.cidianwang.com/shuowenjiezi/su3889.htm", 820],
             ["堵", "https://www.cidianwang.com/shuowenjiezi/du763.htm", 845],
             ["租", "https://www.cidianwang.com/shuowenjiezi/zu2656.htm", 85],
             ["植", "https://www.cidianwang.com/shuowenjiezi/zhi1811.htm", 853],
             ["咳", "https://www.cidianwang.com/shuowenjiezi/ke590.htm", 860],
             ["针", "https://www.cidianwang.com/shuowenjiezi/nie10040.htm", 864],
             ["住", "", 87],
             ["许", "https://www.cidianwang.com/shuowenjiezi/xu3557.htm", 880],
             ["族", "https://www.cidianwang.com/shuowenjiezi/zu1580.htm", 885],
             ["钥", "", 906],
             ["盐", "https://www.cidianwang.com/shuowenjiezi/yan2509.htm", 914],
             ["聊", "https://www.cidianwang.com/shuowenjiezi/liao3014.htm", 935],
             ["够", "", 958],
             ["那", "https://www.cidianwang.com/shuowenjiezi/na3932.htm", 97],
             ["释", "https://www.cidianwang.com/shuowenjiezi/shi4022.htm", 999]]

In [77]:
def log_error(error):
    """Append *error* as a single line to the log file inside ``data_dir``.

    The file is opened in append mode with UTF-8 encoding, so earlier
    messages are kept; the ``with`` block closes it again on exit.
    """
    log_path = os.path.join(data_dir, log)
    line = error + '\n'
    with open(log_path, 'a', encoding='utf8') as log_file:
        log_file.write(line)

def load_data(csv):
    """Load the character index table from a CSV file inside ``data_dir``.

    Args:
        csv: File name of the comma-separated table, relative to ``data_dir``.
            (The parameter name shadows the ``csv`` module inside this
            function; it is kept for backward compatibility.)

    Returns:
        A 1-D NumPy array with one element per CSV row. The table is read
        with ``dtype=None``, so each column's type is inferred by NumPy.
        Callers in this notebook treat the first column as the character
        and the second as its index.
    """
    csv_path = os.path.join(data_dir, csv)
    data = np.genfromtxt(csv_path, delimiter=',', encoding='utf8', dtype=None)

    # genfromtxt collapses a one-row file into a 0-d array, for which
    # shape[0], len() and iteration all fail. Promote it to one dimension so
    # a CSV holding a single character is handled like any other.
    data = np.atleast_1d(data)

    print(f"Imported {data.shape[0]} characters from index {csv}")

    return data

def scrape_image(url, char, index):
    """Download the seal image from a character page and convert it to PNG.

    The page at *url* is fetched, its ``<img class="jzimg">`` element is
    located, and the referenced image is saved as
    ``data_dir/<index>/<index>_1.gif`` before being handed to
    ``format_save_image``.

    Any failure (empty URL, HTTP error, image element missing, download or
    conversion error) is printed and appended to the log via ``log_error``
    instead of being raised, so one bad character does not stop the run.
    Ctrl-C / interpreter shutdown are deliberately NOT swallowed.

    Args:
        url: Address of the character's page; may be empty when unknown.
        char: The character itself (currently unused; kept for the callers).
        index: Character index, used for the output folder and file name.
    """
    from urllib.parse import urljoin

    # Browser-like header: the site answers 403 Forbidden without it.
    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36"}

    try:
        if not url:
            raise ValueError("no page URL available for this character")

        # A timeout stops one unresponsive request from hanging the whole run.
        page = requests.get(url, headers=headers, timeout=30)
        page.raise_for_status()
        soup = BeautifulSoup(page.content, "html.parser")
        results = soup.find("img", {"class": "jzimg"})
        if results is None or not results.attrs.get("src"):
            raise LookupError(f"no <img class='jzimg'> with a src found on {url}")

        # The src attribute may be relative to the page; resolve it.
        img_url = urljoin(url, results.attrs["src"])
        print(img_url)

        out_dir = os.path.join(data_dir, str(index))
        os.makedirs(out_dir, exist_ok=True)
        gif_filename = os.path.join(out_dir, f"{index}_1.gif")

        # Download with the same client and headers as the page request,
        # rather than installing a process-wide urllib opener on every call.
        image = requests.get(img_url, headers=headers, timeout=30)
        image.raise_for_status()
        with open(gif_filename, 'wb') as gif_file:
            gif_file.write(image.content)
        print(gif_filename)

        format_save_image(gif_filename)
    except Exception as e:
        # repr() keeps both the exception type and its message in the log.
        msg = f'Char {index}. Failed to obtain with error: {e!r}'
        print(msg)
        log_error(msg)

def get_page_urls(url, data):
    """Collect the character-page URLs for every wanted character.

    The index page at *url* is fetched and each ``<a>`` element with a
    ``title`` is inspected; the first character of the title identifies the
    character the link points to.

    Args:
        url: Address of the index page listing all characters.
        data: Table whose rows hold the wanted character in the first column
            and its character index in the second (as returned by
            ``load_data``).

    Returns:
        A list of ``[character, page_url, character_index]`` lists in page
        order, one per wanted character found on the page. Every entry
        carries an index, and a character linked several times on the page
        is listed once (its first usable link wins).
    """
    # First index listed for each wanted character; a dict gives O(1) lookups
    # and guarantees that every returned entry has its index attached.
    index_by_char = {}
    for row in np.atleast_1d(data):
        index_by_char.setdefault(str(row[0]), row[1])

    # ADD BROWSER HEADER TO AVOID 403 FORBIDDEN HTML ERROR
    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36"}
    page = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(page.content, "html.parser")

    url_char_list = []
    already_listed = set()

    for link in soup.find_all("a"):
        title = link.attrs.get("title")
        href = link.attrs.get("href")
        if not title or not href:
            continue  # not a character link

        char = title[0]
        if char not in index_by_char or char in already_listed:
            continue

        # Keep what follows the first "/" found from position 2 onwards and
        # append it to the base URL. Links without such a "/" are skipped
        # rather than aborting the whole scan.
        cut = href.find("/", 2)
        if cut == -1:
            continue

        already_listed.add(char)
        search_url = scrape_url_base + href[cut + 1:]
        url_char_list.append([char, search_url, index_by_char[char]])

    return url_char_list

# Zoom about the centre of an image
def clipped_zoom(img, zoom_factor, **kwargs):
    """Zoom *img* about its centre.

    Args:
        img: Array whose first two axes are height and width; any further
            axes (e.g. colour channels) are left unscaled.
        zoom_factor: Greater than 1 zooms in on the central region, less
            than 1 shrinks the image onto a zero-filled canvas the size of
            the input, anything else returns *img* itself.
        **kwargs: Passed straight through to ``scipy.ndimage.zoom``.

    Returns:
        The zoomed array (or *img* unchanged when no zoom applies).
    """
    rows, cols = img.shape[:2]

    # Scale only height and width; trailing axes get a factor of 1.
    axis_factors = (zoom_factor, zoom_factor) + (1,) * (img.ndim - 2)

    if zoom_factor > 1:
        # Central window of the input that is enlarged.
        win_h = int(np.round(rows / zoom_factor))
        win_w = int(np.round(cols / zoom_factor))
        win_top = (rows - win_h) // 2
        win_left = (cols - win_w) // 2
        window = img[win_top:win_top + win_h, win_left:win_left + win_w]

        enlarged = zoom(window, axis_factors, **kwargs)

        # Rounding can leave the result a little larger than the input;
        # drop the surplus evenly from the edges.
        skip_rows = (enlarged.shape[0] - rows) // 2
        skip_cols = (enlarged.shape[1] - cols) // 2
        return enlarged[skip_rows:skip_rows + rows, skip_cols:skip_cols + cols]

    if zoom_factor < 1:
        # Where the shrunken image sits inside the zero-filled canvas.
        small_h = int(np.round(rows * zoom_factor))
        small_w = int(np.round(cols * zoom_factor))
        offset_y = (rows - small_h) // 2
        offset_x = (cols - small_w) // 2

        canvas = np.zeros_like(img)
        canvas[offset_y:offset_y + small_h, offset_x:offset_x + small_w] = zoom(img, axis_factors, **kwargs)
        return canvas

    # No zoom requested: hand back the input array itself.
    return img

# Centre im on a white (255) canvas of size shape_to, cropping whatever does not fit
def pad_image(shape_to, shape_from, im):
    """Return *im* centred on a white canvas of exactly ``shape_to``.

    Where the image is smaller than the target along an axis it is
    surrounded by 255-valued pixels; where it is larger, its central part is
    kept. The margin is derived from the actual sizes, so any target size
    works (including odd sizes and targets far larger than the image).

    Args:
        shape_to: ``(height, width)`` of the returned array.
        shape_from: Size of *im*. Unused -- the size is read from ``im.shape``
            -- but kept so existing callers keep working.
        im: Image array whose first two axes are height and width.

    Returns:
        A new array of shape ``shape_to`` (plus any trailing axes of *im*)
        with the same dtype as *im*.
    """
    target_h, target_w = int(shape_to[0]), int(shape_to[1])
    src_h, src_w = im.shape[:2]

    # Position of the output window's top-left corner in image coordinates.
    # Negative means a white margin on that side, positive means the image
    # is cropped there.
    top = src_h // 2 - target_h // 2
    left = src_w // 2 - target_w // 2

    canvas = np.full((target_h, target_w) + im.shape[2:], 255, dtype=im.dtype)

    # Part of the image that falls inside the output window.
    row_start, row_stop = max(top, 0), min(top + target_h, src_h)
    col_start, col_stop = max(left, 0), min(left + target_w, src_w)

    if row_start < row_stop and col_start < col_stop:
        canvas[row_start - top:row_stop - top,
               col_start - left:col_stop - left] = im[row_start:row_stop, col_start:col_stop]

    return canvas

def format_save_image(filename):
    """Clean up a downloaded image and store it as a PNG next to the original.

    Steps: convert to greyscale, binarise with a fixed threshold (which
    removes the watermark), centre on a 512x512 white canvas, zoom in 2x
    about the centre, save as ``<same name>.png`` and finally delete the
    source file.

    Args:
        filename: Path of the downloaded image (a GIF in this notebook).
    """
    # The context manager releases the file handle, which matters because
    # the file is deleted below.
    with Image.open(filename) as source:
        im = np.array(source.convert('L'))  # writable greyscale copy

    # remove watermark through thresholding
    thres = 140
    light = im > thres
    im[light] = 255
    im[~light] = 0

    im_padded = pad_image((512, 512), im.shape, im)
    im_zoomed = clipped_zoom(im_padded, 2)

    # splitext only strips the final extension, so dots elsewhere in the
    # path (e.g. "../data/1.gif") cannot truncate the file name.
    new_image_path = os.path.splitext(filename)[0] + '.png'

    plt.imsave(new_image_path, im_zoomed, cmap='gray')

    # Delete the source only after the PNG was written, and never when the
    # source itself already is that PNG.
    if new_image_path != filename and os.path.exists(filename):
        os.remove(filename)

    print(f"Saved new image to {new_image_path}")
    

In [55]:
def main(manual=False):
    """Scrape an image for every character on the work list.

    With ``manual`` false the work list is built from ``missing_chars.csv``
    and the site's index page; otherwise the hand-written ``manual_url`` list
    is used. Each work-list entry is read as ``[character, url, index]``.
    """
    if manual:
        url_list = manual_url
        print(url_list)
    else:
        data = load_data('missing_chars.csv')  # Import characters to scrape
        print(data)
        url_list = get_page_urls(scrape_url_base, data)

    for entry in url_list:
        char, page_url, char_index = entry[0], entry[1], entry[2]
        print(f'Searching for character {char} index {char_index}')
        scrape_image(page_url, char, char_index)


main(manual)

In [None]:
### FOR TESTING
# Applies the same greyscale / threshold / pad / zoom steps as
# format_save_image to one downloaded GIF and displays the result instead of
# saving it.

sample_gif = os.path.join(data_dir, "756", "756_1.gif")
img = Image.open(sample_gif).convert('L')

im = np.asarray(img)
print(im.shape)
print(im)

# np.asarray gives a read-only view, so work on a copy
im = im.copy()

# remove watermark through thresholding
thres = 140
im[im > thres] = 255
im[im <= thres] = 0

im_padded = pad_image((512, 512), im.shape, im)

# Earlier experiment, kept for reference:
# im_downsampled = np.zeros((im_padded.shape[0]//2, im_padded.shape[1]//2))
# for i in range(im_padded.shape[0]):
#     for j in range(im_padded.shape[1]):
#         im_downsampled[i//2, j//2] = im_padded[i, j]

im_zoomed = clipped_zoom(im_padded, 2)

plt.imshow(im_zoomed, cmap='gray')
plt.show()