# Scrape Seals

### Currently works with the zdic website

Before scraping images, make sure the characters have been added to the main data store with the add_new_batch script.

To scrape only the characters listed in missing_chars.csv, set `retry = True`. To scrape all characters, set `retry = False`.

In [2]:
import csv
import os
import numpy as np
import requests
from bs4 import BeautifulSoup
import re
import cssutils
from PIL import Image
import urllib
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import io
from wand.api import library
import wand.color
import wand.image
import time
import sys

# Directory holding the input CSV files, the per-character image folders and the log.
data_dir = './source'
# scrape_url_base = 'https://hanziyuan.net/#'
# Candidate dictionary sites. The active one is selected via `scrape_url_base` below.
source = {
    1: 'https://www.zdic.net/hans/',
    2: 'https://www.cidianwang.com/shuowenjiezi/'
}
scrape_url_base = 'https://www.zdic.net/hans/'
new_image_filetype = 'png'
# Filename (relative to data_dir) of the error log. This must stay a string:
# log_error() joins it onto data_dir every time it writes.
log = 'log.txt'
retry = False

# Create an empty log file on first run. The file is opened and closed
# immediately so that `log` keeps referring to the filename rather than to an
# open (and leaked) file handle.
os.makedirs(data_dir, exist_ok=True)
if not os.path.exists(os.path.join(data_dir, log)):
    with open(os.path.join(data_dir, log), 'x', encoding='utf8'):
        pass

In [3]:
def load_data(file):
    """Load a comma-separated file from ``data_dir`` into a NumPy array.

    Args:
        file: Filename relative to ``data_dir``.

    Returns:
        The parsed array, always with at least one dimension. Column types are
        inferred by NumPy (``dtype=None``), so a file with mixed column types
        yields a 1-D structured array with one record per row.
    """
    char_array = np.genfromtxt(
        os.path.join(data_dir, file), delimiter=',', encoding='utf8', dtype=None
    )
    # genfromtxt collapses a single-row, mixed-type file to a 0-d array, which
    # has no shape[0] and cannot be iterated; normalise it to one dimension.
    char_array = np.atleast_1d(char_array)
    print(f"Imported {char_array.shape[0]} characters ")

    return char_array

def log_error(error):
    """Append one line to the log file located at ``data_dir``/``log``.

    Args:
        error: Message text; a trailing newline is added.
    """
    # The context manager closes the file; no explicit close() is needed.
    with open(os.path.join(data_dir, log), 'a', encoding='utf8') as f:
        f.write(error + '\n')
        

In [19]:
# Get the image url from the site for given character
def scrape_image(url, char, index):
    """Fetch a dictionary page and collect the seal-script image URLs on it.

    Args:
        url: Full page URL for the character.
        char: The character being looked up (currently unused here; kept for
            interface compatibility).
        index: Identifier of the character, used only in the log message.

    Returns:
        A list of the ``data-original`` attribute values that contain
        ``'swxz'``, or ``None`` (after logging) when no such image is found.

    Raises:
        requests.RequestException: On network failure or timeout.
    """
    # A timeout keeps one unresponsive request from stalling the whole batch.
    page = requests.get(url, timeout=30)
    soup = BeautifulSoup(page.content, "html.parser")

    url_list = []
    for img in soup.find_all("img", {"class": "lazy kxtimg"}):
        # Skip images that lack the lazy-load attribute instead of raising
        # KeyError and abandoning the remaining candidates.
        attr = img.attrs.get('data-original')

        # Search for different variants
        if attr and 'swxz' in attr:  # or ('swdz' in attr):
            print(attr)
            url_list.append(attr)

    if not url_list:
        log_error(f'Char: {index}. Could not locate seal script image - maybe does not exist')
        return None

    return url_list

# Website is dynamic so path needs to be manually changed
def recreate_img_path(path):
    """Rewrite a protocol-relative image path so its first folder is ``swxz``.

    Everything up to and including the third ``/`` is kept, the segment
    between the third and fourth ``/`` is replaced by ``swxz``, and
    ``https:`` is prepended.

    Args:
        path: Image path as found in the page, e.g. ``//host/folder/a/b.svg``.

    Returns:
        The rewritten URL, or ``None`` (after printing a diagnostic) when the
        path contains fewer than four ``/`` characters.
    """
    slash_positions = [i for i, ch in enumerate(path) if ch == '/']
    try:
        return 'https:' + path[:slash_positions[2] + 1] + 'swxz' + path[slash_positions[3]:]
    except IndexError:
        # Only a too-short path is expected here; anything else should surface.
        print('Caught error while changing paths')
        print('HTML structure may have changed (images no longer located in swxz folder or url changed), needs diagnosis')
        return None
        
def _next_variant_number(files):
    """Return the lowest unused variant number given existing ``<index>_<n>.<ext>`` names."""
    existing = []
    for name in files:
        match = re.fullmatch(r'[^_]+_(\d+)\.[^.]+', name)
        if match:
            existing.append(int(match.group(1)))
    return max(existing, default=0) + 1


def save_image(url_list, files, index, curr_char_dir):
    """Download each SVG in ``url_list`` and store it as a rendered image file.

    Each image is saved as ``<index>_<variant>.<new_image_filetype>`` inside
    ``curr_char_dir``. Numbering continues after the highest variant already
    present in ``files``, so existing images are never overwritten. The
    function pauses three seconds after each download.

    Args:
        url_list: Protocol-relative image URLs (``https:`` is prepended).
        files: Filenames already present in ``curr_char_dir``. Names that do
            not follow the ``<index>_<n>.<ext>`` pattern are ignored.
        index: Identifier of the character; used as the filename prefix.
        curr_char_dir: Directory that receives the images.
    """
    # `import urllib` alone does not guarantee the `request` submodule is loaded.
    import urllib.request

    first_variant = _next_variant_number(files)
    print(url_list)
    for offset, relative_url in enumerate(url_list):
        img_path = 'https:' + relative_url

        # Get the filename and full path of the new image of currently scraped character
        new_image_filename = f'{index}_{first_variant + offset}'
        new_image_path = f'{os.path.join(curr_char_dir, new_image_filename)}.{new_image_filetype}'
        print(f'Filename for new image is: {new_image_filename}')

        # Retrieve SVG file from url obtained from scraping and save SVG file
        svg_filename, _ = urllib.request.urlretrieve(
            img_path, os.path.join(curr_char_dir, f"{new_image_filename}.svg")
        )
        print(svg_filename)

        try:
            # Read raw bytes so the result does not depend on the platform's
            # default text encoding.
            with open(svg_filename, "rb") as f:
                svg_blob = f.read()
            with wand.image.Image(blob=svg_blob, format="svg") as image:
                png_image = image.make_blob("png")
            with io.BytesIO(png_image) as fp:
                img = mpimg.imread(fp, format='png')

            plt.imshow(img, cmap='gray')
            plt.axis('off')
            plt.savefig(new_image_path)
            # Close the figure so successive images neither stack on the same
            # axes nor accumulate in memory over a long run.
            plt.close()
        finally:
            # The intermediate SVG is only a download artefact; remove it even
            # when conversion fails.
            os.remove(svg_filename)

        print(f'Successfully saved new image at {new_image_filename}')
        time.sleep(3)

def scrape_seal(index, char):
    """Scrape and save all seal-script images for one character.

    Any ordinary exception raised along the way is written to the log and
    suppressed, so a failure on one character does not stop a batch run.
    ``KeyboardInterrupt`` and ``SystemExit`` are deliberately not caught, so
    the run can still be aborted.

    Args:
        index: Identifier of the character; also the name of its sub-directory
            under ``data_dir``.
        char: The character itself, appended to ``scrape_url_base``.
    """
    try:
        print(f"Searching for character {index}: {char}")
        scrape_url = scrape_url_base + char
        print(f"Searching on URL {scrape_url}")

        curr_char_dir = os.path.join(data_dir, str(index))
        files = [
            f for f in os.listdir(curr_char_dir)
            if os.path.isfile(os.path.join(curr_char_dir, f))
        ]

        url_list = scrape_image(scrape_url, char, index)

        # Save each image of the character
        if url_list is not None:
            print(f'Found {len(url_list)} images for character')
            save_image(url_list, files, index, curr_char_dir)

    except Exception as e:
        # repr() records both the exception type and its message.
        log_error(f'Failed to obtain char {index}, with error: {e!r}')

In [12]:
### MAIN FUNCTION FOR SCRAPING
def main(retry):
    """Scrape seal-script images for the characters listed in ``hsk.csv``.

    Args:
        retry: When true, only characters whose index also appears in
            ``missing_chars.csv`` are scraped; otherwise every character is.
    """
    # The full character list is needed in both modes: retry mode filters it.
    char_array = np.atleast_1d(load_data('hsk.csv'))

    missed_indices = None
    if retry:
        missed = np.atleast_1d(load_data('missing_chars.csv'))  # Import characters to scrape
        if missed.dtype.names:
            # NOTE(review): for a multi-column file the indices are assumed to
            # be in the first column - confirm against missing_chars.csv.
            missed = missed[missed.dtype.names[0]]
        # Compare as strings so the test works whether the indices were parsed
        # as numbers or as text.
        missed_indices = {str(value) for value in missed.ravel()}

    for row in char_array:
        # Get index and character symbol from batch array
        curr_index, curr_char = row[0], row[1]
        if missed_indices is not None and str(curr_index) not in missed_indices:
            continue
        scrape_seal(curr_index, curr_char)

    print()
    print("Finished scraping")

In [None]:
# Start the scrape using the module-level `retry` flag.
main(retry)

In [11]:
# url = 'https://www.zdic.net/hans/高'
# page = requests.get(url)
# soup = BeautifulSoup(page.content, "html.parser")
# results = soup.find_all("img", {"class": "lazy kxtimg"})
# for x in results:
#     print(x)
#     attr = x.attrs['data-original']
#     print()
#     if ('swxz' in attr) or ('swdz' in attr):
#         print("FOUND IT")

