In [30]:
import os
import time
import warnings

import cv2
import pytesseract
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
warnings.filterwarnings('ignore')

def get_grids(pic, scale=0.5, margin=2):
    """Cut a screenshot of a sudoku board into its 81 grayscale cells.

    The image is resized by ``scale``, converted to grayscale, stripped of
    its outermost pixel on every side and divided into a 9x9 lattice.  Each
    cell is then shrunk by ``margin`` pixels per side before being returned.

    :param pic: path of the image file
    :param scale: resize factor applied to both axes before slicing
    :param margin: number of pixels trimmed from every side of a cell
                   (presumably to keep grid lines out of the OCR input)

    :return: list of 81 two-dimensional arrays in row-major order
    :raises OSError: if the image cannot be read
    """
    img = cv2.imread(pic)
    if img is None:
        # cv2.imread reports failure by returning None rather than raising.
        raise OSError(f'Could not read image: {pic!r}')

    resized = cv2.resize(img,
                         (int(img.shape[1] * scale), int(img.shape[0] * scale)),
                         interpolation=cv2.INTER_AREA)
    gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)

    # remove the edge: drop the outermost pixel on each side
    gray = gray[1:gray.shape[0] - 1, 1:gray.shape[1] - 1]

    # Arrays are indexed (row, column), so shape[0] is the height.
    cell_h = gray.shape[0] // 9
    cell_w = gray.shape[1] // 9

    tiles = []
    for row in range(9):
        # One extra pixel of offset per completed band of three cells
        # (presumably compensating for the thicker 3x3 box borders).
        top = row * cell_h + row // 3
        for col in range(9):
            left = col * cell_w + col // 3
            tiles.append(gray[top + margin:top + cell_h - margin,
                              left + margin:left + cell_w - margin])

    return tiles

def tiles2sdk(tiles):
    """OCR every tile and return the recognised text, one string per tile.

    Each tile is passed to Tesseract with page-segmentation mode 10, OCR
    engine mode 3 and a digits-only character whitelist.  Surrounding
    whitespace is stripped from every result; the rest of this file treats
    an empty string as a blank cell.

    :param tiles: iterable of cell images
    :return: list of stripped OCR strings in the same order as ``tiles``
    """
    tesseract_flags = r'--psm 10 --oem 3 -c tessedit_char_whitelist=0123456789'
    recognised = []
    for tile in tiles:
        raw_text = pytesseract.image_to_string(tile, lang='eng', config=tesseract_flags)
        recognised.append(raw_text.strip())
    return recognised

def sdk_easy_printer(values):
    """Print a flat list of cell strings as nine space-separated rows.

    Rows are consecutive slices of nine entries; an empty string is shown
    as ``0``.

    :param values: flat sequence of cell strings in row-major order
    """
    for row in range(9):
        shown = []
        for cell in values[row * 9:(row + 1) * 9]:
            shown.append('0' if cell == '' else cell)
        print(' '.join(shown))

def _cells_to_line(cells):
    """Return one digit per cell ('' becomes '0'), or None if any cell is invalid."""
    digits = []
    for cell in cells:
        if cell == '':
            digits.append('0')
        elif len(cell) == 1 and cell in '0123456789':
            digits.append(cell)
        else:
            # Anything else (e.g. '12') is a mis-read; a plain substring test
            # against '0123456789' would wrongly accept it.
            return None
    return ''.join(digits)


def save_sdk(src, out, counter):
    """OCR every screenshot in ``src`` and append the puzzles to text files.

    File names are expected to look like ``<index>_<difficulty>_<id>.<ext>``.
    For each such file the board is read with ``get_grids`` and ``tiles2sdk``;
    if every cell is blank or a single digit, the line ``<id>\\t<digits>\\n``
    is appended to ``{out}/{difficulty}.txt``.  Boards with any other OCR
    result are dropped, and files whose name does not have three
    ``_``-separated parts are skipped with a message.

    :param src: directory containing the screenshots
    :param out: directory of the per-difficulty text files
    :param counter: when true, print a progress line every fifth file
    """
    for ix, spic in enumerate(os.listdir(src)):
        parts = spic.split('_')
        if len(parts) != 3:
            print(f'Skipping {src}/{spic}: name is not <index>_<difficulty>_<id>.<ext>')
        else:
            diff = parts[1]
            # splitext instead of [:-4], so extensions of any length are handled
            pic_id = os.path.splitext(parts[2])[0]
            print(f'From {src}/{spic} to {out}/{diff}.txt')

            stiles = get_grids(pic=f'{src}/{spic}')
            line = _cells_to_line(tiles2sdk(stiles))

            if line is not None:
                # 'a' appends; the with-block closes the file even if write fails
                with open(f'{out}/{diff}.txt', 'a') as sfile:
                    sfile.write(pic_id + '\t' + line + '\n')

        if counter and ix % 5 == 0:
            print(f'{ix} pics processed.')

In [31]:
# Collect screenshots of sudoku.com boards for every difficulty level, then
# OCR them into per-difficulty text files with save_sdk.

options = webdriver.ChromeOptions()
# Do not abort on TLS certificate errors (similar in spirit to
# warnings.filterwarnings('ignore')).
options.add_argument('--ignore-certificate-errors')

# Path of the chromedriver binary.  Set the CHROMEDRIVER environment variable
# to override the original author's local default.
chromedriver = os.environ.get(
    'CHROMEDRIVER',
    r'E:\3,HKU\Courses\sem1 STAT7008-Programming for Data Science\HW-7008\assignment 3\chromedriver.exe')

url = 'https://sudoku.com/'

root = './pics'        # screenshots are written here
out = './sudoku_data'  # OCR results are appended here
# Create both directories up front: element.screenshot() only returns False
# when the target cannot be written, and open(..., 'a') does not create
# parent directories.
os.makedirs(root, exist_ok=True)
os.makedirs(out, exist_ok=True)

difficulty = {'easy':'easy/', 'medium':'medium/', 'hard':'hard/', 'expert':'expert/', 'evil':'evil/'}
# difficulty = {'medium':'medium/', 'hard':'hard/', 'expert':'expert/', 'evil':'evil/'}

COUNT = 240  # number of page loads (one screenshot each) per difficulty

# Selenium 4 style: the driver path goes through Service, the options through
# `options=` (executable_path= / chrome_options= are the deprecated spellings).
driver = webdriver.Chrome(service=Service(executable_path=chromedriver), options=options)
try:
    driver.maximize_window()
    driver.get(url)
    # The implicit wait is a session-wide setting, so it is configured once.
    driver.implicitly_wait(0.5)

    for ix, diff in enumerate(difficulty):
        current_url = url + difficulty[diff]

        for _ in range(COUNT):
            driver.get(current_url)
            # NOTE(review): Selenium documents element screenshots as PNG data;
            # the .jpg suffix is kept so existing file names stay valid.
            pic_name = f'{root}/{ix}_{diff}_{time.time()}.jpg'

            # Remove every "game-tip" overlay in one call instead of looping
            # until the script raises.
            driver.execute_script("""
                document.querySelectorAll(".game-tip").forEach(function (el) {
                    el.parentNode.removeChild(el);
                });
            """)

            sdk_continer = driver.find_element(By.XPATH, '//*[@id="game"]')
            if not sdk_continer.screenshot(pic_name):
                print(f'Could not save {pic_name}')
finally:
    # Always release the browser, even if a page load or lookup fails.
    driver.quit()

save_sdk(root, out, counter=True)

From ./pics/0_medium_1670006429.1187258.jpg to ./sudoku_data/medium.txt
0 pics processed.
From ./pics/0_medium_1670006431.269723.jpg to ./sudoku_data/medium.txt
From ./pics/0_medium_1670006432.927733.jpg to ./sudoku_data/medium.txt
From ./pics/0_medium_1670006434.3087256.jpg to ./sudoku_data/medium.txt
From ./pics/0_medium_1670006435.9507384.jpg to ./sudoku_data/medium.txt
From ./pics/0_medium_1670006437.3277366.jpg to ./sudoku_data/medium.txt
5 pics processed.
From ./pics/0_medium_1670006438.9467351.jpg to ./sudoku_data/medium.txt
From ./pics/0_medium_1670006440.3737228.jpg to ./sudoku_data/medium.txt
From ./pics/0_medium_1670006441.6487284.jpg to ./sudoku_data/medium.txt
From ./pics/0_medium_1670006442.913984.jpg to ./sudoku_data/medium.txt
From ./pics/0_medium_1670006444.0729744.jpg to ./sudoku_data/medium.txt
10 pics processed.
From ./pics/0_medium_1670006445.558976.jpg to ./sudoku_data/medium.txt
From ./pics/0_medium_1670006446.9020398.jpg to ./sudoku_data/medium.txt
From ./pics/0

In [19]:
# NOTE(review): `values` is not assigned in any visible cell; presumably it is
# the list returned by tiles2sdk(get_grids(...)) for one screenshot — confirm.
sdk_easy_printer(values)

1 0 0 0 4 0 0 9 0
0 0 7 0 6 0 0 0 0
0 8 0 7 0 9 3 0 0
0 0 2 3 0 7 0 1 0
0 0 0 6 0 0 0 0 0
0 4 0 0 0 0 0 0 2
5 0 0 0 0 0 8 0 0
0 0 4 2 0 1 0 3 0
0 0 0 0 9 0 0 0 0


In [10]:
# Append the OCR result of the first screenshot in `pics` to its difficulty file.
# NOTE(review): relies on `pics` and `res` from other cells (`pics` is assigned
# in a later cell of this file; `res` is not assigned in any visible cell) —
# confirm the intended run order.
file_name = os.listdir(pics)[0]
# `pic_id` rather than `id`, so the builtin is not shadowed for the whole kernel.
rank, diff, pic_id = file_name.split('_')

flag = True
write_content = ''
for chara in res:
    # A cell must be blank or exactly one digit; a substring test against
    # '0123456789' would also accept multi-character mis-reads such as '12'.
    if chara != '' and not (len(chara) == 1 and chara in '0123456789'):
        flag = False
        break
    write_content += chara if chara != '' else '0'

if flag:
    # 'a' appends; the with-block closes the file even if the write fails.
    with open(f'./sudoku_data/{diff}.txt', 'a') as sfile:
        sfile.write(os.path.splitext(pic_id)[0] + '\t' + write_content + '\n')

In [11]:
# Same board dump as sdk_easy_printer, so reuse the helper instead of keeping a
# second copy of the loop (the inline copy also reused `i` for both the row
# index and the cell value).
# NOTE(review): `res` is not assigned in any visible cell — confirm its origin.
sdk_easy_printer(res)

3 0 5 6 2 9 0 0 7
7 0 6 1 0 8 0 0 0
8 0 1 0 0 0 2 6 5
0 0 3 0 0 5 0 7 0
6 8 7 0 0 0 0 0 0
2 0 0 7 0 0 6 0 0
4 7 9 5 8 0 0 2 0
1 0 0 4 3 0 5 0 9
0 0 8 9 0 0 0 0 6


In [40]:
# for x in range(0, imgwidth, M):
#     for y in range(0, imgheight, N):
#         x1 = x + M
#         y1 = y + N
#         tiles = gray[x:x+M,y:y+N]

#         grid = cv2.rectangle(tiles, (x+it, y+it), (x1-it, y1-it), (0,0,255))
#         print(pytesseract.image_to_string(grid, config=custom_config))
#         break
#     break


In [17]:
import os

# Bring legacy screenshot names (two "_"-separated parts) into the three-part
# <index>_<difficulty>_<rest> form.
root = os.getcwd()
pics = os.path.join(root, 'pics')

# difficulty name -> its position in the `difficulty` dict, which is defined in
# the scraping cell
keys = list(difficulty.keys())
key2ix = {key: pos for pos, key in enumerate(keys)}

# NOTE(review): legacy names presumably carry a 4-character prefix in front of
# the difficulty name — confirm against the actual files.
PREFIX_LEN = 4

for pic in os.listdir(pics):
    parts = pic.split('_')
    if len(parts) != 2:
        # Three parts means already converted; any other count is not a legacy
        # screenshot name and is left untouched instead of raising IndexError.
        continue
    part0 = parts[0][PREFIX_LEN:]
    if part0 not in key2ix:
        # Unknown difficulty: skip rather than abort halfway with a KeyError.
        continue
    rename = f'{key2ix[part0]}_{part0}_{parts[1]}'
    os.rename(os.path.join(pics, pic), os.path.join(pics, rename))