# Sudoku Search for Training Data

In [59]:
import os
import uuid
import hashlib
import torch
import ray
from fastbook import *

In [60]:
imageDir = "./train/images"

In [65]:
urls = search_images_ddg('sudoku', max_images=400)
len(urls),urls[0]

(400,
 'https://printablesudokufree.com/wp-content/uploads/2019/05/easy-printable-sudoku-rtrs-online-printable-sudoku-easy-6x6-791x1024.jpg')

In [66]:
def calculateMd5(filename):
    runningMd5 = hashlib.md5()
    with open(filename, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            runningMd5.update(chunk)
    return runningMd5.hexdigest()

In [67]:
def doesContentAlreadyExist(filename, dir):
    newFileHash = calculateMd5(filename)

    for f in os.listdir(dir):
        absFile = os.path.join(dir, f)
        if os.path.isfile(absFile):
            hash = calculateMd5(absFile)
            if (hash == newFileHash):
                return True

    return False

In [None]:
for url in urls:
    url = url.split('?')[0]
    print ("Processing URL: " + url)
    e = re.findall('\.[a-z]+$', url)
    if (len(e)) == 1:
        fileExtension = e[0]
        randomName = str(uuid.uuid4().hex)
        filename = randomName + fileExtension
        tempFilename = "/tmp/" + filename
        
        download_url(url, tempFilename)

        if doesContentAlreadyExist(tempFilename, imageDir) == True:
            print ("Found: " + tempFilename)
            os.remove(tempFilename)
        else:
            print ("NEW: " + tempFilename)
            os.replace(tempFilename, imageDir + "/" + filename)

    else:
        print ("Unable to process image due to unexpected extension: " + e + " with URL: " + url)

Processing URL: https://printablesudokufree.com/wp-content/uploads/2019/05/easy-printable-sudoku-rtrs-online-printable-sudoku-easy-6x6-791x1024.jpg


Found: /tmp/b9bb0e2d74ed4d22b8e85579ac6aec90.jpg
Processing URL: https://sudokuprintables.com/wp-content/uploads/2020/04/sudoku-puzzles.jpg


Found: /tmp/e70b7d62f9314eb696255d53d0fe30f3.jpg
Processing URL: https://sudokuprintables.com/wp-content/uploads/2020/04/super-samurai-sudoku-13-grids-21.png


Found: /tmp/16ebe09094d545b78c711155b8bd5893.png
Processing URL: https://paraimprimir.org/wp-content/uploads/2013/07/Sodoku-nivel-extremo-para-imprimir.jpg


Found: /tmp/2fa8c97eb78b44b4a6acaefc914a2dcc.jpg
Processing URL: https://printablesudokufree.com/wp-content/uploads/2019/05/easy-sudoku-printable-canas-bergdorfbib-co-printable-sudoku-large-print.jpg


Found: /tmp/8c0562ecdcb04456bd3b5a4d6756ca08.jpg
Processing URL: http://www.printable-sudoku-puzzles.com/6x6.php


NEW: /tmp/676359aa47c5457891ca5e0dd1a45ef2.php
Processing URL: http://4.bp.blogspot.com/-0Sj3-pFEzlo/Tm717rQQtTI/AAAAAAAAK78/9fiuYU6QowI/s1600/easy+print+sudoku.jpg


Found: /tmp/cb59f7ebb1534dea9f92958526b00168.jpg
Processing URL: https://printablesudokufree.com/wp-content/uploads/2019/05/sudoku-blank-under-bergdorfbib-co-printable-sudoku-1-per-page.png


Found: /tmp/23c2e4ebfc124df79af73cfde9d81f70.png
Processing URL: https://lyanacrosswordpuzzles.com/wp-content/uploads/2019/06/hard-puzzle-free-sudoku-puzzles-printable-sudoku-4-per-page-printable-sudoku-puzzles-4-per-page.jpg


Found: /tmp/cdd52009ecef48f29aacda4720605a1e.jpg
Processing URL: https://www.enorah.fr/wp-content/uploads/2016/11/Sudoku-en-ligne-1.jpg


Found: /tmp/46bed107740b4aa9a388979d0ba1d3c8.jpg
Processing URL: https://uploads.guim.co.uk/2016/12/03/kill_dec04_2016.jpg


Found: /tmp/94a84085328d4c21abdf87c3ced95405.jpg
Processing URL: https://sudoku.com/img/confetti3@2x.png


Found: /tmp/c739e2dc40f84b1f98c17158051e14dd.png
Processing URL: https://free-printablehq.com/wp-content/uploads/2019/07/easy-sudoku-printable-kids-activities-free-printable-sudoku-with-answers-1024x1024.jpg


Found: /tmp/f9291a7b4e1f43cb8781f5a60503d77c.jpg
Processing URL: https://i.pinimg.com/736x/ef/b2/f6/efb2f666be3e8ef6933386b1e05432da--sudoku-crossword.jpg


Found: /tmp/d8bb361f2f6a435da88a2827110934b7.jpg
Processing URL: https://crosswordpuzzles-printable.com/wp-content/uploads/2019/06/printable-sudoku-puzzles-6-per-page-download-them-or-print-free-printable-sudoku-puzzles-4-per-page.jpg


Found: /tmp/5594837da43b4d8c9393fca62683a31a.jpg
Processing URL: http://www.filebuzz.com/software_screenshot/full/25972-analytical_sudoku.gif


Found: /tmp/c6605d6d040c493eb0012e080f1840eb.gif
Processing URL: https://uploads.guim.co.uk/2019/07/21/SU-4475_P_E_copy_2.jpg


Found: /tmp/04e93d861e574a01941d0faa95ea959b.jpg
Processing URL: https://sudoku.com/img/confetti1.png


Found: /tmp/1f8c3eb1fc894da3a79522573a7a1490.png
Processing URL: https://printablesudokufree.com/wp-content/uploads/2019/05/printable-sudoku-printable-samurai-sudoku-medium.jpg


Found: /tmp/93fcc312226e474681a41963200fa430.jpg
Processing URL: https://i.pinimg.com/originals/91/32/68/91326804e2f4ea58ebb49efd6c64429d.jpg


Found: /tmp/651ec686f0d4458f934c4895e72112a9.jpg
Processing URL: https://printablesudokufree.com/wp-content/uploads/2019/05/printable-sudoku-printable-samurai-sudoku-medium-1018x1024.gif


Found: /tmp/5982ebc4e4a341baac7a335c172fa5f9.gif
Processing URL: https://crosswordpuzzles-printable.com/wp-content/uploads/2019/06/sudoku-for-all-ages-plus-lots-of-other-printable-activities-for-kids-printable-sudoku-puzzles-9x9.png


Found: /tmp/924eb4851217494bbee1b3fe8bcb5000.png
Processing URL: https://www.activityshelter.com/wp-content/uploads/2017/04/soduko-for-kids-free.jpg


Found: /tmp/f199e617645744f990d818659c146b4a.jpg
Processing URL: https://www.printablee.com/postpic/2011/01/free-printable-16x16-sudoku-puzzles_370901.jpg


Found: /tmp/edd4ab15ca834306bcaee3c17fc31f59.jpg
Processing URL: https://www.puzzles.ca/sudoku_puzzles_images/sudoku_easy_998.gif


Found: /tmp/d06ddda3b6ad4f2f861f4f9c1d0aeb4a.gif
Processing URL: http://cdn.ilovefreesoftware.com/wp-content/uploads/2011/12/sudoku.jpg


Found: /tmp/5a6f01a1ff2b415a8475372fdcd2c12d.jpg
Processing URL: https://www.gmpuzzles.com/images/blog/GM-SudokuEx.png


Found: /tmp/3676bd394def4569b4e768f698630108.png
Processing URL: https://tggrouplinks.com/wp-content/uploads/2023/12/Sudoku-Telegram-Group-Links-List.webp


Found: /tmp/32a230a7ac434058bcb2a80fe46c86a8.webp
Processing URL: https://i.pinimg.com/originals/ce/48/56/ce4856cd7ec5e4aaf10ac3985c5a4067.jpg


Found: /tmp/3f5496fe4bda434587fb626bd4afac81.jpg
Processing URL: https://i.pinimg.com/originals/63/52/17/635217cafebcaffada30a1fef50f73b8.jpg


Found: /tmp/3043c00d9adc4e30a50d7e534542a744.jpg
Processing URL: https://images.sftcdn.net/images/t_app-cover-l,f_auto/p/ff149291-9bd9-46fd-a56e-c77f2e494bc7/265877251/sudoku-free-puzzles-screenshot.png


Found: /tmp/352d644fb21548a295d011d177218b7b.png
Processing URL: https://free-printable-az.com/wp-content/uploads/2019/06/observer-killer-sudoku-life-and-style-the-guardian-killer-sudoku-free-printable-2.jpg


Found: /tmp/b692c0c9dbe34068b51afa11bde0aa23.jpg
Processing URL: https://i.ytimg.com/vi/Byr2E_rpVsM/maxresdefault.jpg


Found: /tmp/7753029e3b6444c6acf12417139c5eba.jpg
Processing URL: https://www.innoludic.com/images/jeux/super_00011.png


Found: /tmp/91ff77f25cf446239a824309609af0ed.png
Processing URL: https://www.rd.com/wp-content/uploads/2020/12/Sudoku27.jpg


Found: /tmp/b43d675c88994bfe83c104165cedbe12.jpg
Processing URL: https://muster-vorlage.ch/wp-content/uploads/2013/12/Sudoku-zum-Ausdrucken-1.jpg


Found: /tmp/b829bd1f8cae4270b83eaca06ef0bcca.jpg
Processing URL: https://printablesudokufree.com/wp-content/uploads/2019/05/sudoku-templates-under-bergdorfbib-co-free-printable-irregular-sudoku-1.jpg


Found: /tmp/0f8bc63b2a6c4b17b4e80de3e6bae7e5.jpg
Processing URL: https://printable-crosswordpuzzles.com/wp-content/uploads/2019/06/sudoku-puzzler-free-printable-updated-sudoku-puzzles-with-a-printable-sudoku-puzzles-easy-1.jpg


Found: /tmp/4937da09df2e40fe9418e8f7fac5f186.jpg
Processing URL: https://i.pinimg.com/474x/10/e6/ea/10e6ea461759eeec73115fe564e80919--sudoku-puzzles-puzzles-for-kids.jpg


Found: /tmp/41de9afdd8b3473f972ec09c11e6c211.jpg
Processing URL: https://i.pinimg.com/originals/d5/bb/31/d5bb31daef45ff8511d8213ca99ac426.jpg


Found: /tmp/79467231799946f1b81e1d37ba30bf49.jpg
Processing URL: http://4.bp.blogspot.com/-81BdcbpGoRk/T6_6JuX-LsI/AAAAAAAAAAc/nuKib-kohAU/s1600/sudoku+to+print+medium.jpg


Found: /tmp/2fdb8bc1f5804125807c322c98d3dab3.jpg
Processing URL: http://www.memory-improvement-tips.com/images/hard-1b-2.jpg


Found: /tmp/bf693694a0514d8595be561c3ef3bc4d.jpg
Processing URL: https://sudokuprintables.com/wp-content/uploads/2020/03/the-best-printable-blank-sudoku-4-per-page-obriens-website-9.jpg


Found: /tmp/a3e257ba0824451f974cfc9a65e08889.jpg
Processing URL: https://i5.walmartimages.com/asr/80426d5b-f91f-4f1d-9d76-7eac44d225ea_1.e3e66566ead52a7d68b199d3c0bdacda.jpeg


Found: /tmp/d88091ff66004a70b5a433d9cad8f965.jpeg
Processing URL: http://coloringkids.org/wp-content/uploads/printable-sudoku-puzzles-9.jpg


Found: /tmp/2aa18111fb1943618d541aedbcacc97f.jpg
Processing URL: https://www.freepdfmagazine.com/puzzlelife-puzzlepad-sudoku-super-november-2023/PuzzleLife-PuzzlePad-Sudoku-Super-November-2023.jpg


Found: /tmp/7277a02359e740c58d060d64a8757850.jpg
Processing URL: https://3.bp.blogspot.com/-WpXnBAjKNF0/ThmKiAD_TsI/AAAAAAAAAX4/zlvGhDlXY1U/s1600/21.jpg


Found: /tmp/c41a9bf6240947a3a88aff4b837b1d42.jpg
Processing URL: https://2.bp.blogspot.com/-3ZKRPPnHE1M/Tj7xWYqoBiI/AAAAAAAAELc/1vCdgiKLHW0/s1600/100-sudoku-facil_Page_010-764762.jpg


Found: /tmp/9030cb7ecda148ba83ca9796f83b027d.jpg
Processing URL: http://www.printable-sudoku-puzzles.com/filled_values/20.php


NEW: /tmp/3f435687f01743e3a60d1c165fde9e1d.php
Processing URL: https://akademie-fuer-lernmethoden.de/wp-content/uploads/2011/05/Sudoku.jpg


Found: /tmp/f093402744594d77b505a67abf8c5c34.jpg
Processing URL: https://www.elversonpuzzle.com/Printable-sudoku-puzzle-2.gif


Found: /tmp/9fb76e1c7601494588fa517b0a9d4481.gif
Processing URL: https://defbnszqe1hwm.cloudfront.net/images/sudoku.png


Found: /tmp/3afc1ad46ef049a08127c3cc8d4bea69.png
Processing URL: https://images.sftcdn.net/images/t_app-cover-l,f_auto/p/dff6c036-9b27-11e6-a9ba-00163ed833e7/2675625261/ahr-sudoku-screenshot.jpg


Found: /tmp/9403f9801b5148ae9c6acc95b1c0672e.jpg
Processing URL: https://c8.alamy.com/comp/2A0HFBW/sudoku-2A0HFBW.jpg


Found: /tmp/aee01e8ca65b4e67815047b5004924ee.jpg
Processing URL: https://preview.redd.it/help-pls-stuck-v0-iu7azp5sxt4c1.jpeg


NEW: /tmp/fd133da4c4344d73b2d63958a2105ef7.jpeg
Processing URL: https://aniviels.files.wordpress.com/2010/05/sudoku-solver-logo.gif


Found: /tmp/3e6504734edb4ac0b345dd4b2cef0ccb.gif
Processing URL: https://www.lactudecamille.com/wp-content/uploads/2017/04/imagessudoku-10.jpg


Found: /tmp/54bb3ae4b6cb40c9a0a95482f3233930.jpg
Processing URL: https://www.diya.fr/wp-content/uploads/2016/07/Soduku-en-ligne-10.jpg


Found: /tmp/602c89461e404fdd842e7ca1dc9afdc7.jpg
Processing URL: https://dailybraintrainer.com/wp-content/uploads/2023/10/sudoku-4455317_1280.jpg


Found: /tmp/5ed018a4a47d40498dae065127daa53b.jpg
Processing URL: https://sudokuprintables.com/wp-content/uploads/2020/04/best-free-sites-to-play-sudoku-online-1.jpg


Found: /tmp/9febe028abfb4b748f0a0a5c54c1cea9.jpg
Processing URL: https://4freeprintable.com/wp-content/uploads/2019/07/printable-sudoku-puzzles-room-surf-free-printable-sudoku-with-answers.jpg


Found: /tmp/da200e615f444e418b887ba6684e4d1a.jpg
Processing URL: https://printablesudokufree.com/wp-content/uploads/2019/05/super-samurai-sudoku-13-grids-sudoku-sudoku-puzzles-puzzle-es-printable-sudoku-16x16-easy.png


Found: /tmp/f3b5568a031c42a2bdc80442f3fc8f5d.png
Processing URL: https://i.ds.at/MxqVTA/rs:fill:1200:600/plain/lido-images/2023/05/24/665c7118-3330-47ed-8546-1d2d8947891a.png


Found: /tmp/acaaa810e96147eca55343d3ffc81f92.png
Processing URL: https://miro.medium.com/max/4000/1*z6Qli8oi_nOPQCyqo3rRVg.png


Found: /tmp/762bd5f21a2741ada93da43ed4693ccb.png
Processing URL: https://pianetabambini.it/wp-content/uploads/2021/03/Sudoku-4x4_14-1448x2048.png


Found: /tmp/1645650e779f49f1a826d0333c819dd6.png
Processing URL: https://www.escogitare.com/apps/sudoku/img/screenshot-01.jpg


Found: /tmp/02441f7e23994133b2b7812a9b79632c.jpg
Processing URL: https://sudoku-club.de/wp-content/uploads/2017/01/sudoku-400x400.png


Found: /tmp/076f4f258b8041b184ae45876ed6a006.png
Processing URL: https://my-sudoku.com/images/logo.png


Found: /tmp/a8168ed89cee43c0994139252daf8ac9.png
Processing URL: https://is1-ssl.mzstatic.com/image/thumb/Purple62/v4/35/78/41/35784163-02d5-e091-4dfd-6468e413f975/mzm.dcqqhjgo.png/1200x630wa.png


Found: /tmp/1b561e2b3e1d4506a4f1d0b571c65c39.png
Processing URL: http://static.guim.co.uk/sys-images/Guardian/Pix/pictures/2009/03/27/Sudoku-1215-hard.jpg


Found: /tmp/9357ac30becb4a6884442c973dbc495b.jpg
Processing URL: https://images.sftcdn.net/images/t_app-cover-l,f_auto/p/ff149291-9bd9-46fd-a56e-c77f2e494bc7/2252954298/sudoku-free-puzzles-screenshot.png


Found: /tmp/a8fea4a85f4449ec817846609c12f75f.png
Processing URL: https://upload.wikimedia.org/wikipedia/commons/thumb/f/ff/Sudoku-by-L2G-20050714.svg/364px-Sudoku-by-L2G-20050714.svg.png


Found: /tmp/e3ecb020096d498abb988d672e97d3b5.png
Processing URL: https://3.bp.blogspot.com/-FOW29j5S9K4/Tj-lkE-QCiI/AAAAAAAAEPY/Tf3eDwSv-oM/s1600/100-sudoku-facil_Page_083.jpg


Found: /tmp/339db4a75daf4272aeec7100c9a78301.jpg
Processing URL: https://printablesudokufree.com/wp-content/uploads/2019/05/sudoku-templates-under-bergdorfbib-co-printable-irregular-sudoku.jpg


Found: /tmp/639ac4fdf0d9440f99f08497974efbdc.jpg
Processing URL: https://images.sftcdn.net/images/t_app-cover-l,f_auto/p/199a5a86-9b33-11e6-9416-00163ec9f5fa/867564893/super-sudoku-screenshot.jpg


Found: /tmp/10827faa62e04ab1a7884bf0fd87bf05.jpg
Processing URL: https://1.bp.blogspot.com/-LpJiTS9kGe0/V_-M73-XBpI/AAAAAAAAAtM/-xSq_l6xOKoLnaU-yGQrAIfqM4az58VDwCLcB/s1600/Sudoku-Hard-1.jpg


Found: /tmp/06f9f1a045794bb5bab1889ed711d5ef.jpg
Processing URL: https://printablesudokufree.com/wp-content/uploads/2019/05/sudoku-puzzles-free-sudoku-puzzles-printable-sudoku-4-per-page-blank.jpg


Found: /tmp/def5954a350c4ac0b4667f861f0a4568.jpg
Processing URL: http://4.bp.blogspot.com/-hVfVa5yOArE/TagYG4bwGJI/AAAAAAAAJ2M/F8Lpieq6VEc/s1600/diffidult+printable+sudoko.png


Found: /tmp/0afbaa00d4d84ea3a9ac49f134537386.png
Processing URL: https://sudokuprintables.com/wp-content/uploads/2020/04/sudoku-blank-karan-ald2014.png


Found: /tmp/92ddc3432700445e9ac615b1319d7198.png
Processing URL: https://printablesudokufree.com/wp-content/uploads/2019/05/sudoku-printable-free-medium-printable-sudoku-puzzle-1-my-printable-irregular-sudoku.jpg


Found: /tmp/ea31bfbdd77d448f81646cc53a76dace.jpg
Processing URL: https://imag.malavida.com/mvimgbig/download-fs/sudoku-com-28109-1.jpg


Found: /tmp/6067caf7cbd04ffc9826991d69a4a653.jpg
Processing URL: https://i.pinimg.com/736x/6b/47/db/6b47dbee1b92f5cbee33ea468a8bd90d.jpg


Found: /tmp/f9dda935645847d3b2197a82032f9f9b.jpg
Processing URL: https://i.redd.it/help-pls-stuck-v0-iu7azp5sxt4c1.jpeg


NEW: /tmp/aefa0e874fb446128db62d7861118f4e.jpeg
Processing URL: http://www.filebuzz.com/software_screenshot/full/1000_easy_sudoku-196718.jpg


Found: /tmp/813eddbc67f04d52ad0f1554286fc90f.jpg
Processing URL: https://sudokuprintables.com/wp-content/uploads/2020/04/supersamurai-gattai-13-sudoku-variants.png


In [68]:
    ds = ray.data.read_images("local:" + imageDir)

print(ds.schema())


2023-12-10 09:44:30,455	INFO plan.py:757 -- Using autodetected parallelism=16 for stage ReadImage to satisfy parallelism at least twice the available number of CPUs (8).


Column  Type
------  ----
image   numpy.ndarray(ndim=2, dtype=uint8)
