In [14]:
from PIL import Image, ImageDraw
import string
import random
import os
from pdf2image import convert_from_path 
import numpy as np
from scipy import ndimage
import matplotlib.pyplot as plt
from fpdf import FPDF
import pytesseract
import bezier


In [20]:
def make_shadow(image):
    """Return a binarised greyscale copy of ``image`` with a soft random shadow.

    Between 2 and 5 random seed pixels are set in an empty mask, blurred with
    a wide Gaussian, and the result is subtracted from the white pixels of the
    thresholded image. Dark pixels (value <= 200 before thresholding) become 0
    and are left untouched by the shadow.

    Parameters
    ----------
    image : PIL.Image.Image
        Source image; it is not modified.

    Returns
    -------
    PIL.Image.Image
        A new single-channel 8-bit image of the same size.
    """
    width, height = image.size

    n = np.random.randint(2, 6)
    shadow = np.zeros((height, width))

    pointsx = width * np.random.random(n)
    pointsy = height * np.random.random(n)

    # Builtin ``int`` instead of ``np.int``: the alias was deprecated in
    # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
    shadow[pointsy.astype(int), pointsx.astype(int)] = 1
    shadow = ndimage.gaussian_filter(shadow, sigma=width / (2 * n))

    # 'L' yields the same luminance channel the original took from 'LA'.
    image_arr = np.array(image.convert('L'))
    image_arr = np.where(image_arr > 200, 255, 0)

    # Shade only the white background. The offset is divided by max(shadow),
    # not by (max - min), so the deepest shade is slightly below 200.
    shade = (shadow - np.min(shadow)) * 200 / np.max(shadow)
    image_arr = np.where(image_arr == 255, image_arr - shade, image_arr)

    return Image.fromarray(np.uint8(image_arr))

In [237]:
def make_bezier_shadow(image):
    """Darken ``image`` in place with a soft shadow seeded along Bezier curves.

    Two or three random Bezier curves (5 to 8 control points each) are
    rasterised into a mask, the mask is blurred with a wide Gaussian, and each
    colour channel is multiplied by ``1 - darkness * normalised_mask``.

    Parameters
    ----------
    image : PIL.Image.Image
        Image with at least three colour channels (e.g. RGB). It is modified
        in place.

    Returns
    -------
    PIL.Image.Image
        The same ``image`` object, after shading.
    """
    width, height = image.size
    shadow = np.zeros((height, width))
    n_curves = np.random.randint(2, 4)

    for _ in range(n_curves):
        n_dots = np.random.randint(5, 9)

        pointsx = width * np.random.random(n_dots)
        pointsy = height * np.random.random(n_dots)

        curve = bezier.Curve(np.asfortranarray([pointsx, pointsy]), degree=n_dots - 1)
        shadow_points = curve.evaluate_multi(np.linspace(0.0, 1.0, 100))

        # Clip so floating-point round-off in the curve evaluation can never
        # produce an index outside the mask.
        cols = np.clip(shadow_points[0].astype(int), 0, width - 1)
        rows = np.clip(shadow_points[1].astype(int), 0, height - 1)
        shadow[rows, cols] = 1

    s = np.random.uniform(0.5, 0.9)
    # The blur radius uses the control-point count of the LAST curve drawn,
    # exactly as the original code did.
    shadow = ndimage.gaussian_filter(shadow, sigma=width / (s * n_dots))

    darkness = np.random.uniform(0.7, 0.85)
    # Divided by max(shadow), not by (max - min), so the deepest attenuation
    # is somewhat weaker than ``darkness``.
    shadow = 1 - darkness * (shadow - np.min(shadow)) / np.max(shadow)

    # One vectorised multiply replaces the per-pixel Python loop. Casting to
    # uint8 truncates like ``int()`` did, and every product stays in [0, 255].
    rgb = np.asarray(image)[:, :, :3]
    shaded = (rgb * shadow[:, :, np.newaxis]).astype(np.uint8)

    # Write back into the same object: callers get the input image mutated.
    image.paste(Image.fromarray(shaded))
    return image
    

In [244]:
# Smoke run: generate a single sample; a non-None `inf` turns on progress printing.
# NOTE(review): make_samples is defined in a later cell of this export, so on a
# fresh kernel this call must be run after that definition.
make_samples(n_samples=1, inf=1)

progress 0.00 %


In [243]:

def make_samples(out_dir='', n_samples=100, inf=None,
                 poppler_path=r'/usr/local/Cellar/poppler/21.04.0/bin'):
    """Generate synthetic OCR samples: random-text pages plus their ground truth.

    For each sample a one-page PDF of random alphanumeric lines is rendered,
    rasterised, resized to a width of 680 px and written three ways:

    * ``gr/<i>.jpg``  - the clean render,
    * ``img/<i>.jpg`` - the same page after ``make_bezier_shadow``,
    * ``txt/<i>.txt`` - the text that was drawn, one line per PDF line.

    Parameters
    ----------
    out_dir : str
        Directory to write into; an empty string means the current working
        directory. The process working directory is changed to ``out_dir``
        and stays changed after the call.
    n_samples : int
        Number of samples to generate.
    inf : object or None
        When not ``None``, a progress line is printed every 10th sample.
    poppler_path : str or None
        Directory holding the poppler binaries for ``pdf2image``. If the
        directory does not exist (or ``None`` is given), ``None`` is passed
        on so poppler is looked up on ``PATH``.
    """
    if len(out_dir) == 0:
        out_dir = os.getcwd()
    os.chdir(out_dir)

    # Create all three output folders. 'gr' was previously never created, so
    # saving the clean render failed unless the folder already existed.
    for sub_dir in ('gr', 'img', 'txt'):
        os.makedirs(sub_dir, exist_ok=True)

    if poppler_path and not os.path.isdir(poppler_path):
        poppler_path = None

    aligns = ['L', 'C', 'R']
    fonts = ['Courier', 'Arial', 'Times']
    styles = ['', 'B', 'I', 'U']
    # Spaces are heavily over-represented so the strings look like words.
    alphabet = string.ascii_letters + string.digits + ' ' * 20
    temp_pdf = 'temp1.pdf'

    try:
        for i in range(n_samples):
            pdf = FPDF()
            pdf.add_page()

            font = random.choice(fonts)
            target = ''

            lines = random.randint(15, 25)
            for _ in range(lines):
                fontsize = random.randint(20, 30)
                pdf.set_font(font, size=fontsize, style=random.choice(styles))

                # Dark, near-black text colour.
                r = random.randint(0, 50)
                g = random.randint(0, 50)
                b = random.randint(0, 50)
                pdf.set_text_color(r, g, b)

                str_size = random.randint(20, 30)
                randomstr = ''.join(random.choices(alphabet, k=str_size))
                target += randomstr + '\n'

                pdf.cell(w=0, h=10, txt=randomstr, ln=1, align=random.choice(aligns))

            pdf.output(temp_pdf)
            image = convert_from_path(temp_pdf, poppler_path=poppler_path)[0]
            image = image.resize((680, int(image.size[1] * 680 / image.size[0])))

            # Save the clean render BEFORE shading: make_bezier_shadow
            # modifies the image it is given.
            image_path = 'gr/' + str(i) + '.jpg'
            image.save(image_path, 'JPEG')

            image = make_bezier_shadow(image)
            image_path = 'img/' + str(i) + '.jpg'
            image.save(image_path, 'JPEG')

            target_path = 'txt/' + str(i) + '.txt'
            with open(target_path, "w") as text_file:
                text_file.write(target)

            if inf is not None and i % 10 == 0:
                print('progress {:0.2f}'.format(i * 100/ n_samples), '%')
    finally:
        # Remove the scratch PDF even on failure; tolerate n_samples == 0,
        # where it was never written.
        if os.path.exists(temp_pdf):
            os.remove(temp_pdf)

In [249]:
import Levenshtein

In [253]:
import re

# OCR the shadowed sample, squeeze runs of newlines into one, then drop every
# newline that is directly followed by a space.
ocr_shadowed = pytesseract.pytesseract.image_to_string(Image.open('img/0.jpg'))
s = re.sub('\n ', '', re.sub('\n+', '\n', ocr_shadowed))
print(s)

   
K5 ch9YmI 171
PN sQ E 0s SPabs7 maOJ pl
DCS D9FymSZE m dYA3W v
cWy3y SmeWgDnX h6t DgAUD hMbl7
yy DuTj0 yysIK aeywFi869
zSiWj6B 1DeGjlMp 0 wTNyzng u



In [251]:
import re
# OCR the clean (unshadowed) render and apply the same two clean-up
# substitutions used for the shadowed sample.
clear = re.sub('\n+', '\n', pytesseract.pytesseract.image_to_string(Image.open('gr/0.jpg')))
# Fixed copy-paste slip: the second pass and the print previously used `s`,
# which overwrote `clear` with the other cell's text.
clear = re.sub('\n ', '', clear)
print(clear)

Gbb957eJk L 0 QJIWVEI
1jl AUNWPECXEAQ 1h
3qH Mv 6r2 9Q4 kuollgie
IWIFFUWQRNIYLX gk GC HaPQv
Gnx LvB ARQidbClIsjdAE4uvY
IRI tYEVwxCg uliP
sm MKU 6uws3B 4 9tzBS xI01
L76tiWAjz4ljfkK k8D IqUa
YF2 SIoE7 a OMyNEAehTI a
kG Gu CJzS ZDV4 X55cBecm
n2BC uMBBYvjv Iyz j4 1h
EM eDUm 915 ToU xaD
oleDv6 LH wtGQsny 7 1
370 RHwX fMrMPk w30N 9 Dwl
hQ 6Q6mZ97J50 bxk6
K5 ch9YmI 171H e7nkHU
PN sQ E 0s SPabs7 maOJ pE
DCS DO9FymSZE m dYA3W v
cWy3y SmeWgDnX h6t DgAUD hMbl7
yy DuTj0 yysIK aeywFi869
zSiWj6B 1DeGjlMp 0 wTNyzng u



In [258]:
truth = ''
# Load the ground-truth text written for sample 0; the context manager closes
# the file, which the bare open() never did.
with open("txt/0.txt") as f:
    gr = f.read()
# Collapse every run of spaces to a single space. Two passes of '  ' -> ' '
# left runs of five or more spaces only partly collapsed.
gr = re.sub(' +', ' ', gr)


print(gr)

 Gbb 9 57eJ k L 0 QJ1WVF l 
I j1 AUNWpECXEAQ l h
3qH Mv 6r2 9Q4 kuoIlqie
 9WJFfUwQRNJYLX gk GC HaPQv
Gnx LvB ARQidbCIsjdAE4uvY
lRI tYEVwxCg uliP 
sm MKU 6uws3B 4 9tzBS xI0l
 L76tiWAjz4IjfkK k8D IqUa
YF2 SIoE7 a 0MyNEAehTl a
kG Gu CJzS ZDV4 X55cBcm
 n2BC uMBBvjv Iyz j4 1h
EM eDUm 9r5 ToU xaD 
o IeDv6 LH wtGQsny 7 1
3 ZO RHwX fMrMPk w30N 9 Dwl
 hQ 6Q6mZ97J5O bxk6 
 K5 ch9YmI l71H e7nkHU
PN sQ E 0s SPabs7 maOJ pE
DCS D9FymSZE m dYA3W v
cWy3y SmeWqDnX h6t DqAUD hMbl7
 yy DuTj0 yyslK aeywFi869 
zSiWj6B 1DeGjlMp 0 wTNyzng u



In [259]:
# Edit distance between the ground-truth text `gr` (read from txt/0.txt) and `s`,
# the cleaned OCR output for the shadowed image img/0.jpg.
Levenshtein.distance(gr, s)

378

In [260]:
# Edit distance between the ground-truth text `gr` and `clear`.
# NOTE(review): `clear` is assigned in the cell that OCRs gr/0.jpg; confirm it
# really holds that cell's cleaned OCR text before comparing this number with
# the distance computed for `s`.
Levenshtein.distance(gr, clear)

53