In [1]:
from PIL import Image, ImageDraw
import string
import random
import os
from pdf2image import convert_from_path 
import numpy as np
from scipy import ndimage
import matplotlib.pyplot as plt
from fpdf import FPDF
import pytesseract


In [60]:
def make_shadow(image):
    """Overlay a soft, randomly placed shadow on a binarised copy of ``image``.

    A handful of random seed pixels are blurred with a Gaussian filter to form
    a smooth shadow map. The page is thresholded to pure black/white, and every
    white pixel is then darkened in proportion to the shadow map (by at most
    200 grey levels). Black pixels are left untouched.

    Parameters
    ----------
    image : PIL.Image.Image
        Source page image; only its size and grey channel are used.

    Returns
    -------
    PIL.Image.Image
        Image built from a 2-D ``uint8`` array with the same width and height
        as ``image``.
    """
    width, height = image.size

    n = np.random.randint(2, 6)  # upper bound is exclusive: 2..5 seed points
    shadow = np.zeros((height, width))

    pointsx = width * np.random.random(n)
    pointsy = height * np.random.random(n)

    # `np.int` was removed in NumPy 1.24; the builtin `int` is the drop-in
    # replacement (and is what make_bezier_shadow already uses).
    shadow[pointsy.astype(int), pointsx.astype(int)] = 1
    # Fewer seed points -> wider blur, so the shadow always spans a good part
    # of the page width.
    shadow = ndimage.gaussian_filter(shadow, sigma=width / (2 * n))

    # Channel 0 of the 'LA' conversion is the luminance channel.
    image_arr = np.array(image.convert('LA'))[:, :, 0]
    # Binarise: anything brighter than 200 becomes white, the rest black.
    image_arr = np.where(image_arr > 200, 255, 0)
    # Darken white pixels only; the shadow map is shifted to start at 0 and
    # scaled so that its contribution stays within 0..200.
    image_arr = np.where(image_arr == 255, image_arr - (shadow - np.min(shadow)) * 200 / np.max(shadow), image_arr)

    return Image.fromarray(np.uint8(image_arr))

In [93]:
def make_bezier_shadow(image):
    """Overlay curved, Bezier-shaped soft shadows on a binarised copy of ``image``.

    Two or three random Bezier curves (5 to 8 control points each) are
    rasterised at 100 sample points into an empty map, which is then blurred
    with a Gaussian filter. The page is thresholded to pure black/white, and
    every white pixel is darkened in proportion to the blurred map (by at most
    200 grey levels). Black pixels are left untouched.

    Parameters
    ----------
    image : PIL.Image.Image
        Source page image; only its size and grey channel are used.

    Returns
    -------
    PIL.Image.Image
        Image built from a 2-D ``uint8`` array with the same width and height
        as ``image``.
    """
    # Imported here because the notebook's import cell never imports `bezier`;
    # without this the function raises NameError on a fresh kernel.
    import bezier

    width, height = image.size
    shadow = np.zeros((height, width))
    n_curves = np.random.randint(2, 4)  # upper bound is exclusive: 2 or 3 curves

    for _ in range(n_curves):
        n_dots = np.random.randint(5, 9)  # 5..8 control points

        pointsx = width * np.random.random(n_dots)
        pointsy = height * np.random.random(n_dots)

        # Row 0 holds x coordinates, row 1 holds y coordinates.
        curve = bezier.Curve(np.asfortranarray([pointsx, pointsy]), degree=n_dots - 1)

        shadow_points = curve.evaluate_multi(np.linspace(0.0, 1.0, 100))
        # The shadow map is indexed [row, column], i.e. [y, x].
        shadow[shadow_points[1].astype(int), shadow_points[0].astype(int)] = 1

    # The blur radius is derived from the control-point count of the LAST
    # curve drawn, exactly as in the original code.
    shadow = ndimage.gaussian_filter(shadow, sigma=width / (2 * n_dots))

    # Channel 0 of the 'LA' conversion is the luminance channel.
    image_arr = np.array(image.convert('LA'))[:, :, 0]
    # Binarise: anything brighter than 200 becomes white, the rest black.
    image_arr = np.where(image_arr > 200, 255, 0)
    # Darken white pixels only; the shadow map is shifted to start at 0 and
    # scaled so that its contribution stays within 0..200.
    image_arr = np.where(image_arr == 255, image_arr - (shadow - np.min(shadow)) * 200 / np.max(shadow), image_arr)

    return Image.fromarray(np.uint8(image_arr))
    

In [101]:

def _write_random_text_pdf(pdf_path):
    """Render one page of random text lines to ``pdf_path`` and return that text."""
    aligns = ['L', 'C', 'R']
    fonts = ['Courier', 'Arial', 'Times']
    styles = ['', 'B', 'I', 'U']
    # Ten spaces make a blank roughly as likely as ten distinct characters,
    # which breaks the random strings into word-like chunks.
    alphabet = string.ascii_letters + string.digits + ' ' * 10

    pdf = FPDF()
    pdf.add_page()

    # One font family per page; size, style and alignment vary per line.
    font = random.choice(fonts)

    target_lines = []
    for _ in range(random.randint(15, 25)):
        fontsize = random.randint(10, 16)
        pdf.set_font(font, size=fontsize, style=random.choice(styles))

        randomstr = ''.join(random.choices(alphabet, k=random.randint(20, 50)))
        target_lines.append(randomstr + '\n')

        pdf.cell(w=0, h=10, txt=randomstr, ln=1, align=random.choice(aligns))

    pdf.output(pdf_path)
    return ''.join(target_lines)


def make_samples(out_dir='', n_samples=100, inf=None,
                 poppler_path=r'/usr/local/Cellar/poppler/21.03.0_1/bin'):
    """Generate synthetic OCR samples: shadowed page images plus ground-truth text.

    For each sample a one-page PDF of random text is rendered, rasterised,
    resized to a width of 1000 px, passed through ``make_bezier_shadow`` and
    saved as ``<out_dir>/img/<i>.jpg``. The text that was rendered is written
    to ``<out_dir>/txt/<i>.txt``.

    Unlike the earlier version, this function does not change the process
    working directory; all paths are built explicitly from ``out_dir``.
    Missing directories (including ``out_dir`` itself) are created.

    Parameters
    ----------
    out_dir : str
        Output directory. An empty value means the current working directory.
    n_samples : int
        Number of image/text pairs to generate.
    inf : object, optional
        Progress switch: any value other than ``None`` prints a progress line
        every 10th sample.
    poppler_path : str or None
        Directory containing the poppler binaries, forwarded to
        ``pdf2image.convert_from_path``. The default is the location that was
        previously hard-coded; pass ``None`` to let pdf2image look on ``PATH``.
    """
    if not out_dir:
        out_dir = os.getcwd()

    img_dir = os.path.join(out_dir, 'img')
    txt_dir = os.path.join(out_dir, 'txt')
    # exist_ok=True tolerates re-runs without swallowing real errors
    # (permissions, a regular file in the way, ...).
    os.makedirs(img_dir, exist_ok=True)
    os.makedirs(txt_dir, exist_ok=True)

    pdf_path = os.path.join(out_dir, 'temp1.pdf')
    try:
        for i in range(n_samples):
            target = _write_random_text_pdf(pdf_path)

            image = convert_from_path(pdf_path, poppler_path=poppler_path)[0]
            # Normalise the width to 1000 px, keeping the aspect ratio.
            image = image.resize((1000, int(image.size[1] * 1000 / image.size[0])))

            image = make_bezier_shadow(image)
            image.save(os.path.join(img_dir, str(i) + '.jpg'), 'JPEG')

            with open(os.path.join(txt_dir, str(i) + '.txt'), 'w') as text_file:
                text_file.write(target)

            if inf is not None and i % 10 == 0:
                print('progress {:0.2f}'.format(i * 100 / n_samples), '%')
    finally:
        # The scratch PDF does not exist when n_samples == 0 or when the very
        # first render fails, so only remove it if it is there.
        if os.path.exists(pdf_path):
            os.remove(pdf_path)

In [102]:
# Smoke run: write five samples into ./img and ./txt with progress reporting on.
make_samples(inf=1, n_samples=5)

progress 0.00 %


In [119]:
import re

# OCR the first generated sample, then tidy the raw text: collapse runs of
# newlines into one, and delete every newline that is directly followed by a
# space.
s = pytesseract.image_to_string(Image.open('img/0.jpg'))
s = re.sub('\n+', '\n', s)
s = re.sub('\n ', '', s)
print(s)

E351HLS 532} 2JwWLO6JD211cV8
pvoxXyybOvANFizuEGWDTfm2
© GSHYVWVqLWx19wj148Ibr
x9AWVMd6 ULd uXWelamTK Bn2YoN7ZTKMvsW zWG
mvRNS HuoN xKC cCybj3d
9NOMP FeZA jQJVUwLM pSEqyz8P
O9DbUxIn OcriCz dwSzm8ykuue
VLtkP. TIYXq5IzLualB3 Ba
urS r1P w 6UMpGIr3 zinc d29ud uF77 zHw0kh m
EZ1 EXCS j             
gmMqI6XM3I 1 G 8fzKmkoCmft h5KpK1Di
Ei 1x42 3 xU62uE SUEO 9240
jP Sb92Z82HOwlf yEUxWIR7INT
genhul KotnI sXg 
L
Veh Pl r SzV 7JmuMel ahInAO r o
yJpjG6 cH Si MtLX47h
WO JBHHab oDYd2n0y
mrt ig EaaStys
TPtw8jPaklTegiH CWFfdyyKWilrnJ aK TSyF EAHag
KL w2FpRtUTIVK B TL xlwH1
w hl4g4ius bc 2 did    
SIS DODAKUAOkCPSOQOn6!
rLVyE34K rvvloknxv4
gmam2UAT@vVxnIyJQ VM3BK Uvé
HHTKaeol mLp B hgF KsGloDclCNhU 9 PW
t9av 6 RGaB23n haGjcCi

