In [1]:
import numpy as np
import os
import io
from PIL import Image
import pytesseract
import csv

from pytesseract import Output
import pandas as pd

# The repo root is taken to be the parent of the directory the notebook
# is running from.
notebook_dir = os.path.abspath(os.getcwd())
repo_path = os.path.dirname(notebook_dir)

# Folder holding the scanned images, relative to the repo root.
path_scans = os.path.join(repo_path, 'files', 'scanned-data')


def _is_scan_image(filename):
    """Return True when `filename` is a JPEG scan that should be processed."""
    # Windows Subsystem for Linux leaves ':Zone.Identifier' side files around.
    if filename.endswith(':Zone.Identifier'):
        return False
    # Dot-files are other file system cruft.
    if filename.startswith('.'):
        return False
    # Only JPEG scans (lower-case extension) are of interest.
    return filename.endswith('.jpeg') or filename.endswith('.jpg')


# Walk the whole scan tree and keep the full path of every matching file.
file_paths = [
    os.path.join(dirpath, filename)
    for dirpath, _, filenames in os.walk(path_scans)
    for filename in filenames
    if _is_scan_image(filename)
]

num_orig = len(file_paths)
print(f'We have {num_orig} files to process')


We have 15 files to process


In [2]:
# OCR every scanned image found in the previous cell and save the recognised
# words of each image as one CSV file (one CSV row per reconstructed text
# line, one cell per whitespace-separated token). Images whose CSV already
# exists are skipped, so the cell can be re-run to resume an interrupted job.

# Not used in this cell; kept because other cells may reference it.
output_format = 'png'

# Options handed straight to the tesseract command line
# (language, OCR engine mode, page segmentation mode).
custom_config = r'-l eng --oem 3 --psm 6'

# Marker used to separate reconstructed text lines before splitting them.
LINE_BREAK = '<br/>'

# Root of the output tree; it mirrors the folder layout under `path_scans`.
path_csvs = os.path.join(repo_path, 'files', 'scanned-data-csvs')


def _block_to_text(block_df):
    """Rebuild the text of one OCR block, approximating its horizontal layout.

    Args:
        block_df: rows of the pytesseract data frame belonging to a single
            `block_num`, in reading order. Uses the `text`, `width`, `left`,
            `par_num` and `line_num` columns.

    Returns:
        The block as a single string. Every paragraph/line change is marked
        with `LINE_BREAK`, and words are padded with spaces so that their
        character offset roughly matches their `left` pixel position.
    """
    # Estimate the pixel width of one character from multi-character tokens.
    multi_char = block_df[block_df['text'].str.len() > 1]
    char_w = (multi_char['width'] / multi_char['text'].str.len()).mean()
    # char_w is NaN when the block has no multi-character token, and could be
    # 0 for zero-width boxes; padding is meaningless (or crashes) in both cases.
    can_pad = bool(char_w > 0)

    prev_par, prev_line, prev_left = 0, 0, 0
    parts = []
    for _, ln in block_df.iterrows():
        # Start a new text line whenever the paragraph or the line changes.
        if prev_par != ln['par_num']:
            parts.append(LINE_BREAK)
            prev_par = ln['par_num']
            prev_line = ln['line_num']
            prev_left = 0
        elif prev_line != ln['line_num']:
            parts.append(LINE_BREAK)
            prev_line = ln['line_num']
            prev_left = 0

        added = 0  # number of padding spaces inserted before this word
        if can_pad and ln['left'] / char_w > prev_left + 1:
            added = int(ln['left'] / char_w) - prev_left
            parts.append(' ' * added)
        parts.append(ln['text'] + ' ')
        prev_left += len(ln['text']) + added + 1
    parts.append(LINE_BREAK)
    return ''.join(parts)


for i, file_path in enumerate(file_paths, start=1):
    # Mirror the image's location below `path_scans` into `path_csvs`,
    # swapping only the final extension for '.csv'.
    rel_path = os.path.relpath(file_path, path_scans)
    new_file = os.path.join(path_csvs, os.path.splitext(rel_path)[0] + '.csv')
    new_dir = os.path.dirname(new_file)
    os.makedirs(new_dir, exist_ok=True)
    if os.path.exists(new_file):
        # Already converted on an earlier run.
        continue
    print(f'[{i} of {num_orig}] working on {file_path}')

    # Close the image file handle as soon as OCR is done.
    with Image.open(file_path) as img_rgb:
        d = pytesseract.image_to_data(
            img_rgb, config=custom_config, output_type=Output.DICT
        )
    df = pd.DataFrame(d)
    if df.empty:
        print(f'No OCR data returned for {file_path}')
        continue

    # Clean up blanks: drop rows without a confidence value (-1) and rows with
    # no visible text. `conf` may arrive as strings or as numbers depending on
    # the pytesseract version, so normalise it before comparing.
    conf = pd.to_numeric(df['conf'], errors='coerce')
    df1 = df[(conf != -1) & (df['text'].str.strip() != '')]

    # Process blocks from top to bottom of the page.
    sorted_blocks = (
        df1.groupby('block_num').first().sort_values('top').index.tolist()
    )

    # Collect the rows of ALL blocks first; the CSV is written once per image
    # so that later blocks do not overwrite earlier ones.
    csv_data = []
    for block in sorted_blocks:
        text = _block_to_text(df1[df1['block_num'] == block])
        print(text)
        # Now split the lines!
        csv_data.extend(line.split() for line in text.split(LINE_BREAK))

    if not csv_data:
        print(f'No text recognised in {file_path}')
        continue

    with open(new_file, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerows(csv_data)

        
    

[1 of 15] working on /home/ekansa/github/open-context-jupyter/files/scanned-data/color-box-4/scan-2-box-4.jpeg
<br/>           Box  4 cont. <br/>           AD.      74  - BS6 - plaster  recess,  close-up  from  S, scale  20 cn. <br/>           48,      73  - BS? -     :       "   interior,  from  SE.              . <br/>           49,      "3 os B58 =     tt      '"     it       W    SW.              / <br/>           506      74  - B59 -     "  floor  of room  east end,  SF 536  flint blades, <br/>                                horn  core,  from W. <br/>           D1.      74  - B6O -  room with  partition  removed,  plaster  floor 103, <br/>                                 from E. <br/>           Bo.      74  - B6l -  floor 103,  post-hole  110, from  SW. <br/>           236      73  — B62 <br/>           os       73  — BGS <br/>            DO.     74  - B64 -  feature  in SE corner  of plaster  floor,  from NW, <br/>                                 scale 20  cm. <br/>            56

<br/>             Box  4 cont, <br/>             101.      72 - E102  - as E100,  slightly darker. <br/>             102.      72 - E106  - whole trench,  from SE <br/>             1034.     72 ~ E107  - as E106,  darker. <br/>             104.      72 - plan  of earlier  buildings. <br/>             105.      72 - EZ  - 9 - animal  bones SF 80  between walls  19,24 from N. <br/>             106.      73 - ES  - 10 - close-up  of animal  bones SF 80  and assoc. <br/>                                       artifacts,  from N,  scale 10 cm. <br/>             LO? «     72 - E2  - 5 - pits 9,  13, 7, 16  level 11 between,  from N. <br/>             108.      73 - Ee  - 7 - pits 9,  24, plaster  floors and walls  below <br/>                                      level  19, from S, scale  50 cm. <br/>             109.      73 - B2  - 9 - Neolithic  burial level  57, from NNE,  scale <br/>                                      20 cm.  73. /36, 73, fT <br/>             dos       73 - E2  - 10 - u

<br/>          Box  6              Trench E <br/>          Le      73-B2-19      Main building  from NE. <br/>          ae      73-E2-20        =      7    and others  from SE. <br/>          3»      7 3-H2-20     as  above,  lighter <br/>          4,      75-E2-21     ox scapula  bottom level  95, scale  50 cm. <br/>          De      73-E2-22      Neolithic  burial level  120 under  wall 12 west  end <br/>                                of trench,  from NE,  scale 20 cm. <br/>          6.      735-H2-23     burial  level 120, from  W, scale  20 cm. <br/>          Ze      73-He-25      burial level  120, skull  with fling  in, from SW, <br/>                                scale 5  cm. <br/>          Bs      75-E2-26      burial level  120 with  horn core,  stone ball, from <br/>                                S, scale  10cm. <br/>          9.      73-E2-24      lines on mud-brick  surface  below  level 113, from <br/>                                SE, scale  20 cn. <br/>          10. 

<br/>              Box 5 <br/>              ls      72 - C4  - skeleton  SF21  level  16, from  SE, scale  20 cm. <br/>              Ze      72 - Cle  - pie       SF  86, necklace  of  beads,  from S,  scale <br/>                                   O  cm. <br/>              Bin     92 - Cl4  - walls  1, 2  from SE  corner. <br/>              4,      72 -  C17? -   "   "    " & poss. wall frag.  from  SW corner,  scale <br/>                                  1m. <br/>              Si      72 -  C20 ~ burials  from  NE corner,  scale  ln. <br/>              6.      92 -  C27 - wall  6, from  E. <br/>              os      22 _  C29 _   wf  "    "   W. <br/>              8.      73 -  C9/10 -  floor 5  with stone  tools,  from N,  scale  50 cm. <br/>              9.      74 —- Cll - walls  in  eastern  extension,  from NW,  scale  lm. <br/>              10.     73 -  Cl2 - walls  A, B,  C, skeletons   85, 93,  from NW. <br/>              ll.     7% =  C16 - skeletons   52, 89  level 9,  from

<br/>              Bex   ( <br/>                                                                                      ae <br/>            3h    1973, Central ridge,  trench F and village  from trench G.              . <br/>            32     0    Trench B  and house from trench  A. <br/>            33    1972, House and  back slope of central  ridge looking north. <br/>           34     1973, House on  mound, close-up from  west. <br/>            35     "             ditto <br/>           36      u    Roof inside  house on mound, newly  laid.                         ‘ <br/>            at     "             ditto <br/>           38      "    Second roof  inside house on mound,  newly laid. <br/>            39     i             ditto <br/>           4.0    1972, Trench C  from trench B. <br/>           Al      "    Trench C  and trench D from trench  B. <br/>           42      "    Trench B  from trench CG. <br/>           43     1973, Trench D  from foot of mound. <br/>           AA     

<br/>           Box 2 <br/>           o«       74 - Tina McGeorge  with  skull. <br/>           52.      73 -   a     a    cleaning  skull. <br/>           54.      73 -   "     "    measuring  nose of skull. <br/>           Oe       woo   i"     "        "    jaw. <br/>           Ob.      74 - Thanksgiving  party:  Barbara, George  (garage),  Betty, <br/>                         Debby,  Charlie, Tom,  Hillal, Cathy,  Benj, Diana,    : <br/>                         Cliff,  Val, Colin,  self. <br/>           56.      74 - The same,  less Hillal. <br/>            7) 3    73 - Betty,  Debby, Charlie. <br/>           58.      74 - Debby,  Charlie. <br/>           59.      73 - Charlie,  Tom, Kathy. <br/>           Gis      743 - Val, Tony Allen, Colin. <br/>           6e.      72 - Barbara,  donkey, Bos  Prim. skull. <br/>           63.      72 - Donkey  nuzzling Barbara. <br/>           64.      72 - Barbara  and Donkey. <br/>           65.      72 - Barbara,  Mohammed K's  wife rolling  