### 2 - Detect text in images and save as JSON

#### Add JSON credential file

In [1]:
import pandas as pd
import json
import os
import io
from shutil import copyfile
import re
import itertools
from google.cloud import vision
from datetime import datetime
import numpy as np
from google.protobuf.json_format import MessageToDict
pd.set_option('display.max_rows', 10)

### Google vision credentials

In [2]:
# Location of the JSON credential file; must be filled in before running.
cred_file = '' # LINK TO JSON CREDENTIAL FILE
# Parse the file up front so a missing or malformed credential file fails here.
with open(cred_file) as key_file:
    creds = json.loads(key_file.read())
# Publish the same path through the GOOGLE_APPLICATION_CREDENTIALS variable.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = cred_file

### Detect text in images

In [3]:
def detect_text(path):
    '''use Google vision API to detect text from image
    only returns raw text, not coordinates of text content

    path: location of the image file whose bytes are sent to the API
    returns: the 'text' entry of the response's 'full_text_annotation',
    or '' when the response does not contain one
    errors raised while reading the file or calling the API are propagated
    '''
    client = vision.ImageAnnotatorClient()
    with io.open(path, 'rb') as image_file:
        content = image_file.read()
    image = vision.types.Image(content=content)
    response = client.text_detection(image=image)
    response = MessageToDict(response, preserving_proto_field_name=True)
    # A response without detected text has no 'full_text_annotation' entry in
    # the converted dict; look the keys up explicitly instead of hiding every
    # possible exception behind a bare except.
    return response.get('full_text_annotation', {}).get('text', '')

### For each mill: detect text in images and save content as JSON

In [4]:
def save_content(mill, ids=None):
    '''detect and save text from images as json
    if ids are specified, only save image content corresponding to these IDs
    otherwise save content for all images of this mill

    mill: name of the mill sub-folder inside ../Images/weight_receipt
    ids: optional iterable of image IDs (compared as strings with the part of
         the file name before the first '_'); None or empty means all images
    returns: dict mapping image ID to detected text, including entries that
             were already stored in <mill>_text_contents.json

    Only files ending in 'weight_receipt.jpg' are processed, and IDs already
    present in the json file are skipped. The json file is rewritten even when
    detection fails part-way, so text detected before the failure is kept.
    '''
    directory = '../Images/weight_receipt'
    mill_directory = directory + '/' + mill
    json_path = directory + '/' + mill + '_text_contents.json'

    # Resume from earlier runs when a contents file already exists.
    if os.path.isfile(json_path):
        with open(json_path) as f:
            text_contents = json.load(f)
    else:
        text_contents = {}

    receipts = [file for file in os.listdir(mill_directory)
                if file.endswith('weight_receipt.jpg')]
    if ids:
        # A set gives constant-time lookups; str() lets callers pass integer IDs.
        wanted = {str(image_id) for image_id in ids}
        receipts = [image for image in receipts if image.split('_')[0] in wanted]

    try:
        for image in receipts:
            ID = image.split('_')[0]
            if ID in text_contents:
                print('image', ID, 'already processed')
                continue
            print('detecting text content of image', ID)
            text_contents[ID] = detect_text(mill_directory + '/' + image)
    finally:
        # Persist whatever has been collected, even if a detection call raised,
        # so finished images are not sent to the API again on the next run.
        with open(json_path, 'w') as outfile:
            json.dump(text_contents, outfile)
    print('Done\n')
    return text_contents

### Run image recognition algorithm

In [5]:
def run_recognition(mill, ids=None):
    '''run image recognition algorithm and save content in json format

    mill: name of the mill sub-folder inside ../Images/weight_receipt/
    ids: optional list of image IDs; when None or empty, the IDs are taken
         from the part before the first '_' of every .jpg file of the mill
    returns: the dict of text contents returned by save_content
    '''
    directory = '../Images/weight_receipt/'
    if not ids:
        # Only list the folder when the caller did not choose the IDs.
        ids = [image.split('_')[0] for image in os.listdir(directory + mill)
               if image.endswith('.jpg')]
    return save_content(mill, ids)

#### Run recognition for five chosen mills: arvena, bss, nhr, skip and srjnad

In [1]:
# Mills whose receipt images are sent through text detection.
folders = ['arvena', 'bss', 'nhr', 'skip', 'srjnad']
for mill in folders:
    print('Processing mill '+mill)
    path = '../Images/weight_receipt/'+mill
    # Keep only the .jpg files of this mill's folder.
    files = [name for name in os.listdir(path) if name.endswith('.jpg')]
    # The image ID is the part of the file name before the first underscore.
    ids = [name.split('_')[0] for name in files]
    run_recognition(mill, ids)

### Save date and time when images were uploaded to the platform

In [13]:
# Build one table with the ID, date and time encoded in every image file name
# and write it next to the mill folders as date_time_upload.csv.
path = '../Images/weight_receipt'
cols = ['ID', 'Date created', 'Time created']
mills = [path+'/'+name for name in ['arvena', 'bss', 'nhr', 'skip', 'srjnad']]
df_upload = pd.DataFrame([], columns=cols)
for mill in mills:
    files = [name for name in os.listdir(mill) if name.endswith('.jpg')]
    # Underscore-separated fields of each name: 0 = ID, 3 = date (yymmdd), 4 = time (HHMM).
    parts = [name.split('_') for name in files]
    df = pd.DataFrame([], columns=cols)
    df['ID'] = [int(fields[0]) for fields in parts]
    dates = [fields[3] for fields in parts]
    df['Date created'] = [
        datetime.strptime(stamp, '%y%m%d').strftime('%m/%d/%Y') for stamp in dates
    ]
    times = [fields[4] for fields in parts]
    df['Time created'] = [
        datetime.strptime(stamp, '%H%M').strftime('%H:%M') for stamp in times
    ]
    # Rows are appended mill by mill; each mill keeps its own 0-based index.
    df_upload = pd.concat([df_upload, df], sort=False)
df_upload.to_csv(path+'/'+'date_time_upload.csv')