Dependencies

In [1]:
from PIL import Image
import pytesseract
import numpy as np
import cv2
import csv

Declare directories for all images in input folder

In [7]:
dir = './input/'
from os import listdir
filelist = sorted([f for f in listdir(dir) if f.lower().endswith('.png')])
filename = [dir + s for s in filelist]

Image processing & OCR

In [8]:
text = []
for count, value in enumerate(filename):
    img = cv2.imread(value)
    hsv0 = cv2.cvtColor(img,cv2.COLOR_BGR2HSV)
    #remove jabber bg
    jMIN = np.array([50, 28, 142],np.uint8)
    jMAX = np.array([87, 82, 207],np.uint8)
    maskJ = cv2.inRange(hsv0, jMIN, jMAX)
    img[maskJ>0] = (0,0,0) #using maskJ, make that area black
    hsv = cv2.cvtColor(img,cv2.COLOR_BGR2HSV)

    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    ret1,img = cv2.threshold(img,156,255,cv2.THRESH_BINARY) #global threshold

    #Alliance Legion to White
    aMIN = np.array([17, 137, 168],np.uint8)
    aMAX = np.array([19, 146, 220],np.uint8)
    maskA = cv2.inRange(hsv, aMIN, aMAX) #creates a mask of color range between acMIN and acMAX
    maskA[:,:669] = 0 #remove left
    maskA[:,749:] = 0 #remove right
    kernel = np.ones((2, 2), 'uint8')

    #Timer to White
    tMIN = np.array([16, 28, 122],np.uint8)
    tMAX = np.array([21, 71, 240],np.uint8)
    maskT = cv2.inRange(hsv, tMIN, tMAX)
    maskT[:,:636] = 0 #remove left
    maskT[:,780:] = 0 #remove right

    #Infernal to White
    iMIN = np.array([9, 122, 170],np.uint8)
    iMAX = np.array([29, 197, 255],np.uint8)
    maskI = cv2.inRange(hsv, iMIN, iMAX)
    maskI[:,:37] = maskI[:,77:671] = maskI[:,749:] = 0 #remove

    img[maskA>0] = img[maskT>0] = img[maskI>0] = 255 #add Alliance, Timer, and Infernal mask
    img[:805,:] = img[:,:35] = img[:,785:] = img[:,370:630] = 0 #clear upper, left, and right borde

    img = cv2.resize(img, (img.shape[1]*3,img.shape[0]*3))
    kernel = np.ones((4, 4), 'uint8')
    img = cv2.dilate(img, kernel, iterations=1)
    img = cv2.bitwise_not(img) #Invert image to negative

    #Tesseract OCR
    custom_config = r'-l eng --psm 6 -c tessedit_char_whitelist="01234567890ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz! "  --user-words words.txt --user-patterns patterns.txt'
    text.append(pytesseract.image_to_string(img, config=custom_config))


Dictionary List

In [9]:
import re

typeDict = (
'liance',
'Horde',
'ernal',
'L?egion',
'Persona'
)

durList = (
'[2ao]\s?hr',
'[6S]\s?hr',
'8\s?hr',
'12\s?hr',
'24\s?[hb]r',
'48\s?[hr][re]',
'150\s?hr',
'[7T]\s?day'
) # (\s?) is zero or 1 occurence of whitespace for regex 

nameDict = ('Complete Trials',
'Conquest Challenge',
'Damage Enemy Location',
'Defeat Armies',
'Defeat Foes',
'Defeat Units',
'Gain Champion XP',
'Gain Iron',
'Gain Power',
'Gain Resources',
'Raid Resources',
'Ritual Completion',
'Shard Defeat Foes',
'Train Units',
'Unleash Power',
'Unleash Resources',
'Warpstone Devourer',
'Defeat Dwarven Armies',
'Defeat Dwarven F[oa][egp]s',
'Defeat Empire Armies',
'Defeat Empire Foes',
'Defeat Greenskin Armies',
'Defeat Greenskin Foes',
'Defeat Greenskin [Ss]qu[if]gs',
'Damage Khorne Foes',
'Damage Nurgle Foes',
'Damage Slaneesh Foes',
'Damage Tzeentch Foes',
'Damage Undivided Foes',
'City Siege',
'Empire City Domination'
)

tRegex = (
'[0-9l]d\s?[0-5O]?[0-9]h\s?[0-5][0-9]m',
'[0-2]?[0-9l]?h?\s?[0-5][0-9]m\s?[0-5][0-9]s'
)

allDict = ('DOUBLE REWARDS',) + typeDict + nameDict + durList + tRegex

typeA = re.compile(r'(' + (r'|'.join(allDict)) +r')') #All
typeD = re.compile(r'(' + (r'|'.join(durList)) + r')') #Duration
typeN = re.compile(r'(' + (r'|'.join(nameDict)) + r')') #Name
typeT = re.compile(r'(' + (r'|'.join(tRegex)) + r')') #Timer

Cleaning data in arrays

In [10]:
#Results, Initialize arrays
sR = []
dR = []
tR = []
oD = []
count = 0

#DateTime
from datetime import datetime, timedelta

def get_date_taken(path):
    return Image.open(path)._getexif()[36867]

for count, value in enumerate(text):
    findA = typeA.findall(value)
    findA = [w.replace('ernal', 'Infernal')
        .replace('liance', 'Alliance')
        .replace('Persona', 'Personal')
        .replace('Location', 'Locations')
        .replace('Squfgs', 'Squigs') for w in findA]


    while re.match('Alliance|Horde|Infernal|Legion|Personal|DOUBLE REWARDS',findA[0]) == None: #while first element is not DR or typeDict, remove it
        del findA[0]
    while re.match("(" + ")|(".join(tRegex) + ")",findA[-1]) == None: #while last element is not Timer, remove it
        del findA[-1]

    Results = ' '.join(findA)
    sResults = typeD.sub('',Results) #remove all Duration using typeD from DurList
    sResults = re.sub('\s+',' ',sResults) #change double white space to 1
    sResults = re.sub('Fo[g|p]s','Foes',sResults) #change double white space to 1
    sResults = re.split("" + "|\s*".join(tRegex) + "\s*",sResults) #with brackets () outside the sq brackets[], the delimiter is captured/kept
    sR.append(sResults[0])
    sR.append(sResults[1])

    dResults = typeD.findall(Results)
    dResults = [sub.replace('7 day', '168') for sub in dResults] #convert 7day into 168
    dResults = [sub.replace('hr', '') for sub in dResults] #remove all hr
    dR.append(dResults[0])
    dR.append(dResults[1])

    tResults = typeT.findall(Results)
    tResults = [sub.replace('l', '1') for sub in tResults] #replace l with 1
    tResults = [sub.replace(' ', '') for sub in tResults] #remove all whitespace
    tR.append(tResults[0])
    tR.append(tResults[1])

    DTstr = get_date_taken(filename[count])
    DTobj = datetime.strptime(DTstr, '%Y:%m:%d %H:%M:%S')
    oD.append(DTobj)
    oD.append(DTobj)
sR = [x.strip(' ') for x in sR] #strip whitespace both ways

IndexError: list index out of range

Writes output.txt showing all texts after OCR

In [12]:
a=[]
for count, value in enumerate(text):
    findA = typeA.findall(value)
    a.append(findA)

with open("output.txt", "w") as fp:
    for item in a:
        # write each item on a new line
        fp.write("%s\n" % item)
    print('Done')

Done


View output of all desired text

In [23]:
findA = typeA.findall(text[3])
findA

['8 hr',
 'Complete Trials',
 '1d 04h 51m',
 'Legion',
 '48 hr',
 'Defeat Armies',
 '8 hr',
 'Defeat Foes']

Date created detection using Python Imaging Library (Pillow)

In [None]:
import pytz
#start empty arrays
sT = []
eT = []
#Round off datetime
def round1s(t):
    # Rounds to nearest hour by adding a timedelta hour if minute >= 30
    return (t.replace(second=1, minute=0, hour=t.hour)
               +timedelta(hours=t.minute//30))

for count, value in enumerate(tR):
    if re.match(r'([0-2]?[0-9]h[0-5][0-9]m[0-5][0-9]s)',value)!=None:
        t = datetime.strptime(value, '%Hh%Mm%Ss')
    elif re.match(r'([0-9]d[0-2][0-9]h[0-5][0-9]m)',value)!=None:
        t = datetime.strptime(value, '%dd%Hh%Mm')
        t = t.replace(day = t.day+1) #+1 because h/m/s has 1d that can't be removed
    deltaS = timedelta(days=t.day-1,hours=t.hour, minutes=t.minute, seconds=t.second) #
    sTime = oD[count] + deltaS
    sTime = round1s(sTime)
    sT.append(sTime.isoformat()+'+01:00') #+01:00 to match Google Cal format

    deltaE = timedelta(hours=int(dR[count]))
    eTime = sTime + deltaE
    eTime = round1s(eTime)
    eT.append(eTime.isoformat()+'+01:00')

In [None]:
import pandas as pd
df = pd.DataFrame({'eventN': pd.Series(dtype='str'),
                   'startT': pd.Series(dtype='datetime64[ns]'),
                   'endT': pd.Series(dtype='datetime64[ns]')})

columns = list(df)
data = []
for i in range(len(sR)):
    values = [sR[i], sT[i], eT[i]]
    zipped = zip(columns, values)
    a_dictionary = dict(zipped)
    print(a_dictionary)
    data.append(a_dictionary)

df = df.append(data, True)
df = df.drop_duplicates()

Get Calendar List

In [None]:
from cal_setup import get_calendar_service
calID = 'insert_cal_ID'

data = []
dfCal = pd.DataFrame({'eventN': pd.Series(dtype='str'),
                    'startT': pd.Series(dtype='datetime64[ns]'),
                    'endT': pd.Series(dtype='datetime64[ns]'),
                    'eID': pd.Series(dtype='str')
                    })
columns = list(dfCal)

service = get_calendar_service()
# Call the Calendar API
print('Getting List of 100 events')
events_result = service.events().list(
    calendarId= calID,
    maxResults=100, singleEvents=True,
    orderBy='startTime').execute()
events = events_result.get('items', [])

if not events:
    print('No upcoming events found.')
for event in events:
    start = event['start'].get('dateTime', event['start'].get('date'))
    end = event['end'].get('dateTime', event['end'].get('date'))
    IDe = event['id']
    dfTemp = [event['summary'], start, end, IDe]
    zipped = zip(columns, dfTemp)
    a_dictionary = dict(zipped)
    data.append(a_dictionary)

dfCal = dfCal.append(data, True) #append data to dfCal

Merge arrays while dropping duplicates

In [None]:
dfNew = pd.concat([df,dfCal]) #concat old and input(new) events
dfNew.drop_duplicates(subset=['eventN','startT','endT'],keep=False, inplace=True) #Drop duplicate events
dfNew[dfNew['eID'].isnull()] #drop events already on Google Calendar

Create Event

In [None]:
from cal_setup import get_calendar_service

for index, count in dfNew.iterrows():
    event_result = service.events().insert(calendarId=calID,
        body={
            "summary": count['eventN'],
            "description": '',
            "start": {"dateTime": count['startT'], "timeZone": 'Europe/London'},
            "end": {"dateTime": count['endT'], "timeZone": 'Europe/London'},
        }
    ).execute()

Delete duplicate events from Google Cal

In [None]:
dfDup = dfCal[dfCal.duplicated(subset=['eventN','startT','endT'])]
for index, count in dfDup.iterrows():
    service.events().delete(calendarId=calID, eventId=count['eID']).execute()

Delete Past Events

In [None]:
now = datetime.now()
current_time = now.strftime("%Y-%m-%dT%H:%M:%S+01:00")
for index, count in dfCal.iterrows():
    if current_time > count['endT']:
        service.events().delete(calendarId=calID, eventId=count['eID']).execute()