In [None]:
import cv2
import os
import pytesseract
from fuzzywuzzy import fuzz

import download
from extract import unsharp, process_text
import validations

In [None]:
def load_scores(path="scores.txt"):
    """Read previously recorded pipeline scores from ``path``.

    Each non-blank line is expected to hold four space-separated fields:
    the pipeline string followed by three integers (written by the scoring
    loop as average, minimum and maximum score).

    Parameters
    ----------
    path : str
        Location of the score log.

    Returns
    -------
    list of tuple
        ``(pipeline, avg, min, max)`` records in file order; an empty list
        when the file does not exist.

    Raises
    ------
    IndexError, ValueError
        If a non-blank line has fewer than four fields or a non-integer score.
    """
    if not os.path.isfile(path):
        return []
    records = []
    # The context manager guarantees the handle is closed even if parsing fails.
    with open(path, "r") as handle:
        for line in handle:
            if not line.strip():
                # Tolerate blank lines instead of failing with IndexError.
                continue
            fields = line.split(' ')
            records.append((fields[0], int(fields[1]), int(fields[2]), int(fields[3])))
    return records


# Scores recorded by earlier runs, so that finished pipelines are not re-evaluated.
rate_list = load_scores("scores.txt")
# pset: pipeline strings that are already known (used to dedupe generation).
pset = set(r[0] for r in rate_list)
# proceed: pipeline strings that have already been scored.
proceed = set(pset)

In [None]:
# Candidate preprocessing steps. Every non-empty step gets a trailing ':' so
# that steps can be concatenated into one pipeline string; the empty string
# stands for "no step in this slot".
filters = [
    step + ':' if step else step
    for step in (
        'h_600', 'h_800',
        '*2', '*3', '*4', '*5', '*6',
        'gray', 'unsharp', 'thresh', 'blur',
        '',
    )
]
def is_scale(p):
    """Return True when step ``p`` is a resize step (``h_<height>`` or ``*<factor>``)."""
    return p.startswith(('h_', '*'))

In [None]:
# Build every pipeline of up to three steps that has not been scored yet.
params = []
# Dedupe against a fresh local set seeded with the pipelines that already have
# a score. Re-running this cell therefore regenerates the same pending list
# instead of yielding an empty one (seed with set() to ignore recorded scores).
seen = set(proceed)
for f1 in filters:
    for f2 in filters:
        # The empty step may repeat; a real step may not be applied twice.
        if f2 != '' and f2 == f1: continue
        # At most one resize step per pipeline.
        if is_scale(f1) and is_scale(f2): continue
        for f3 in filters:
            if f3 == f1 or f3 == f2: continue
            if is_scale(f1) and is_scale(f3): continue
            if is_scale(f2) and is_scale(f3): continue
            val = f1+f2+f3
            # Different slot layouts can concatenate to the same string.
            if val in seen: continue
            seen.add(val)
            params.append(val)
# Keep the module-level set of known pipelines in sync, as before.
pset.update(params)

In [None]:
# Number of candidate pipelines collected in `params` by the generation cell.
len(params)

In [None]:
def estimate(ps, video_id, frame, text):
    """Score one preprocessing pipeline on one video frame.

    The frame is fetched with ``download.get_frame``, each step named in
    ``ps`` is applied in order, the result is passed to Tesseract, and the
    recognised text is compared with the expected ``text``.

    Parameters
    ----------
    ps : str
        Colon-separated step names; empty segments are skipped and names that
        match no known step are ignored.
    video_id, frame
        Forwarded unchanged to ``download.get_frame``.
    text : str
        Expected text for the frame.

    Returns
    -------
    The value of ``fuzz.ratio`` between the lower-cased expected text and the
    lower-cased, ``process_text``-cleaned OCR output.
    """
    image = download.get_frame(video_id, frame)

    for step in filter(None, ps.split(':')):
        if step[:2] == 'h_':
            # Resize so the image height becomes the requested number of pixels.
            factor = int(step[2:]) / image.shape[0]
            image = cv2.resize(image, None, fx=factor, fy=factor,
                               interpolation=cv2.INTER_CUBIC)
        elif step[:1] == '*':
            # Resize by an integer factor.
            factor = int(step[1:])
            image = cv2.resize(image, None, fx=factor, fy=factor,
                               interpolation=cv2.INTER_CUBIC)
        elif step == 'gray':
            image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        elif step == 'unsharp':
            image = unsharp(image)
        elif step == 'thresh':
            # cv2.threshold returns a pair; the image is its second element.
            _, image = cv2.threshold(image, 150, 255, cv2.THRESH_BINARY)
        elif step == 'blur':
            image = cv2.medianBlur(image, 3)

    recognised = process_text(pytesseract.image_to_string(image))
    return fuzz.ratio(text.lower(), recognised.lower())

In [None]:
# Score every pending pipeline against all validation samples and append the
# result to scores.txt after each pipeline, so an interrupted run can resume.
best = None
best_score = 0
# rate_list is deliberately not reset here: it may already hold the scores
# loaded from scores.txt.

# Loop-invariant: number of validation samples used for the average.
sample_count = len(validations.data)

for p in params:
    if p in proceed: continue
    total = 0
    # Sentinels: above any value min() will see / below any value max() will see.
    score_min = 101*sample_count
    score_max = -1
    for d in validations.data:
        # Each entry is unpacked into estimate()'s keyword arguments
        # (presumably video_id, frame and text - confirm against validations).
        score = estimate(p, **d)
        if score > 0:
            total += score
        score_min = min(score_min, score)
        score_max = max(score_max, score)
    total = total // sample_count
    print(p, total, score_min, score_max)
    if total > best_score:
        best = p
        best_score = total
    if total > 0:
        rate_list.append((p, total, score_min, score_max))

    # The context manager closes the log even if the write fails.
    with open("scores.txt", "a") as score_log:
        score_log.write("%s %d %d %d\n" % (p, total, score_min, score_max))
    proceed.add(p)

Top 10 pipelines by average score

In [None]:
# Ten records with the highest average score (field 1), best first.
sorted(rate_list, key=lambda rec: rec[1], reverse=True)[:10]

Top 10 pipelines by minimum score (ties broken by average score)

In [None]:
# Ten records with the highest minimum score (field 2), ties broken by the
# higher average score (field 1), best first.
sorted(rate_list, key=lambda rec: (rec[2], rec[1]), reverse=True)[:10]