In [1]:
import cv2
import os
import pytesseract
from fuzzywuzzy import fuzz

import download
from extract import unsharp, process_text
import validations

In [2]:
def load_scores(path):
    """Read previously saved pipeline scores from ``path``.

    Every non-blank line is expected to look like ``"<pipeline> <avg> <min> <max>"``,
    i.e. the format the evaluation loop appends to scores.txt.

    Returns a list of ``(pipeline, avg, min, max)`` tuples with the three scores
    converted to int; the list is empty when ``path`` is not an existing file.
    """
    if not os.path.isfile(path):
        return []
    # The context manager closes the handle even when a line fails to parse.
    with open(path, "r") as scores_file:
        # Split on a single space (not arbitrary whitespace) so the field positions
        # match what was written; int() tolerates the trailing newline on the last field.
        rows = [line.split(' ') for line in scores_file if line.strip()]
    return [(row[0], int(row[1]), int(row[2]), int(row[3])) for row in rows]


# Results carried over from earlier runs (empty on a first run).
rate_list = load_scores("scores.txt")
# Pipeline strings that are already known ...
pset = set(r[0] for r in rate_list)
# ... and the ones that have already been scored and must not be evaluated again.
proceed = set(pset)

In [3]:
# Candidate preprocessing steps. Every real step carries a trailing ':' so that steps can be
# concatenated directly into a pipeline string; the final '' entry stands for "no step".
filters = [
    name + ':'
    for name in (
        'h_600', 'h_800',
        '*1', '*2', '*3', '*4', '*5', '*6',
        'gray', 'unsharp', 'thresh', 'blur',
    )
] + ['']
def is_scale(p):
    """Return True when step name ``p`` is a resize step ('h_<N>' or '*<K>')."""
    return p.startswith(('h_', '*'))

In [4]:
# Build every ordered pipeline of up to three distinct steps, with at most one resize
# step, that has not been scored yet.
params = []
# Work on a local set seeded from the already-scored pipelines instead of mutating `pset`
# while filtering on it: that way re-running this cell rebuilds the same list rather than
# producing an empty one.
seen = set(proceed)
for f1 in filters:
    for f2 in filters:
        if f2 != '' and f2 == f1: continue
        if is_scale(f1) and is_scale(f2): continue
        for f3 in filters:
            if f3 == f1 or f3 == f2: continue
            if is_scale(f1) and is_scale(f3): continue
            if is_scale(f2) and is_scale(f3): continue
            val = f1+f2+f3
            # The same string can be reached through different placements of the empty
            # step, so keep only its first occurrence.
            if val in seen: continue
            seen.add(val)
            params.append(val)
# Keep `pset` as the set of all known pipeline strings, as before.
pset.update(seen)

In [5]:
# Number of pipelines queued for evaluation.
len(params)

400

In [6]:
def estimate(ps, video_id, frame, text):
    """Score one preprocessing pipeline against a single validation sample.

    ``ps`` is a ':'-separated sequence of step names; empty names are skipped and the
    remaining steps are applied, in order, to the image returned by
    ``download.get_frame(video_id, frame)``. Recognised steps:

    * ``h_<N>``   - resize by the factor ``N / current height`` (same factor on both axes)
    * ``*<K>``    - resize by the integer factor ``K``
    * ``gray``    - BGR to grayscale conversion
    * ``unsharp`` - ``unsharp`` helper from the ``extract`` module
    * ``thresh``  - binary threshold at 150
    * ``blur``    - median blur with kernel size 3

    Any other step name is silently ignored.

    Returns ``fuzz.ratio`` between the lowercased ``text`` and the lowercased OCR result
    after it has been passed through ``process_text``.
    """
    picture = download.get_frame(video_id, frame)
    for step in (name for name in ps.split(':') if name):
        by_height = step.startswith('h_')
        if by_height or step.startswith('*'):
            # Both resize flavours differ only in how the factor is derived.
            if by_height:
                factor = int(step[2:]) / picture.shape[0]
            else:
                factor = int(step[1:])
            picture = cv2.resize(
                picture, None, fx=factor, fy=factor, interpolation=cv2.INTER_CUBIC
            )
        elif step == 'gray':
            picture = cv2.cvtColor(picture, cv2.COLOR_BGR2GRAY)
        elif step == 'unsharp':
            picture = unsharp(picture)
        elif step == 'thresh':
            # cv2.threshold returns (retval, image); only the image is needed.
            _, picture = cv2.threshold(picture, 150, 255, cv2.THRESH_BINARY)
        elif step == 'blur':
            picture = cv2.medianBlur(picture, 3)
    recognised = process_text(pytesseract.image_to_string(picture))
    return fuzz.ratio(text.lower(), recognised.lower())

In [7]:
# Best pipeline found during this run (results loaded from scores.txt are not considered).
best = None
best_score = 0

sample_count = len(validations.data)

for p in params:
    # Skip pipelines that were already scored, in this or in an earlier run.
    if p in proceed: continue
    total = 0
    # Sentinel that is larger than any score expected from estimate().
    score_min = 101*sample_count
    score_max = -1
    for d in validations.data:
        # Each validation entry supplies the remaining keyword arguments of estimate().
        score = estimate(p, **d)
        if score > 0:
            total += score
        score_min = min(score_min, score)
        score_max = max(score_max, score)
    # Integer average over all validation samples.
    total = total // sample_count
    print(p, total, score_min, score_max)
    if total > best_score:
        best = p
        best_score = total
    if total > 0:
        rate_list.append((p, total, score_min, score_max))

    # Persist the result immediately so an interrupted run can resume from scores.txt.
    # The context manager closes the handle even if the write raises.
    with open("scores.txt", "a") as f:
        f.write("%s %d %d %d\n" % (p, total, score_min, score_max))
    proceed.add(p)

[youtube] Extracting URL: https://www.youtube.com/watch?v=pQM9H_hf7a4
[youtube] pQM9H_hf7a4: Downloading webpage
[youtube] pQM9H_hf7a4: Downloading ios player API JSON
[youtube] pQM9H_hf7a4: Downloading android player API JSON
[youtube] pQM9H_hf7a4: Downloading m3u8 information
[info] pQM9H_hf7a4: Downloading 1 format(s): 18
[download] Destination: download\pQM9H_hf7a4.mp4
[download] 100% of  253.39MiB in 00:01:16 at 3.32MiB/s     
h_600:gray:unsharp: 32 32 33
h_600:gray:thresh: 25 0 38
h_600:gray:blur: 26 14 32
h_600:gray: 32 31 33
h_600:unsharp:gray: 32 32 33
h_600:unsharp:thresh: 15 0 31
h_600:unsharp:blur: 73 71 75
h_600:unsharp: 54 33 66
h_600:thresh:gray: 0 0 0
h_600:thresh:unsharp: 0 0 0
h_600:thresh:blur: 0 0 0
h_600:thresh: 0 0 0
h_600:blur:gray: 32 32 32
h_600:blur:unsharp: 73 71 77
h_600:blur:thresh: 0 0 0
h_600:blur: 54 51 58
h_800:gray:unsharp: 41 33 54
h_800:gray:thresh: 7 0 22
h_800:gray:blur: 32 32 33
h_800:gray: 33 33 33
h_800:unsharp:gray: 41 33 56
h_800:unsharp:thres

Top 10 pipelines by average score

In [8]:
# Ten highest average scores first; ties keep their order in rate_list.
sorted(rate_list, key=lambda entry: entry[1], reverse=True)[:10]

[('unsharp:h_800:blur:', 79, 72, 92),
 ('h_600:unsharp:blur:', 73, 71, 75),
 ('h_600:blur:unsharp:', 73, 71, 77),
 ('h_800:unsharp:blur:', 73, 73, 74),
 ('h_800:blur:unsharp:', 72, 72, 73),
 ('gray:*3:blur:', 72, 63, 78),
 ('unsharp:blur:*2:', 72, 68, 75),
 ('*3:gray:blur:', 71, 65, 77),
 ('*4:blur:gray:', 71, 63, 77),
 ('*5:unsharp:blur:', 71, 68, 74)]

Top 10 pipelines by minimum score (ties broken by average score)

In [9]:
# Ten highest minimum scores first, higher average score breaking ties.
sorted(rate_list, key=lambda entry: (entry[2], entry[1]), reverse=True)[:10]

[('h_800:unsharp:blur:', 73, 73, 74),
 ('unsharp:h_800:blur:', 79, 72, 92),
 ('h_800:blur:unsharp:', 72, 72, 73),
 ('h_600:unsharp:blur:', 73, 71, 75),
 ('h_600:blur:unsharp:', 73, 71, 77),
 ('*5:', 71, 70, 74),
 ('blur:unsharp:*4:', 71, 69, 73),
 ('*1:unsharp:blur:', 70, 69, 72),
 ('unsharp:*1:blur:', 70, 69, 72),
 ('unsharp:blur:*1:', 70, 69, 72)]