In [26]:
import moviepy as mp
import pytesseract
import cv2
import numpy as np
import matplotlib.pyplot as plt
import warnings

# Tell pytesseract where the Tesseract executable lives.
# NOTE(review): this is a hard-coded Windows install path; it has to be edited
# on any machine where Tesseract is installed elsewhere.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def get_monochrome(img):
    """Collapse a colour image to a single channel.

    The input is converted to float, averaged over axis 2 (the channel axis
    of an ``(rows, cols, channels)`` array) and divided by 255.

    Parameters
    ----------
    img : ndarray with at least 3 dimensions.
        Presumably 8-bit pixel data, so the result lands in [0, 1] -- confirm
        against the frame source.

    Returns
    -------
    ndarray of float with axis 2 removed.
    """
    as_float = img.astype(float)
    channel_mean = as_float.mean(axis=2)
    return channel_mean / 255

def isolate_window(img):
    """Blank out everything outside a fixed quadrilateral region of ``img``.

    A pixel at (row, col) is kept only if it satisfies all four linear
    inequalities below; every other pixel is replaced with NaN.

    Parameters
    ----------
    img : 2-D ndarray
        The line coefficients are hard-coded pixel coordinates (presumably
        tuned for 1920x1080 frames -- confirm before using other sizes).

    Returns
    -------
    ndarray with the same shape as ``img``; NaN outside the region.
    """
    col = np.arange(img.shape[1])[np.newaxis, :]
    row = np.arange(img.shape[0])[:, np.newaxis]

    # Each boundary is expressed as "row index compared with a line in col".
    past_first_edge = row > (col * (-1/120)) + 169.5
    before_second_edge = row < (col * (-16)) + 27995
    before_row_limit = row < 1035
    before_fourth_edge = row < (col * 12.75) - 2127

    inside = (past_first_edge
              & before_second_edge
              & before_row_limit
              & before_fourth_edge)
    return np.where(inside, img, np.nan)

def isolate_window_dashh(img):
    """Blank out everything outside the region used for 'dash_helpers'.

    Same approach as ``isolate_window`` but with a different set of
    hard-coded boundary lines: a pixel at (row, col) is kept only if it
    satisfies all four inequalities below, otherwise it becomes NaN.

    Parameters
    ----------
    img : 2-D ndarray
        Coefficients are fixed pixel coordinates (presumably fitted to one
        specific recording layout -- confirm before reuse).

    Returns
    -------
    ndarray with the same shape as ``img``; NaN outside the region.
    """
    col = np.arange(img.shape[1])[np.newaxis, :]
    row = np.arange(img.shape[0])[:, np.newaxis]

    past_first_edge = row > (col * (-0.00633312)) + 301.60861305
    before_second_edge = row < (col * (-43.83333333)) + 74061.5
    before_row_limit = row < 1079
    before_fourth_edge = row < (col * 25.96666667) - 6295.53333333

    inside = (past_first_edge
              & before_second_edge
              & before_row_limit
              & before_fourth_edge)
    return np.where(inside, img, np.nan)

def slight_blur(img, n=5):
    """NaN-aware box blur.

    Each non-NaN pixel is replaced by the mean of the non-NaN pixels in the
    ``(2n+1) x (2n+1)`` window centred on it. Pixels that are NaN in the
    input stay NaN in the output.

    Parameters
    ----------
    img : 2-D float ndarray, may contain NaN.
    n : int, half-width of the window (passed to ``surrounding_cells_stats``
        as the pad size).

    Returns
    -------
    ndarray with the same shape as ``img``.
    """
    missing = np.isnan(img)
    # Sum of valid neighbours, and how many valid neighbours there were.
    neighbour_total = surrounding_cells_stats(img, n, np.nansum)
    neighbour_count = surrounding_cells_stats(~missing, n, np.nansum)
    # Clamp the count to 1 so an empty neighbourhood divides 0 by 1, not by 0.
    local_mean = neighbour_total / np.maximum(neighbour_count, 1.)
    return np.where(missing, np.nan, local_mean)

# def fix_contrast(img):
#     loc_max = surrounding_cells_stats(img, 50, np.nanmax)
#     return np.where(1 - ((loc_max - img) / np.nanmax(loc_max - img)) < 0.7, 0, 1)

def fix_contrast(img):
    """Locally normalise brightness and stretch contrast.

    Steps (all neighbourhoods are the 51x51 window given by pad size 25):

    1. Divide every pixel by its local maximum.
    2. Shift every pixel so the local maximum becomes 1.
    3. Where the local range (max - min) is at least 0.2, stretch the
       distance below 1 by ``1 / range``; elsewhere keep the shifted value.
    4. Replace any remaining NaN with 1.

    Parameters
    ----------
    img : 2-D ndarray, may contain NaN. The caller's array is NOT modified.

    Returns
    -------
    New float ndarray with the same shape as ``img`` and no NaN values.
    """
    # Divide out-of-place: the previous `img /= loc_max` silently rewrote the
    # caller's array (and raised for integer arrays, which cannot hold the
    # float result in place).
    loc_max = surrounding_cells_stats(img, 25, np.nanmax)
    # A zero local maximum gives 0/0 = NaN, which step 4 maps to 1; silence the
    # warning because that outcome is already handled.
    with np.errstate(divide='ignore', invalid='ignore'):
        img = img / loc_max

    loc_max = surrounding_cells_stats(img, 25, np.nanmax)
    loc_min = surrounding_cells_stats(img, 25, np.nanmin)
    img = img + 1 - loc_max
    # Floor the range at 0.001 so flat neighbourhoods do not divide by zero.
    imgc = 1 - ((1-img) * (1/np.maximum(loc_max-loc_min, 0.001)))

    # Low-contrast neighbourhoods keep the shifted value; NaN comparisons are
    # False, so NaN pixels take `imgc` (also NaN) and end up as 1.
    return np.nan_to_num(np.where(loc_max-loc_min < 0.2, img, imgc), nan=1.)

def surrounding_cells_stats(df, padsize, func):
    """Apply a reduction over the square neighbourhood of every cell.

    For each cell of the 2-D array ``df`` the reduction ``func`` is evaluated
    over the ``(2*padsize+1) x (2*padsize+1)`` window centred on it. The
    array is padded with NaN first, so windows that hang over the border see
    NaN there.

    The window is reduced in two passes (down the rows, then along the
    columns), so ``func`` must give the same answer when applied that way as
    when applied to the whole window at once. That holds for ``np.nanmin``
    and ``np.nanmax``, and for ``np.nansum`` (which this file also passes in).

    Parameters
    ----------
    df : 2-D ndarray (cast to float internally).
    padsize : int, half-width of the window.
    func : callable accepting ``(array, axis=...)``; should ignore NaN.

    Returns
    -------
    ndarray with the same shape as ``df``.

    Notes
    -----
    Each pass materialises an intermediate with an extra axis of length
    ``2*padsize+1``, so memory use grows with the window size.
    RuntimeWarnings raised by ``func`` (e.g. all-NaN slices) are suppressed.
    """
    window = np.arange((padsize * 2) + 1)
    padded = np.pad(df.astype(float), padsize, constant_values=np.nan)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)

        # Pass 1: for every original row, gather the rows above/below it
        # (all padded columns are kept for the second pass).
        row_idx = np.arange(df.shape[0])[:, None, None] + window[None, None, :]
        col_idx = np.arange(padded.shape[1])[None, :, None]
        by_rows = func(padded[row_idx, col_idx], axis=2)

        # Pass 2: for every original column, gather the columns left/right.
        row_idx = np.arange(by_rows.shape[0])[:, None, None]
        col_idx = np.arange(df.shape[1])[None, :, None] + window[None, None, :]
        result = func(by_rows[row_idx, col_idx], axis=2)

    return result

def get_words_from_frame(img):
    """Run Tesseract OCR on a single-channel image and return the text.

    The image is scaled by 255, replicated into three identical channels,
    written to ``output/modified_still.png`` and that file is handed to
    pytesseract.

    Parameters
    ----------
    img : 2-D float ndarray, expected to hold values in [0, 1].

    Returns
    -------
    str : whatever ``pytesseract.image_to_string`` recognises.

    Raises
    ------
    OSError
        If OpenCV could not write the intermediate PNG (for example because
        the ``output`` directory does not exist).
    """
    still_path = 'output/modified_still.png'

    # Write 8-bit pixels explicitly. `astype(int)` yields int64 on most
    # platforms, which is not an image depth OpenCV can encode. Clipping
    # first keeps out-of-range values from wrapping around in uint8.
    pixels = np.clip(img * 255, 0, 255).astype(np.uint8)
    frame = np.stack([pixels] * 3, axis=2)

    # imwrite reports failure by returning False rather than raising; without
    # this check OCR would run on a stale file from an earlier frame.
    if not cv2.imwrite(still_path, frame):
        raise OSError(f'could not write {still_path}')

    return pytesseract.image_to_string(still_path)

def rmds(x):
    """Collapse every run of consecutive newlines in ``x`` to one newline."""
    # A single replace only halves a run, so repeat until no pair is left.
    while '\n\n' in x:
        x = x.replace('\n\n', '\n')
    return x

def fixquotes(x):
    """Normalise typographic quote characters to plain ASCII quotes.

    Curly double quotes and the trademark sign become ``"``; curly single
    quotes become ``'``. (The trademark sign is presumably an OCR misreading
    of a closing quote -- confirm.)
    """
    replacements = {
        '“': '"',
        '”': '"',
        '™': '"',
        '’': "'",
        '‘': "'",
    }
    return x.translate(str.maketrans(replacements))

def to_utf(x):
    """Replace every non-ASCII character (code point >= 128) with a space.

    Despite the name, the result is pure ASCII, not merely UTF-8.
    """
    return ''.join(ch if ord(ch) < 128 else ' ' for ch in x)

In [27]:
def _collect_frame_texts(video, isolate):
    """OCR the frames of ``video`` that differ noticeably from the previous one.

    Frames are sampled at t = 0, 1, 2, ... while t < ``video.duration``.
    A frame is OCR'd when it is the first one, or when more than 5% of the
    blurred pixels changed by more than 0.02 relative to the previous sample.
    Only OCR results of at least 4 characters are kept.
    """
    texts = []
    imgb_prev = None
    tt = 0
    while tt < video.duration:
        img = isolate(get_monochrome(video.to_ImageClip(t=tt).get_frame(0)))
        imgb = slight_blur(img)
        if imgb_prev is None or np.mean(np.abs(imgb - imgb_prev) > 0.02) > 0.05:
            wds = get_words_from_frame(fix_contrast(img))
            if len(wds) >= 4:
                texts.append(wds)
        imgb_prev = imgb
        tt += 1
    return texts


def _dominant_offset(txt1, txt2):
    """Estimate where ``txt2`` starts inside ``txt1``.

    Every 4-character slice from the first 200 characters of ``txt2`` is
    looked up in ``txt1``; the offset ``position_in_txt1 - position_in_txt2``
    that occurs most often wins. Returns that offset as an int, or ``None``
    when misses are at least as frequent as the best offset (or when ``txt2``
    is too short to yield any slice).
    """
    offsets = []
    misses = 0
    prev = None
    for i in range(min(200, len(txt2)) - 3):
        chunk = txt2[i:i+4]
        if prev is not None and chunk == txt1[i+prev:i+prev+4]:
            # Still aligned with the previous offset; keep it.
            current = prev
        else:
            pos = txt1.find(chunk)
            current = pos - i if pos >= 0 else None
        if current is None:
            misses += 1
        else:
            offsets.append(current)
        prev = current

    # No slices at all, or none matched: np.argmax would fail on empty input.
    if not offsets:
        return None
    values, counts = np.unique(offsets, return_counts=True)
    best = np.argmax(counts)
    if misses >= counts[best]:
        return None
    return int(values[best])


def _stitch_texts(texts):
    """Join per-frame OCR texts, dropping the part each frame shares with the next.

    For consecutive frames, only the part of the earlier text that precedes
    the start of the later one is emitted. When no overlap is found the whole
    earlier text is emitted, followed by a line of 120 dashes. The last frame
    is always emitted in full. Returns '' for an empty list.
    """
    cleaned = [fixquotes(rmds(t)) for t in texts]
    if not cleaned:
        return ''

    parts = []
    for txt1, txt2 in zip(cleaned, cleaned[1:]):
        overlap = _dominant_offset(txt1, txt2)
        if overlap is None:
            parts.append(txt1)
            parts.append(f"{'-'*120}\n")
        else:
            # A negative offset means txt2 starts before txt1 does, so nothing
            # of txt1 precedes it. Clamp to 0 rather than slicing from the end
            # of txt1, which would emit text that txt2 repeats anyway.
            parts.append(txt1[:max(overlap, 0)])
    parts.append(cleaned[-1])
    return ''.join(parts)


def main(video_fname):
    """Transcribe the on-screen text of one video into ``output/<name>.txt``.

    Opens ``../../../Effo Code Videos/<video_fname>.mp4``, OCRs the frames
    that changed, stitches the per-frame texts into one document and writes
    it (with non-ASCII characters replaced by spaces) to
    ``output/<video_fname>.txt``.

    Parameters
    ----------
    video_fname : str
        File name without extension. ``'dash_helpers'`` uses
        ``isolate_window_dashh`` to crop; every other name uses
        ``isolate_window``.

    Returns
    -------
    None

    Notes
    -----
    Videos that yield zero or one OCR'd frame are handled: the output file
    is then empty or holds that single frame's text.
    """
    if video_fname == 'dash_helpers':
        isolate = isolate_window_dashh
    else:
        isolate = isolate_window

    video = mp.VideoFileClip(f'../../../Effo Code Videos/{video_fname}.mp4')
    try:
        text_from_frames = _collect_frame_texts(video, isolate)
    finally:
        # Release the reader even if OCR fails part-way through.
        video.close()

    fulltxt = _stitch_texts(text_from_frames)

    with open(f'output/{video_fname}.txt', 'w') as f:
        f.write(to_utf(fulltxt))

In [28]:
# Transcribe each recording in turn; main() writes one text file per video.
for video_name in ('numpy_helpers', 'corebcc_faq_app', 'dash_helpers', 'helpers_etc'):
    main(video_name)

{'video_found': True, 'audio_found': True, 'metadata': {'major_brand': 'isom', 'minor_version': '512', 'compatible_brands': 'isomiso2avc1mp41', 'encoder': 'https://clipchamp.com', 'comment': 'Create videos with https://clipchamp.com/en/video-editor - free online video editor, video compressor, video converter.'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': None, 'default': True, 'size': [1920, 1080], 'bitrate': 20094, 'fps': 30.0, 'codec_name': 'h264', 'profile': '(Main)', 'metadata': {'Metadata': '', 'handler_name': 'VideoHandler', 'vendor_id': '[0][0][0][0]'}}, {'input_number': 0, 'stream_number': 1, 'stream_type': 'audio', 'language': None, 'default': True, 'fps': 48000, 'bitrate': 192, 'metadata': {'Metadata': '', 'handler_name': 'SoundHandler', 'vendor_id': '[0][0][0][0]'}}], 'input_number': 0}], 'duration': 186.4, 'bitrate': 20292, 'start': 0.0, 'default_video_input_number': 0, 'default_video_stream_number': 0, 'video_codec_