<a href="https://colab.research.google.com/github/profteachkids/subtitle_generator/blob/main/subtitle_timeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages that are not assumed to be preinstalled.
# biopython provides the `Bio.pairwise2` aligner and bidict the two-way
# word<->id mapping, both imported by the notebook's import cell.
# NOTE(review): omegaconf and torchaudio are not imported anywhere in this
# notebook; presumably they are needed by the silero-models hub code that
# `to_text` loads through `torch.hub.load` -- TODO confirm.
!pip install biopython
!pip install bidict
!pip install omegaconf
!pip install torchaudio

Collecting biopython
  Downloading biopython-1.79-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (2.3 MB)
[K     |████████████████████████████████| 2.3 MB 5.0 MB/s 
Installing collected packages: biopython
Successfully installed biopython-1.79
Collecting bidict
  Downloading bidict-0.21.2-py2.py3-none-any.whl (37 kB)
Installing collected packages: bidict
Successfully installed bidict-0.21.2
Collecting omegaconf
  Downloading omegaconf-2.1.0-py3-none-any.whl (74 kB)
[K     |████████████████████████████████| 74 kB 386 kB/s 
[?25hCollecting PyYAML>=5.1.*
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 8.2 MB/s 
[?25hCollecting antlr4-python3-runtime==4.8
  Downloading antlr4-python3-runtime-4.8.tar.gz (112 kB)
[K     |████████████████████████████████| 112 kB 10.9 MB/s 
[?25hBuilding wheels for collected packages: antlr4-python3-runtime
  Building wheel for antlr4-python3-runtime (setup.py) ... [?25l[?25h

Collecting torchaudio
  Downloading torchaudio-0.9.0-cp37-cp37m-manylinux1_x86_64.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 5.3 MB/s 
Installing collected packages: torchaudio
Successfully installed torchaudio-0.9.0


In [None]:
import numpy as np
from lxml import etree
import subprocess as sp
import os
import torch
import pandas as pd
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
from bidict import bidict
import string
from glob import glob
import re
import shutil

In [None]:
def normalize_script(raw):
    """Turn a raw subtitle script into the list of lower-case words to align.

    Runs of ``, . ! ? -`` that end a word are dropped, whether the word is
    followed by a space, a line break or the end of the script.  Punctuation
    inside a word (for example the hyphen in ``good-bye``) is kept.

    Args:
        raw: The script as one string; line breaks are treated as whitespace.

    Returns:
        A list of lower-cased words in script order.
    """
    # A lookahead is used instead of a literal trailing space so punctuation
    # in front of a newline or at the very end is stripped as well; otherwise
    # "pets." would never compare equal to the recognised word "pets".
    cleaned = re.sub(r"[,.!?-]+(?=\s|$)", " ", raw)
    return cleaned.lower().split()


# Script words in reading order; `align()` reads this module-level list.
text = normalize_script("""after the questionable
events of the last part
where Spadeless lost
his pets""")



In [None]:
# subprocess's own "discard" sentinel: nothing is opened at module level, so
# no file handle is left dangling for the lifetime of the kernel.
DEVNULL = sp.DEVNULL


def ffmpeg_load_audio(
    filename,
    sr=44100,
    mono=False,
    normalize=True,
    in_type=np.int16,
    out_type=np.float32,
):
    """Decode a media file to a NumPy array by piping raw PCM out of ``ffmpeg``.

    Args:
        filename: Path handed to ``ffmpeg -i``.
        sr: Sample rate requested from ffmpeg (``-ar``).
        mono: Request one channel when true, two channels otherwise (``-ac``).
        normalize: For floating ``out_type``, divide by the absolute peak so
            the loudest sample has magnitude 1 (skipped when the peak is 0).
            When false and ``in_type`` is an integer type, the samples are
            divided by that type's maximum value instead.
        in_type: NumPy scalar type of the PCM stream requested from ffmpeg.
            Must be one of float64, float32, int16, int32 or uint32.
        out_type: NumPy scalar type of the returned array.

    Returns:
        ``(audio, sr)``.  ``audio`` is 1-D for mono and has shape
        ``(channels, n_samples)`` otherwise.  ffmpeg's stderr is discarded and
        its exit status is not checked, so a failed decode shows up as an
        empty array rather than as an exception.

    Raises:
        KeyError: If ``in_type`` has no entry in the PCM format table.
    """
    channels = 1 if mono else 2
    format_strings = {
        np.float64: "f64le",
        np.float32: "f32le",
        np.int16: "s16le",
        np.int32: "s32le",
        np.uint32: "u32le",
    }
    format_string = format_strings[in_type]
    command = [
        "ffmpeg",
        "-i",
        filename,
        "-f",
        format_string,
        "-acodec",
        "pcm_" + format_string,
        "-ar",
        str(sr),
        "-ac",
        str(channels),
        "-",
    ]
    bytes_per_sample = np.dtype(in_type).itemsize
    frame_size = bytes_per_sample * channels
    chunk_size = frame_size * sr  # read in 1-second chunks

    proc = sp.Popen(command, stdout=sp.PIPE, stderr=DEVNULL, bufsize=4096)
    # Collect the chunks and join once; repeated `bytes +=` re-copies the
    # whole buffer for every second of audio.
    chunks = []
    try:
        with proc.stdout as stdout:
            for data in iter(lambda: stdout.read(chunk_size), b""):
                chunks.append(data)
    finally:
        # Reap the child so it does not linger as a zombie process.
        proc.wait()
    raw = b"".join(chunks)

    # frombuffer replaces the deprecated binary mode of np.fromstring; the
    # astype() copy makes the result writable for the in-place scaling below.
    audio = np.frombuffer(raw, dtype=in_type).astype(out_type)
    if channels > 1:
        # Interleaved frames -> one row per channel.
        audio = audio.reshape((-1, channels)).transpose()
    if audio.size == 0:
        return audio, sr
    if issubclass(out_type, np.floating):
        if normalize:
            peak = np.abs(audio).max()
            if peak > 0:
                audio /= peak
        elif issubclass(in_type, np.integer):
            audio /= np.iinfo(in_type).max
    return audio, sr

def to_text(f, section_seconds=3):
    """Transcribe a recording with word timings and save them to ``voice_times.csv``.

    The file is decoded at 16 kHz through ``ffmpeg_load_audio``, its first
    channel is cut into sections of roughly ``section_seconds`` seconds, and
    each section is run through the Silero speech-to-text model obtained from
    ``torch.hub`` (CPU, English).  The decoder's per-word ``start_ts`` /
    ``end_ts`` values are treated as sample offsets within the section: the
    section's starting sample is added and the sum divided by the sample rate,
    giving seconds from the start of the recording.

    Args:
        f: Path of the audio/video file to transcribe.
        section_seconds: Approximate length of one model input, in seconds.

    Raises:
        ValueError: If ``section_seconds`` is not positive or no audio could
            be decoded from ``f``.

    Side effects:
        Prints progress and every recognised word, and writes
        ``voice_times.csv`` in the current working directory.
    """
    if section_seconds <= 0:
        raise ValueError("section_seconds must be positive")

    print(f)
    audio, sr = ffmpeg_load_audio(
        f, sr=16000, in_type=np.float32, out_type=np.float32
    )
    if audio.size == 0:
        # Fail before the (large) model download with a message that names
        # the real problem instead of an array_split error further down.
        raise ValueError(f"no audio could be decoded from {f!r}")
    # Keep only the first channel, shaped (1, n_samples) for the model.
    audio = np.expand_dims(audio.astype(np.float32)[0], 0)

    device = torch.device("cpu")
    print('loading')
    model, decoder, _ = torch.hub.load(
        repo_or_dir="snakers4/silero-models",
        model="silero_stt",
        language="en",  # also available 'de', 'es'
        device=device,
    )
    print('finished loading')

    res = []
    batch_size = max(1, int(section_seconds * sr))
    # At least one section, so recordings shorter than `section_seconds`
    # are still transcribed instead of crashing in array_split.
    n_sections = max(1, audio.size // batch_size)
    sections = np.array_split(audio, n_sections, 1)

    section_start = 0
    with torch.no_grad():  # inference only; no autograd bookkeeping needed
        for section in sections:
            batch = torch.from_numpy(section)
            output = model(batch)[0]
            decoded = decoder(output.cpu(), section.size, word_align=True)
            # Word timings arrive as a (string, list-of-dicts) pair; a bare
            # string (which may itself have length 2) carries no timings.
            if not isinstance(decoded, str) and len(decoded) == 2:
                _, dlist = decoded
                for d in dlist:
                    d["start_ts"] = (d["start_ts"] + section_start) / sr
                    d["end_ts"] = (d["end_ts"] + section_start) / sr
                    print(d["word"], d["start_ts"], d["end_ts"])
                    res.append(d)
            # Advance for every section, including ones without words;
            # otherwise all later timestamps would be shifted earlier.
            section_start += section.size

    if res:
        df = pd.DataFrame(res)
    else:
        # Keep the expected columns so the CSV stays readable downstream.
        df = pd.DataFrame(columns=["word", "start_ts", "end_ts"])
    df.to_csv("voice_times.csv")

    voice_words = df["word"].values
    print(" ".join(voice_words))


def align():
    """Align recognised words with the script and save word-index -> time anchors.

    Reads ``voice_times.csv`` (columns ``word``, ``start_ts``, ``end_ts``) and
    the module-level script word list ``text``, maps every distinct word to an
    integer id and globally aligns the two id sequences with
    ``pairwise2.align.globalxx``.  The alignment is cut into groups: a new
    group starts at the first aligned pair and at every exact match, and the
    non-matching pairs that follow are folded into the current group.

    For each group that contains script words, one anchor is stored: the index
    of its first script word and the start time of its first voice word.  A
    closing anchor (one past the last script word of the final group, end time
    of its last voice word) is appended.  The anchors are saved to ``it.npz``
    as arrays ``idx`` and ``time``.

    Raises:
        ValueError: If there are no recognised words or no script words.

    Side effects:
        Prints the aligned pairs, every group and the saved arrays; writes
        ``it.npz`` in the current working directory.
    """
    # keep_default_na=False: pandas would otherwise parse spoken words such as
    # "null" or "nan" as float NaN, which breaks the string handling below.
    df = pd.read_csv("voice_times.csv", keep_default_na=False)
    voice_words = df["word"].astype(str).values
    start_times = df["start_ts"].values
    end_times = df["end_ts"].values

    if len(voice_words) == 0 or len(text) == 0:
        raise ValueError(
            "align() needs at least one recognised word and one script word"
        )

    all_words = set(text).union(set(voice_words))

    word_dict = bidict(zip(all_words, range(len(all_words))))
    # The gap marker maps to itself so aligned sequences decode uniformly.
    word_dict["-"] = "-"
    voice = [word_dict[word] for word in voice_words]
    script = [word_dict[word] for word in text]
    best = pairwise2.align.globalxx(
        voice, script, gap_char=['-'], one_alignment_only=True
    )[0]

    # Walk the alignment while counting how many real (non-gap) words of each
    # sequence have been consumed.  Positions are derived from these counters,
    # so an alignment that begins with a gap on either side cannot shift the
    # indices or run past the end of the timing arrays.
    groups = []
    v_pos = 0  # index of the next unconsumed voice word
    t_pos = 0  # index of the next unconsumed script word
    for v, t in zip(best.seqA, best.seqB):
        v_word = word_dict.inverse[v]
        t_word = word_dict.inverse[t]
        print(v_word, t_word)
        if not groups or v == t:
            groups.append(
                {
                    "v_first": v_pos,
                    "t_first": t_pos,
                    "v_count": 0,
                    "t_count": 0,
                    "voice": [],
                    "text": [],
                }
            )
        group = groups[-1]
        group["voice"].append(v_word)
        group["text"].append(t_word)
        if v_word != '-':
            group["v_count"] += 1
            v_pos += 1
        if t_word != '-':
            group["t_count"] += 1
            t_pos += 1

    idx = []
    times = []
    for group in groups:
        # Only the very first group can lack voice or script words (every
        # later group opens with a match); max(..., 1) keeps its indices valid.
        v_first = group["v_first"]
        v_last = v_first + max(group["v_count"], 1) - 1
        t_start_pos = group["t_first"]
        t_end_pos = t_start_pos + max(group["t_count"], 1) - 1
        start_time = start_times[v_first]
        end_time = end_times[v_last]

        print('-'*30)
        print(" ".join(group["voice"]))
        print(" ".join(group["text"]))
        print(t_start_pos, t_end_pos, start_time, end_time)

        # A group without script words would repeat the next group's index,
        # so it contributes no anchor.
        if group["t_count"]:
            idx.append(t_start_pos)
            times.append(start_time)

    # Closing anchor taken from the final group.
    idx.append(t_end_pos + 1)
    times.append(end_time)

    np.savez('it', idx=np.array(idx), time=np.array(times))
    it = np.load('it.npz')
    print(it['idx'], len(it['idx']))
    print(it['time'], len(it['time']))


def make_copies(
    path_in='E:\\Blender\\BlenderOut\\',
    path_out='E:\\Blender\\BlenderVideo\\',
    fps=10,
):
    """Expand rendered subtitle images into a numbered frame sequence.

    Every ``*.png`` in ``path_in`` must be named
    ``<name>_<subtitle_n>_<w_start>_<w_end>.png``.  The word indices are
    converted to times by interpolating the anchors stored in ``it.npz``, the
    times to frame numbers at ``fps``, and the image is copied
    ``f_end - f_start + 1`` times into ``path_out``.  Copies are named by a
    running four-digit counter whose digits 0-9 are replaced by letters a-j.

    NOTE(review): the counter is contiguous across images, so a time gap
    between two subtitles is not represented in the output sequence --
    confirm that is intended.

    Args:
        path_in: Directory containing the rendered subtitle images.
        path_out: Existing directory that receives the copies.
        fps: Frames per second of the generated sequence.

    Raises:
        ValueError: If a file name does not follow the expected pattern.
    """
    it = np.load('it.npz')
    idx = it['idx']
    times = it['time']
    print(times, len(times))

    subtitle_images = glob(os.path.join(path_in, '*.png'))
    print(subtitle_images)

    # Parse only the base name: splitting the whole path on '_' breaks as soon
    # as any directory contains an underscore.  rsplit keeps underscores that
    # belong to <name>.
    parsed = []
    for subtitle_image in subtitle_images:
        stem, _ = os.path.splitext(os.path.basename(subtitle_image))
        name, subtitle_n, w_start, w_end = stem.rsplit('_', 3)
        parsed.append(
            (int(w_start), int(w_end), int(subtitle_n), name, subtitle_image)
        )

    # glob returns files in no guaranteed order (and '_10_' sorts before '_2_'
    # lexicographically); the frame counter below needs timeline order.
    parsed.sort(key=lambda item: item[:3])

    i = 0
    for w_start, w_end, subtitle_n, name, subtitle_image in parsed:
        t_start, t_end = np.interp([w_start, w_end], idx, times)
        f_start, f_end = int(t_start * fps), int(t_end * fps)
        print(name, w_start, w_end, t_start, t_end, f_start, f_end)
        for _ in range(f_end - f_start + 1):
            frame_name = f'{i:04d}'.translate(
                str.maketrans('0123456789', 'abcdefghij')
            )
            shutil.copyfile(
                subtitle_image, os.path.join(path_out, frame_name + '.png')
            )
            i += 1


def set_timeline(
    timeline_in='Timeline 1.xml',
    timeline_out='subtitle_timeline.xml',
    fps=60,
):
    """Rewrite clip start/end frames in a timeline XML from the saved anchors.

    For every ``clipitem`` under a ``video`` element, the clip's ``name`` is
    parsed as ``<name>_<subtitle_n>_<w_start>_<w_end>.<ext>`` (an optional
    leading directory part is ignored).  The two word indices are converted to
    seconds by interpolating the ``idx`` / ``time`` arrays stored in
    ``it.npz``, multiplied by ``fps``, truncated to integers and written into
    the clip's ``start`` and ``end`` elements.  The modified tree is saved to
    ``timeline_out``.

    Args:
        timeline_in: Path of the timeline XML to read.
        timeline_out: Path of the XML file to write.
        fps: Frames per second used to convert seconds to frame numbers.

    Raises:
        ValueError: If a clip name does not follow the expected pattern.
        IndexError: If a clip item lacks a ``name``, ``start`` or ``end`` child.
    """
    it = np.load('it.npz')
    idx = it['idx']
    times = it['time']

    tree = etree.parse(timeline_in)
    for clip in tree.xpath('//video//clipitem'):
        clip_name = clip.xpath('name/text()')[0]
        start = clip.xpath('start')[0]
        end = clip.xpath('end')[0]

        # Drop any directory part (either separator style) and the extension,
        # then split from the right so underscores inside <name> are kept.
        base_name = re.split(r'[\\/]', str(clip_name))[-1]
        stem, _ = os.path.splitext(base_name)
        _name, subtitle_n, w_start, w_end = stem.rsplit('_', 3)
        w_start, w_end, subtitle_n = int(w_start), int(w_end), int(subtitle_n)

        t_start, t_end = np.interp([w_start, w_end], idx, times)
        start.text = str(int(t_start * fps))
        end.text = str(int(t_end * fps))

    # The context manager closes the file even if serialisation fails.
    with open(timeline_out, 'wb') as out_file:
        out_file.write(etree.tostring(tree, pretty_print=True))




In [None]:
# Run the pipeline end to end. Each step hands its result to the next through
# a file in the working directory:
#   to_text      -> writes voice_times.csv (recognised words with timings)
#   align        -> reads voice_times.csv, writes it.npz (word index -> time)
#   set_timeline -> reads it.npz and 'Timeline 1.xml', writes subtitle_timeline.xml
# make_copies() is defined above but is not invoked here.
to_text("audio.mov")
align()    
set_timeline()

audio.mov


Downloading: "https://github.com/snakers4/silero-models/archive/master.zip" to /root/.cache/torch/hub/master.zip


loading


HBox(children=(FloatProgress(value=0.0, max=117375227.0), HTML(value='')))


finished loading
after 0.118656875 0.43507437499999996
the 0.5932837500000001 0.8305968749999999
questionable 0.8305968749999999 1.423880625
events 1.423880625 1.7798506250000001
of 1.858955 2.05671625
the 2.05671625 2.2544775
last 2.2544775 2.4917912500000003
part 2.4917912500000003 2.729104375
were 2.887313125 3.124626875
spedless 3.124626875 3.6783581250000004
moss 3.7574625 4.152984999999999
his 4.152984999999999 4.39029875
pets 4.4694031249999995 4.864925625
after the questionable events of the last part were spedless moss his pets
the the
questionable questionable
events events
of of
the the
last last
part part
were -
spedless -
moss -
- where
- spadeless
- lost
his his
pets pets
------------------------------
after
after
0 0 0.118656875 0.435074375
------------------------------
the
the
1 1 0.5932837500000001 0.8305968749999999
------------------------------
questionable
questionable
2 2 0.8305968749999999 1.423880625
------------------------------
events
events
3 3 1.423880625

  return forward_call(*input, **kwargs)
