## Rename ripped DVD tracks with the correct season-episode pairing

In [None]:
import base64
import difflib
from glob import glob
import hashlib
import json
import os
import shutil
import subprocess
import time
import zipfile

from bs4 import BeautifulSoup as bs
import torch
import requests
import torchaudio

In [None]:
# Root directory that is walked recursively to find the ripped video files.
VIDEO_ROOT = "/mnt/storage/Series/"
# Default directory in which the extracted audio clips (.wav) are stored.
AUDIO_ROOT = "/home/neil/temp/mkv_audio"
# JSON file the scraped episode labels are written to and later read back from.
subtitles_file = "/mnt/storage/Series/subitles.json"

# Download the intro subtitles

In [None]:
# Scrape the episode list (season/episode code, title and the text stored in
# each topic link's `title` attribute) from the forum index pages and save it
# as JSON in `subtitles_file`.
print("Manually set the url for the correct series")
print("Manually set the range end value to make sure to get all the pages.")
requests_end_idx = 175
series_url_root = "https://transcripts.foreverdreaming.org/viewforum.php?f=-1"

results = []
completed = set()  # episode codes already collected, used to detect repeated pages
# The index is paged through the `start` offset in steps of 25
# (presumably 25 topics per page -- confirm against the site).
for offset in range(0, requests_end_idx, 25):
    done = False
    r = requests.get(f"{series_url_root}&start={offset}", timeout=30)
    # Fail loudly instead of silently parsing an HTTP error page.
    r.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = bs(r.text, "html.parser")
    for item in soup.find_all("h3"):
        title = item.text
        # Only headings that start with two digits (e.g. "01x05 - Name") are episodes.
        if title[:2].isnumeric():
            title_split = title.split(" - ")
            ep = title_split[0]
            name = " - ".join(title_split[1:])
            # "01x05" -> "01E05"; "/" (multi-part episodes) -> "&" so the
            # code can later be used inside a file name.
            ep = ep.replace("x", "E").replace("/", "&")
            print(f"S{ep} {name}")
            if ep in completed:
                # An episode seen before means the listing started repeating
                # (requested offset past the last page): stop scraping.
                done = True
                break
            completed.add(ep)
            results.append(
                {
                    "se": "S" + ep,
                    "title": name,
                    # NOTE(review): assumes the first child of the <h3> is the
                    # topic link carrying a `title` attribute -- confirm.
                    "text": item.contents[0].attrs['title'].strip('.'),
                }
            )
    if done:
        break
    time.sleep(1)  # rate limit it

with open(subtitles_file, 'w') as fp:
    json.dump(results, fp, indent=2)

# Find all the videos and extract the first 15 seconds of audio

In [None]:
class VideoAudio:
    """A video file paired with a short audio clip cut from its start.

    Attributes:
        video: Path of the source video file.
        audio: Path of the extracted ``.wav`` clip. The file name is the
            base32-encoded SHA-256 digest of the video path.
        text: Raw transcript of the clip, or ``None`` until ``set_text`` is called.
        clean_text: Lower-cased transcript with punctuation removed, or
            ``None`` until ``set_text`` is called.
        text_clean: Alias of ``clean_text``, kept because existing code reads
            the attribute under this name.
        canidates: List of episode-label dicts considered matches for this video.
    """

    text = None
    clean_text = None
    text_clean = None

    def __init__(self, video_file, audio_temp_dir=AUDIO_ROOT):
        """Record the paths and extract the audio clip if it is not present yet.

        Args:
            video_file: Path of the video file.
            audio_temp_dir: Directory in which the audio clip is stored.
        """
        self.video = video_file
        digest = hashlib.sha256(self.video.encode()).digest()
        self.audio = os.path.join(audio_temp_dir, base64.b32encode(digest).decode() + ".wav")
        self.canidates = []

        self.create_audio()

    def create_audio(self):
        """Extract the first 15 seconds of audio as a single-channel wav file.

        Does nothing when the clip already exists. A failing or missing
        ffmpeg is reported on stdout and does not raise.
        """
        if os.path.isfile(self.audio):
            return

        # ffmpeg does not create missing output directories.
        os.makedirs(os.path.dirname(self.audio) or ".", exist_ok=True)

        # An argument list (no shell) keeps file names containing quotes or
        # other shell metacharacters from breaking the command.
        command = ["ffmpeg", "-i", self.video, "-t", "00:00:15.0", "-ac", "1", self.audio]
        try:
            completed = subprocess.run(command, check=False)
        except OSError as err:
            print(f"Could not run ffmpeg for {self.video}: {err}")
            return
        if completed.returncode != 0:
            print(f"ffmpeg exited with status {completed.returncode} for {self.video}")

    def set_text(self, text):
        """Store the transcript together with a normalised copy used for matching.

        Args:
            text: Transcript of the audio clip.
        """
        self.text = text
        # Lower-case and drop ' " , . - ! ? in a single pass.
        self.clean_text = text.lower().translate(str.maketrans("", "", "'\",.-!?"))
        # Keep the alias in sync so both attribute names report the same state.
        self.text_clean = self.clean_text

    def to_dict(self):
        """Return the instance state as a plain dict."""
        return {
            "video": self.video,
            "audio": self.audio,
            "text": self.text,
            "clean_text": self.clean_text,
            "canidates": self.canidates,
        }
    

In [None]:
# Walk VIDEO_ROOT and wrap every matching .mkv file in a VideoAudio, which
# extracts its audio clip on construction.
videos = []
for root, _, fnames in os.walk(VIDEO_ROOT):
    print(root)
    for fname in fnames:
        print(fname)
        if "bonus" in fname.lower():
            continue
        # Only names with "_" as their 6th character are processed
        # (presumably ripper output such as "title_t00.mkv" -- confirm).
        # Slicing instead of indexing keeps names shorter than six
        # characters from raising IndexError.
        if fname[5:6] != "_":
            print("underscore")
            continue
        if os.path.splitext(fname)[-1] != ".mkv":
            continue

        file_in = os.path.join(root, fname)
        print(file_in)
        f = VideoAudio(file_in, audio_temp_dir=AUDIO_ROOT)
        videos.append(f)

In [None]:
# Show how many video files were collected.
len(videos)

# Run the audio-to-text model over the start of each video file

In [None]:
# Load the English Silero speech-to-text model from torch.hub.
# The CPU is used here; a GPU device works as well.
device = torch.device("cpu")

model, decoder, utils = torch.hub.load(
    repo_or_dir="snakers4/silero-models",
    model="silero_stt",
    language="en",  # also available 'de', 'es'
    device=device,
)

# Helper functions returned with the model; see their signatures for details.
read_batch, split_into_batches, read_audio, prepare_model_input = utils


In [None]:
# Transcribe the audio clip of every video that has no transcript yet.
for video in videos:
    # set_text() stores the normalised transcript as `clean_text`; videos that
    # already have one are skipped so re-running the cell is cheap.
    # getattr() covers instances on which the attribute was never set.
    if getattr(video, "clean_text", None) is not None:
        continue
    data_input = prepare_model_input(read_batch([video.audio]), device=device)
    output = model(data_input)
    # One file per batch, so the first output is the transcript of this clip.
    text = decoder(output[0].cpu())
    video.set_text(text)

In [None]:
# Normalise the scraped subtitle text the same way as the transcripts
# (lower-case, punctuation removed); this helps the matcher.
with open(subtitles_file, 'r') as fp:
    labels = json.load(fp)

# Translation table deleting ' " , . - ! ? in one pass.
_punctuation_table = str.maketrans("", "", "'\",.-!?")
for entry in labels:
    entry["text"] = entry["text"].lower().strip().translate(_punctuation_table)
print(f"{len(labels)=}")

In [None]:
# Match every transcript against the subtitle labels with the built-in
# difflib sequence matcher and keep the labels that are similar enough.

for video in videos:
    # Remove existing matches before re-matching
    video.canidates = []

    # A video without a transcript is compared as an empty string and
    # therefore gets no candidates.
    transcript = getattr(video, "clean_text", None) or ""

    for label in labels:
        target = label["text"]
        # Compare only the common prefix length so the longer text is not
        # penalised for its extra tail.
        length = min(len(target), len(transcript))
        if length == 0:
            # SequenceMatcher rates two empty strings as a perfect match
            # (ratio 1.0), which would make an empty transcript or label
            # match everything.
            continue
        sim = difflib.SequenceMatcher(None, transcript[:length], target[:length]).ratio()
        if sim > 0.5:
            video.canidates.append({**label, "sim": sim})

    # Best match first.
    video.canidates.sort(key=lambda x: x["sim"], reverse=True)
    print(f"{video.video}")
    for canidate in video.canidates:
        # Print the transcript above each candidate, aligned with the label
        # text, for visual comparison.
        print(f"\t{' '*7}{video.text}")
        print(f"\t{canidate['se']:6s}-{canidate['text']}")

In [None]:
# Report episodes that are the only candidate of more than one video
from collections import defaultdict

used = defaultdict(list)  # episode code -> paths of the videos claiming it
for video in videos:
    if len(video.canidates) != 1:
        continue
    episode = video.canidates[0]['se']
    if episode in used:
        print(f"Found duplicate se {episode} for video {video.video}")
    used[episode].append(video.video)

# Summary: list every episode claimed by two or more videos.
for episode, paths in used.items():
    if len(paths) > 1:
        print(f"\n{episode}")
        for path in paths:
            print(f"\t{path}")


# Rename the files

In [None]:
# Move all videos that have exactly 1 strong match (similarity above 0.7)
for video in videos:
    if len(video.canidates) != 1 or not (video.canidates[0]['sim'] > 0.7):
        continue

    match = video.canidates[0]
    # A path separator inside the scraped title would make the destination
    # point into a (non-existent) sub-directory, so replace it.
    safe_title = match['title'].replace(os.sep, "-")
    if os.altsep:
        safe_title = safe_title.replace(os.altsep, "-")

    extension = os.path.splitext(video.video)[-1]
    fname_new = os.path.join(os.path.dirname(video.video), f"{match['se']} {safe_title}{extension}")

    if not os.path.isfile(video.video):
        print(f"Source file not found for {video.video} -> {fname_new}")
        continue
    # exists() rather than isfile(): shutil.move() would move the video
    # *into* a directory that happens to carry the destination name.
    if os.path.exists(fname_new):
        print(f"Destination file already exists for {video.video} -> {fname_new}")
        continue
    shutil.move(video.video, fname_new)