# Extract subtitles 

In [1]:
%load_ext autoreload
%autoreload 2

## Imports

In [2]:
from bechdelai.data.opensubtitles import search
from bechdelai.data.opensubtitles import get_subtitle_link
from bechdelai.data.opensubtitles import download_subtitle_from_url
from bechdelai.data.opensubtitles import get_subtitles_from_movie

## Search

In [3]:

import chardet
import pysrt
import requests
from bs4 import BeautifulSoup

# Site root that get_subtitle_link prepends to each scraped href.
BASE_URL = "https://www.opensubtitles.org"

def get_subtitle_link(search_url: str, timeout: float = 30.0) -> list:
    """Collect every subtitle download link found on a search page.

    Note: this notebook-local definition shadows the ``get_subtitle_link``
    imported from ``bechdelai.data.opensubtitles`` earlier in the notebook.

    Args:
        search_url (str): the search url to scrape for subtitle links
        timeout (float): seconds to wait for the server before giving up
            (without a timeout ``requests`` may block indefinitely)

    Returns:
        list: one absolute url (``BASE_URL`` + href) per anchor whose href
            contains "subtitleserve"; an empty list when the page has none

    Raises:
        Exception: when the request fails (HTTP error status, connection
            error, timeout, or any other ``requests`` failure); the original
            ``requests`` exception is attached as the cause
    """
    try:
        response = requests.get(search_url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.HTTPError as errh:
        raise Exception("Http Error:", errh) from errh
    except requests.exceptions.ConnectionError as errc:
        raise Exception("Error Connecting:", errc) from errc
    except requests.exceptions.Timeout as errt:
        raise Exception("Timeout Error:", errt) from errt
    except requests.exceptions.RequestException as err:
        raise Exception("OOps: Something Else", err) from err

    soup = BeautifulSoup(response.text, "html.parser")
    anchors = soup.select("a[href*=subtitleserve]")

    # Always hand back a list (possibly empty) so callers can extend()/len()
    # the result without special-casing a different "nothing found" type.
    return [f"{BASE_URL}{anchor.get('href')}" for anchor in anchors]

In [4]:
from time import sleep

In [5]:
SEARCH_URL = "https://www.opensubtitles.org/en/search/sublanguageid-eng"
url = SEARCH_URL
n_max = 5_000
offset = 0
all_url = []

# Walk the paginated search results, advancing the offset by the number of
# links found on each page, until at least n_max links have been collected.
while offset < n_max:
    sleep(1)  # throttle: wait one second before every request
    print(url)
    page_links = get_subtitle_link(url)
    if not page_links:
        # An empty page would leave `offset` unchanged, so without this exit
        # the loop would keep requesting the same url forever.
        break
    all_url.extend(page_links)

    offset += len(page_links)
    url = f"{SEARCH_URL}/offset-{offset}"
    

https://www.opensubtitles.org/en/search/sublanguageid-eng
https://www.opensubtitles.org/en/search/sublanguageid-eng/offset-40
https://www.opensubtitles.org/en/search/sublanguageid-eng/offset-80
https://www.opensubtitles.org/en/search/sublanguageid-eng/offset-120
https://www.opensubtitles.org/en/search/sublanguageid-eng/offset-160
https://www.opensubtitles.org/en/search/sublanguageid-eng/offset-200
https://www.opensubtitles.org/en/search/sublanguageid-eng/offset-240
https://www.opensubtitles.org/en/search/sublanguageid-eng/offset-280
https://www.opensubtitles.org/en/search/sublanguageid-eng/offset-320
https://www.opensubtitles.org/en/search/sublanguageid-eng/offset-360
https://www.opensubtitles.org/en/search/sublanguageid-eng/offset-400
https://www.opensubtitles.org/en/search/sublanguageid-eng/offset-440
https://www.opensubtitles.org/en/search/sublanguageid-eng/offset-480
https://www.opensubtitles.org/en/search/sublanguageid-eng/offset-520
https://www.opensubtitles.org/en/search/sublang

In [10]:
import pandas as pd

In [11]:
# Wrap the collected links in a Series labelled "url".
tmp = pd.Series(all_url, name="url")

In [14]:
# Persist the collected links to url.csv, leaving out the index column.
tmp.to_csv("url.csv", index=False)

In [6]:
import zipfile
from io import BytesIO
from os.path import exists

from bechdelai.data.scrap import get_data_from_url

def dl_and_save_srt_file(url, save_dir):
    """Download a zipped subtitle archive and extract its first .srt entry.

    Args:
        url (str): download link of the subtitle archive
        save_dir (str): directory the .srt file is extracted into

    Returns:
        str or None:
            - the path ``f"{save_dir}/{name}"`` of the extracted file;
            - ``None`` when a file already exists at that path (nothing is
              extracted);
            - the string ``"None"`` when the archive holds no .srt entry;
            - a string starting with ``"Error during unzipping:"`` followed by
              the error detail when opening or extracting the archive fails.
    """
    response = get_data_from_url(url)
    archive = BytesIO(response.content)
    save_path = "None"

    try:
        with zipfile.ZipFile(archive) as zfile:
            for name in zfile.namelist():
                # Only real .srt entries qualify: compare the suffix,
                # case-insensitively, rather than searching the whole name.
                if not name.lower().endswith(".srt"):
                    continue

                save_path = f"{save_dir}/{name}"
                if exists(save_path):
                    return None
                zfile.extract(name, save_dir)
                # Only the first .srt entry of the archive is kept.
                break

    except Exception as e:
        # Best effort: report the failure to the caller instead of raising,
        # but keep the underlying reason in the message.
        return f"Error during unzipping: {e}"

    return save_path

    

In [7]:

# Download the subtitle behind every collected link into data/ and report
# each non-None result together with the link it came from.
for i, url in enumerate(all_url):
    try:
        res = dl_and_save_srt_file(url, save_dir="data/")
    except Exception as e:
        print("========== ERROR:", e)
        # Nothing to report for this link: `res` is either unbound (first
        # iteration) or still holds the previous link's result.
        continue

    if res is not None:
        print(i, res)
        print(url)
    

0 data//better.call.saul.s06e06.720p.web.h264-cakes.Φï▒µûç.srt
https://www.opensubtitles.org/en/subtitleserve/sub/9098133
1 data//Heart.of.the.Matter.2022.Hallmark.WebripTV.720p.10bit.hevc.Hi.srt
https://www.opensubtitles.org/en/subtitleserve/sub/9098130
2 data//Heart.of.the.Matter.2022.Hallmark.WebripTV.720p.10bit.hevc.srt
https://www.opensubtitles.org/en/subtitleserve/sub/9098128
3 data//The.G.Word.with.Adam.Conover.S01E02.Weather.720p.NF.WEB-DL.DDP5.1.x264-SMURF-en-forced.srt
https://www.opensubtitles.org/en/subtitleserve/sub/9098106
4 data//The.G.Word.with.Adam.Conover.S01E01.Food.720p.NF.WEB-DL.DDP5.1.x264-SMURF-en-forced.srt
https://www.opensubtitles.org/en/subtitleserve/sub/9098105
5 data//the.staircase.2022.s01e05.720p.web.h264-cakes.Hi.srt
https://www.opensubtitles.org/en/subtitleserve/sub/9098103
6 data//the.staircase.2022.s01e05.720p.web.h264-cakes.srt
https://www.opensubtitles.org/en/subtitleserve/sub/9098102
7 data//The.G.Word.with.Adam.Conover.S01E06.Change.720p.NF.WEB-DL

KeyboardInterrupt: 