In [2]:
# !pip install opensubtitlescom

In [7]:
# method to download and save imdb data to data/raw/
import requests

def download_data(url, filename):
    response = requests.get(url)
    with open(filename, 'wb') as file:
        file.write(response.content)

url = 'https://datasets.imdbws.com/'
basics = 'title.basics.tsv.gz'
ratings = 'title.ratings.tsv.gz'
path = '../data/raw/'

# download_data(f'{url}{ratings}', f'{path}{ratings}')
# download_data(f'{url}{basics}', f'{path}{basics}')

In [8]:
import pandas as pd

# load the data
basics_df = pd.read_csv(f'{path}{basics}', sep='\t', na_values='\\N')
ratings_df = pd.read_csv(f'{path}{ratings}', sep='\t', na_values='\\N')

  basics_df = pd.read_csv(f'{path}{basics}', sep='\t', na_values='\\N')


In [9]:
# filter out only movies
movies_df = basics_df[basics_df['titleType'] == 'movie']

In [10]:
movies_df = movies_df.merge(ratings_df, on='tconst')

In [11]:
# sort by number of votes
movies_df = movies_df.sort_values('numVotes', ascending=False)

In [12]:
# remove adult movies
movies_df = movies_df[movies_df['isAdult'] == 0]

In [13]:
# make new column called imdb_id. This is the same as tconst but with 'tt' removed
movies_df['imdb_id'] = movies_df['tconst'].str[2:]

# now remove leading zeroes (for the subtitle api)
movies_df['imdb_id'] = movies_df['imdb_id'].str.lstrip('0')

In [14]:
df = movies_df[['tconst', 'imdb_id', 'primaryTitle', 'genres', 'startYear', 'averageRating', 'numVotes']]
df = df.reset_index(drop=True)

# drop startYear if it is NaN
df = df.dropna(subset=['startYear'])

df['startYear'] = df['startYear'].astype(int)
df['averageRating'] = df['averageRating'].astype(float)
df['numVotes'] = df['numVotes'].astype(int)

# save as tsv
df[:50000].to_csv('../data/processed/movies.tsv', sep='\t', index=False)

In [15]:
from opensubtitlescom import OpenSubtitles
import pandas as pd
import json
import os
from dotenv import load_dotenv
load_dotenv()  # Loads variables from the .env file

APP_NAME = os.getenv('APP_NAME')
API_KEY = os.getenv('API_KEY')
MY_USERNAME = os.getenv('MY_USERNAME')
MY_PASSWORD = os.getenv('MY_PASSWORD')

# Initialize the OpenSubtitles client
subtitles = OpenSubtitles(APP_NAME, API_KEY)
subtitles.login(MY_USERNAME, MY_PASSWORD)

df = pd.read_csv('../data/processed/movies.tsv', sep='\t')

In [16]:
# list movie ids
ids = df['imdb_id'][:20000].tolist()
# ids
# get title from df based on imdb_id
def get_title(imdb_id):
    return df[df['imdb_id'] == imdb_id]['primaryTitle'].values[0]


In [17]:
import time

subtitle_path = "../data/raw/subtitles/"
subs_list = os.listdir(subtitle_path)

# if id is in missing.txt, skip
with open(subtitle_path + 'missing.txt') as f:
    missing = f.read().splitlines()

for i in ids:
    if str(i) in missing:
        print(f'{i}, {get_title(i)} is missing')
        continue

    prefix = f'{subtitle_path}{i} - '  # Construct the base part of the filename

    sub_found = False

    # Check if subtitles already exist
    for file in os.listdir(subtitle_path):
        if file.startswith(f'{i} - '):
            sub_found = True
            break

    if sub_found:
        # print(f'Subtitles for {i} already exist')
        continue

    try:
        # Search for subtitles based on imdb_id
        # response = subtitles.search(imdb_id=i, languages="en", order_by="download_count", order_direction="desc")
        response = subtitles.search(imdb_id=i, languages="en")
        # time.sleep(1)

        # Convert the response to a Json format
        response_json = response.to_json()
        r = json.loads(response_json)
    except:
        print(f'Error searching for subtitles for {i} - {get_title(i)}')
        time.sleep(5)
        continue

    try:
        title = r['data'][0]['title'] # Get the title of the movie
    except:
        print(f'No subtitle found for {i} - {get_title(i)}')
        # append id to missing.txt
        with open(subtitle_path + 'missing.txt', 'a') as f:
            f.write(f'{i}\n')
        continue

    # Remove special characters but keep spaces
    title = ''.join(e for e in title if e.isalnum() or e.isspace())

    # Download and parse the subtitle object
    try:
        srt = subtitles.download_and_parse(response.data[0])
    except:
        print(f'Error downloading subtitle for {title}')
        srt = subtitles.download_and_parse(response.data[1])

    # Save
    with open(f'{prefix}{title}.txt', 'w', encoding='utf-8') as f:
        for i in srt:
            # remove '<i>' and '</i>' tags
            i.content = i.content.replace('<i>', '').replace('</i>', '')
            f.write(i.content + '\n')

9531772, Sooryavanshi is missing
5988370, Reis is missing
7825208, Marighella is missing
7668842, Enes Batur: Imagination or Reality? is missing
1725047, My Little Princess is missing
4884540, Angels Apocalypse is missing
85809, Koyaanisqatsi is missing
6038600, Smolensk is missing
7221896, Cumali Ceber: Allah Seni Alsin is missing
29808429, Color of Victory is missing
27719898, Kaiva is missing
3666024, The Red Turtle is missing
4465156, Unbreakable Souls is missing
1788979, 67th Street, New York, NY is missing
15145764, Freddy is missing
6990206, Crimea is missing
4330758, Black Hawk Down is missing
1754730, The Way to Paradise is missing
9851854, Major is missing
2183014, Love at First Sight is missing
3655326, Life Is a Dream is missing
3120430, Mantus is missing
4458206, Code Name: K.O.Z. is missing
4972, The Birth of a Nation is missing
13028258, A Thursday is missing
4983780, Maanaadu is missing
11580854, Sarpatta Parambarai is missing
4736550, The Great Hack is missing
28362963

SRTParseError: Expected contiguous start of match or end of input at char 0, but started at char 6367 (unmatched content: '<!DOCTYPE html>\n<!--[if lt IE 7]> <html class="no-js ie6 oldie" lang="en-US"> <![endif]-->\n<!--[if IE 7]>    <html class="no-js ie7 oldie" lang="en-US"> <![endif]-->\n<!--[if IE 8]>    <html class="no-js ie8 oldie" lang="en-US"> <![endif]-->\n<!--[if gt IE 8]><!--> <html class="no-js" lang="en-US"> <!--<![endif]-->\n<head>\n\n\n<title>www.opensubtitles.com | 502: Bad gateway</title>\n<meta charset="UTF-8" />\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n<meta http-equiv="X-UA-Compatible" content="IE=Edge" />\n<meta name="robots" content="noindex, nofollow" />\n<meta name="viewport" content="width=device-width,initial-scale=1" />\n<link rel="stylesheet" id="cf_styles-css" href="/cdn-cgi/styles/main.css" />\n\n\n</head>\n<body>\n<div id="cf-wrapper">\n    <div id="cf-error-details" class="p-0">\n        <header class="mx-auto pt-10 lg:pt-6 lg:px-8 w-240 lg:w-full mb-8">\n            <h1 class="inline-block sm:block sm:mb-2 font-light text-60 lg:text-4xl text-black-dark leading-tight mr-2">\n              <span class="inline-block">Bad gateway</span>\n              <span class="code-label">Error code 502</span>\n            </h1>\n            <div>\n               Visit <a href="https://www.cloudflare.com/5xx-error-landing?utm_source=errorcode_502&utm_campaign=www.opensubtitles.com" target="_blank" rel="noopener noreferrer">cloudflare.com</a> for more information.\n            </div>\n            <div class="mt-3">2024-05-11 03:00:04 UTC</div>\n        </header>\n        <div class="my-8 bg-gradient-gray">\n            <div class="w-240 lg:w-full mx-auto">\n                <div class="clearfix md:px-8">\n                  \n<div id="cf-browser-status" class=" relative w-1/3 md:w-full py-15 md:p-0 md:py-8 md:text-left md:border-solid md:border-0 md:border-b md:border-gray-400 overflow-hidden float-left md:float-none text-center">\n  <div class="relative mb-10 md:m-0">\n    \n    <span class="cf-icon-browser block md:hidden h-20 bg-center bg-no-repeat"></span>\n    <span class="cf-icon-ok w-12 h-12 absolute left-1/2 md:left-auto md:right-0 md:top-0 -ml-6 -bottom-4"></span>\n    \n  </div>\n  <span class="md:block w-full truncate">You</span>\n  <h3 class="md:inline-block mt-3 md:mt-0 text-2xl text-gray-600 font-light leading-1.3">\n    \n    Browser\n    \n  </h3>\n  <span class="leading-1.3 text-2xl text-green-success">Working</span>\n</div>\n\n<div id="cf-cloudflare-status" class=" relative w-1/3 md:w-full py-15 md:p-0 md:py-8 md:text-left md:border-solid md:border-0 md:border-b md:border-gray-400 overflow-hidden float-left md:float-none text-center">\n  <div class="relative mb-10 md:m-0">\n    <a href="https://www.cloudflare.com/5xx-error-landing?utm_source=errorcode_502&utm_campaign=www.opensubtitles.com" target="_blank" rel="noopener noreferrer">\n    <span class="cf-icon-cloud block md:hidden h-20 bg-center bg-no-repeat"></span>\n    <span class="cf-icon-ok w-12 h-12 absolute left-1/2 md:left-auto md:right-0 md:top-0 -ml-6 -bottom-4"></span>\n    </a>\n  </div>\n  <span class="md:block w-full truncate">Chicago</span>\n  <h3 class="md:inline-block mt-3 md:mt-0 text-2xl text-gray-600 font-light leading-1.3">\n    <a href="https://www.cloudflare.com/5xx-error-landing?utm_source=errorcode_502&utm_campaign=www.opensubtitles.com" target="_blank" rel="noopener noreferrer">\n    Cloudflare\n    </a>\n  </h3>\n  <span class="leading-1.3 text-2xl text-green-success">Working</span>\n</div>\n\n<div id="cf-host-status" class="cf-error-source relative w-1/3 md:w-full py-15 md:p-0 md:py-8 md:text-left md:border-solid md:border-0 md:border-b md:border-gray-400 overflow-hidden float-left md:float-none text-center">\n  <div class="relative mb-10 md:m-0">\n    \n    <span class="cf-icon-server block md:hidden h-20 bg-center bg-no-repeat"></span>\n    <span class="cf-icon-error w-12 h-12 absolute left-1/2 md:left-auto md:right-0 md:top-0 -ml-6 -bottom-4"></span>\n    \n  </div>\n  <span class="md:block w-full truncate">www.opensubtitles.com</span>\n  <h3 class="md:inline-block mt-3 md:mt-0 text-2xl text-gray-600 font-light leading-1.3">\n    \n    Host\n    \n  </h3>\n  <span class="leading-1.3 text-2xl text-red-error">Error</span>\n</div>\n\n                </div>\n            </div>\n        </div>\n\n        <div class="w-240 lg:w-full mx-auto mb-8 lg:px-8">\n            <div class="clearfix">\n                <div class="w-1/2 md:w-full float-left pr-6 md:pb-10 md:pr-0 leading-relaxed">\n                    <h2 class="text-3xl font-normal leading-1.3 mb-4">What happened?</h2>\n                    <p>The web server reported a bad gateway error.</p>\n                </div>\n                <div class="w-1/2 md:w-full float-left leading-relaxed">\n                    <h2 class="text-3xl font-normal leading-1.3 mb-4">What can I do?</h2>\n                    <p class="mb-6">Please try again in a few minutes.</p>\n                </div>\n            </div>\n        </div>\n\n        <div class="cf-error-footer cf-wrapper w-240 lg:w-full py-10 sm:py-4 sm:px-8 mx-auto text-center sm:text-left border-solid border-0 border-t border-gray-300">\n  <p class="text-13">\n    <span class="cf-footer-item sm:block sm:mb-1">Cloudflare Ray ID: <strong class="font-semibold">881eeaa97ffd2237</strong></span>\n    <span class="cf-footer-separator sm:hidden">&bull;</span>\n    <span id="cf-footer-item-ip" class="cf-footer-item hidden sm:block sm:mb-1">\n      Your IP:\n      <button type="button" id="cf-footer-ip-reveal" class="cf-footer-ip-reveal-btn">Click to reveal</button>\n      <span class="hidden" id="cf-footer-ip">172.125.100.90</span>\n      <span class="cf-footer-separator sm:hidden">&bull;</span>\n    </span>\n    <span class="cf-footer-item sm:block sm:mb-1"><span>Performance &amp; security by</span> <a rel="noopener noreferrer" href="https://www.cloudflare.com/5xx-error-landing?utm_source=errorcode_502&utm_campaign=www.opensubtitles.com" id="brand_link" target="_blank">Cloudflare</a></span>\n    \n  </p>\n  <script>(function(){function d(){var b=a.getElementById("cf-footer-item-ip"),c=a.getElementById("cf-footer-ip-reveal");b&&"classList"in b&&(b.classList.remove("hidden"),c.addEventListener("click",function(){c.classList.add("hidden");a.getElementById("cf-footer-ip").classList.remove("hidden")}))}var a=document;document.addEventListener&&a.addEventListener("DOMContentLoaded",d)})();</script>\n</div><!-- /.error-footer -->\n\n\n    </div>\n</div>\n</body>\n</html>\n')