In [2]:
# !pip install opensubtitlescom

In [7]:
# method to download and save imdb data to data/raw/
import requests

def download_data(url, filename, timeout=60):
    """Download ``url`` and write the response body to ``filename``.

    Args:
        url: Address to fetch with an HTTP GET request.
        filename: Path of the file to create or overwrite (opened in binary mode).
        timeout: Value passed to ``requests.get`` as its ``timeout`` so a
            stalled server cannot block the notebook indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status. The
            check happens before the file is opened, so an error page is never
            saved in place of the data.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of writing the error body to disk.
    response.raise_for_status()
    with open(filename, 'wb') as file:
        file.write(response.content)

import os

# Base address of the IMDb dataset dumps and the two files this notebook uses.
url = 'https://datasets.imdbws.com/'
basics = 'title.basics.tsv.gz'
ratings = 'title.ratings.tsv.gz'
# Local directory the raw dumps are stored in.
path = '../data/raw/'

# Fetch each dump only when it is not already on disk, so a fresh-kernel run
# works without editing the cell and repeated runs do not re-download.
os.makedirs(path, exist_ok=True)
for dataset in (ratings, basics):
    if not os.path.exists(f'{path}{dataset}'):
        download_data(f'{url}{dataset}', f'{path}{dataset}')

In [8]:
import pandas as pd

# Read both IMDb dumps with the same parser settings: tab-separated fields,
# with the literal \N marking a missing value.
tsv_options = {'sep': '\t', 'na_values': '\\N'}
basics_df = pd.read_csv(path + basics, **tsv_options)
ratings_df = pd.read_csv(path + ratings, **tsv_options)

  basics_df = pd.read_csv(f'{path}{basics}', sep='\t', na_values='\\N')


In [9]:
# Keep only the rows whose titleType is exactly 'movie'.
is_movie = basics_df['titleType'].eq('movie')
movies_df = basics_df.loc[is_movie]

In [10]:
# Attach the rating columns to each movie, matching rows on the shared tconst key.
movies_df = pd.merge(movies_df, ratings_df, on='tconst')

In [11]:
# Order the movies from most-voted to least-voted.
movies_df = movies_df.sort_values(by=['numVotes'], ascending=False)

In [12]:
# remove adult movies
# Compare on a numeric view of the column: if isAdult was parsed with mixed
# types, a plain `== 0` would silently drop rows holding the string '0'.
# Values that cannot be read as a number become NaN and are excluded.
is_adult_flag = pd.to_numeric(movies_df['isAdult'], errors='coerce')
movies_df = movies_df[is_adult_flag == 0]

In [13]:
# imdb_id is tconst without its first two characters (the 'tt' prefix) ...
id_digits = movies_df['tconst'].str.slice(start=2)

# ... and without leading zeroes (for the subtitle api)
movies_df['imdb_id'] = id_digits.str.lstrip('0')

In [14]:
# Keep the columns needed downstream, drop movies without a startYear, and
# cast the numeric columns to plain int/float types.
df = (
    movies_df[['tconst', 'imdb_id', 'primaryTitle', 'genres', 'startYear', 'averageRating', 'numVotes']]
    .reset_index(drop=True)
    .dropna(subset=['startYear'])
    .astype({'startYear': int, 'averageRating': float, 'numVotes': int})
)

# Write the first 50000 rows as a tab-separated file, without the index column.
df.iloc[:50000].to_csv('../data/processed/movies.tsv', sep='\t', index=False)

In [1]:
from opensubtitlescom import OpenSubtitles
import pandas as pd
import json
import os
from dotenv import load_dotenv
load_dotenv()  # Loads variables from the .env file

# Credentials come from environment variables. Fail fast with the names of any
# that are unset or empty, rather than passing None on to the API client.
_missing_env = [
    name
    for name in ('APP_NAME', 'API_KEY', 'MY_USERNAME', 'MY_PASSWORD')
    if not os.getenv(name)
]
if _missing_env:
    raise RuntimeError(
        'Missing required environment variables: ' + ', '.join(_missing_env)
    )

APP_NAME = os.getenv('APP_NAME')
API_KEY = os.getenv('API_KEY')
MY_USERNAME = os.getenv('MY_USERNAME')
MY_PASSWORD = os.getenv('MY_PASSWORD')

# Initialize the OpenSubtitles client
subtitles = OpenSubtitles(APP_NAME, API_KEY)
subtitles.login(MY_USERNAME, MY_PASSWORD)

# Movie table written by the preprocessing step.
df = pd.read_csv('../data/processed/movies.tsv', sep='\t')

In [2]:
# The imdb_id values of the first 20000 rows of df, as a plain Python list.
ids = df['imdb_id'].head(20000).tolist()
def get_title(imdb_id):
    """Return the primaryTitle of the first row in ``df`` whose imdb_id matches.

    Raises:
        IndexError: If no row of ``df`` has the given ``imdb_id``.
    """
    matching_titles = df.loc[df['imdb_id'] == imdb_id, 'primaryTitle']
    return matching_titles.iloc[0]


In [3]:
import time

subtitle_path = "../data/raw/subtitles/"
subs_list = os.listdir(subtitle_path)

# Ids that already have a saved subtitle file. Files are written below as
# '<imdb_id> - <title>.txt', so the id is everything before the first ' - '.
# Listing the directory once avoids re-scanning it for every movie.
downloaded_ids = {name.split(' - ', 1)[0] for name in subs_list if ' - ' in name}

# Ids listed in missing.txt had no subtitles on an earlier run and are skipped.
missing_file = subtitle_path + 'missing.txt'
if os.path.exists(missing_file):
    with open(missing_file) as f:
        missing = f.read().splitlines()
else:
    # First run: nothing has been recorded as missing yet.
    missing = []
missing_ids = set(missing)

for i in ids:
    if str(i) in missing_ids:
        print(f'{i}, {get_title(i)} is missing')
        continue

    # Subtitles for this id were already saved on a previous run.
    if str(i) in downloaded_ids:
        continue

    prefix = f'{subtitle_path}{i} - '  # Construct the base part of the filename

    try:
        # Search for English subtitles based on imdb_id
        response = subtitles.search(imdb_id=i, languages="en")

        # Convert the response to a plain dict via its JSON form
        r = json.loads(response.to_json())
    except Exception:
        # `except Exception` (not a bare except) so Ctrl-C still stops the loop.
        print(f'Error searching for subtitles for {i} - {get_title(i)}')
        time.sleep(5)
        continue

    try:
        title = r['data'][0]['title']  # Title reported for the first search hit
    except (LookupError, TypeError):
        print(f'No subtitle found for {i} - {get_title(i)}')
        # Remember the id so later runs skip it without querying the API.
        with open(missing_file, 'a') as f:
            f.write(f'{i}\n')
        continue

    # Remove special characters but keep spaces
    title = ''.join(e for e in title if e.isalnum() or e.isspace())

    # Download and parse the first search hit, falling back to the second one
    # only if it exists.
    srt = None
    download_error = None
    for candidate in list(response.data)[:2]:
        try:
            srt = subtitles.download_and_parse(candidate)
            break
        except Exception as exc:
            print(f'Error downloading subtitle for {title}')
            download_error = exc

    if srt is None:
        if download_error is None:
            # The search returned no downloadable entries for this id.
            continue
        # Every attempt failed (for example an exhausted download quota):
        # surface the real error and stop instead of retrying for every id.
        raise download_error

    # Save one subtitle line per row, without '<i>' and '</i>' tags
    with open(f'{prefix}{title}.txt', 'w', encoding='utf-8') as f:
        for line in srt:
            f.write(line.content.replace('<i>', '').replace('</i>', '') + '\n')
    downloaded_ids.add(str(i))

9531772, Sooryavanshi is missing
5988370, Reis is missing
7825208, Marighella is missing
7668842, Enes Batur: Imagination or Reality? is missing
1725047, My Little Princess is missing
4884540, Angels Apocalypse is missing
85809, Koyaanisqatsi is missing
6038600, Smolensk is missing
7221896, Cumali Ceber: Allah Seni Alsin is missing
29808429, Color of Victory is missing
27719898, Kaiva is missing
3666024, The Red Turtle is missing
4465156, Unbreakable Souls is missing
1788979, 67th Street, New York, NY is missing
15145764, Freddy is missing
6990206, Crimea is missing
4330758, Black Hawk Down is missing
1754730, The Way to Paradise is missing
9851854, Major is missing
2183014, Love at First Sight is missing
3655326, Life Is a Dream is missing
3120430, Mantus is missing
4458206, Code Name: K.O.Z. is missing
4972, The Birth of a Nation is missing
13028258, A Thursday is missing
4983780, Maanaadu is missing
11580854, Sarpatta Parambarai is missing
4736550, The Great Hack is missing
28362963

OpenSubtitlesException: Download limit reached. Please upgrade your account or wait for your quota to reset (~24hrs)