In [1]:
# https://datasets.imdbws.com/title.ratings.tsv.gz
# https://datasets.imdbws.com/title.basics.tsv.gz
# !pip install opensubtitlescom

In [2]:
basics_fp = '../raw_data/title.basics.tsv.gz'
ratings_fp = '../raw_data/title.ratings.tsv.gz'

import gzip
import csv
import pandas as pd

# load the data
basics = pd.read_csv(basics_fp, sep='\t', na_values='\\N')
ratings = pd.read_csv(ratings_fp, sep='\t', na_values='\\N')


  basics = pd.read_csv(basics_fp, sep='\t', na_values='\\N')


In [3]:
# filter out only movies
movies = basics[basics['titleType'] == 'movie']


In [4]:
movies = movies.merge(ratings, on='tconst')

In [5]:
# sort by number of votes
movies = movies.sort_values('numVotes', ascending=False)
movies

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
61700,tt0111161,movie,The Shawshank Redemption,The Shawshank Redemption,0.0,1994.0,,142.0,Drama,9.3,2880134
137168,tt0468569,movie,The Dark Knight,The Dark Knight,0.0,2008.0,,152.0,"Action,Crime,Drama",9.0,2862710
178074,tt1375666,movie,Inception,Inception,0.0,2010.0,,148.0,"Action,Adventure,Sci-Fi",8.8,2543314
71657,tt0137523,movie,Fight Club,Fight Club,0.0,1999.0,,139.0,Drama,8.8,2315890
60967,tt0109830,movie,Forrest Gump,Forrest Gump,0.0,1994.0,,142.0,"Drama,Romance",8.8,2249667
...,...,...,...,...,...,...,...,...,...,...,...
299077,tt8460942,movie,13 A Ludodrama about Walter Benjamin,13 A Ludodrama about Walter Benjamin,0.0,2018.0,,79.0,"Biography,Documentary,History",7.2,5
211984,tt20115958,movie,Lejos de Casa,Lejos de Casa,0.0,2022.0,,80.0,Documentary,7.0,5
180189,tt14038500,movie,"Love Reborn: Comics, Music & Stories of the Past",Love Reborn,0.0,2018.0,,98.0,"Comedy,Drama,Romance",6.4,5
283899,tt6494064,movie,Scuola in mezzo al mare,Scuola in mezzo al mare,0.0,2018.0,,75.0,Documentary,8.2,5


In [78]:
# remove adult movies
movies = movies[movies['isAdult'] == 0]

In [13]:
# make new column called imdb_id. This is the same as tconst but with 'tt' removed
movies['imdb_id'] = movies['tconst'].str[2:]
# now remove leading zeroes
movies['imdb_id'] = movies['imdb_id'].str.lstrip('0')

In [84]:
df = movies[['tconst', 'imdb_id', 'primaryTitle', 'startYear', 'averageRating', 'numVotes']]
df = df.reset_index(drop=True)
# drop startYear if it is NaN
df = df.dropna(subset=['startYear'])
df['startYear'] = df['startYear'].astype(int)

df['averageRating'] = df['averageRating'].astype(float)
df['numVotes'] = df['numVotes'].astype(int)

# save as tsv
df[:20000].to_csv('../processed_data/movies.tsv', sep='\t', index=False)

In [87]:
from opensubtitlescom import OpenSubtitles
import json
import os
from dotenv import load_dotenv
load_dotenv()  # Loads variables from the .env file

APP_NAME = os.getenv('APP_NAME')
API_KEY = os.getenv('API_KEY')
MY_USERNAME = os.getenv('MY_USERNAME')
MY_PASSWORD = os.getenv('MY_PASSWORD')


In [88]:
# Initialize the OpenSubtitles client
subtitles = OpenSubtitles(APP_NAME, API_KEY)
subtitles.login(MY_USERNAME, MY_PASSWORD)


{'user': {'allowed_translations': 1,
  'allowed_downloads': 20,
  'level': 'Sub leecher',
  'user_id': 680074,
  'ext_installed': False,
  'vip': False},
 'token': 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpc3MiOiJTYmZIS2hCWlVCNWpjOTg4TW1OQ0pnVFd5TFVPS21VdyIsImV4cCI6MTcxMjY2NjY0OH0.0UfYuvJKBmHUPWfQpubiZKegJHkSxujzBpmaHXKeLTU',
 'status': 200,
 'base_url': 'api.opensubtitles.com'}

In [74]:
# list movie ids
ids = movies['imdb_id'][:200].tolist()
ids

['111161',
 '468569',
 '1375666',
 '137523',
 '109830',
 '110912',
 '816692',
 '133093',
 '68646',
 '120737',
 '167260',
 '1345836',
 '114369',
 '167261',
 '1853728',
 '172495',
 '361748',
 '993846',
 '372784',
 '102926',
 '120815',
 '7286456',
 '848228',
 '1130884',
 '76759',
 '108052',
 '482571',
 '407887',
 '120689',
 '499549',
 '80684',
 '71562',
 '209144',
 '88763',
 '120338',
 '2015381',
 '4154796',
 '99685',
 '110413',
 '169547',
 '325980',
 '4154756',
 '910970',
 '120382',
 '266697',
 '120586',
 '434409',
 '103064',
 '114814',
 '110357',
 '1431045',
 '371746',
 '1049413',
 '86190',
 '266543',
 '81505',
 '264464',
 '112573',
 '105236',
 '1392190',
 '338013',
 '73486',
 '114709',
 '2267998',
 '119217',
 '107290',
 '477348',
 '167404',
 '82971',
 '1392170',
 '268978',
 '2582802',
 '198781',
 '2488496',
 '6751668',
 '78748',
 '1201607',
 '95016',
 '1675434',
 '88247',
 '3659388',
 '2395427',
 '75314',
 '86250',
 '253474',
 '208092',
 '800369',
 '1300854',
 '1843866',
 '180093',
 '4

In [75]:
subtitle_path = "../raw_data/subtitles/"

for i in ids:
    # Search for subtitles
    response = subtitles.search(imdb_id=i, languages="en", order_by="download_count", order_direction="desc")


    # Convert the response to a Json format
    response_json = response.to_json()
    r = json.loads(response_json)
    title = r['data'][0]['title']

    # Remove special characters but keep spaces
    title = ''.join(e for e in title if e.isalnum() or e.isspace())

    # Check if file exists
    if os.path.exists(f'{subtitle_path}{i} - {title}.txt'):
        continue

    # Download and parse the subtitle object
    srt = subtitles.download_and_parse(response.data[0])

    # Save
    with open(f'{subtitle_path}{i} - {title}.txt', 'w', encoding='utf-8') as f:
        for i in srt:
            # remove '<i>' and '</i>' tags
            i.content = i.content.replace('<i>', '').replace('</i>', '')
            f.write(i.content + '\n')




OpenSubtitlesException: Download limit reached. Please upgrade your account or wait for your quota to reset (~24hrs)