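Download `title.akas.tsv.gz` and `title.basics.tsv.gz` from https://datasets.imdbws.com/, extract them, and put the resulting `title.akas.tsv` and `title.basics.tsv` files in the `data/raw/IMDb/` folder.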

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load only the needed columns of each IMDb TSV file into a DataFrame.
# Every column is read as a string: IMDb marks missing values with the literal
# `\N` (visible in `genres` further down), so `startYear` cannot be parsed as an
# integer column, and forcing `str` keeps pandas from inferring a different
# dtype for different chunks of these very large files.
title_akas = pd.read_csv(
    "data/raw/IMDb/title.akas.tsv",
    sep="\t",
    usecols=["titleId", "title", "region"],
    dtype=str,
)
title_basics = pd.read_csv(
    "data/raw/IMDb/title.basics.tsv",
    sep="\t",
    usecols=["tconst", "primaryTitle", "titleType", "startYear", "genres"],
    dtype=str,
)

# Inner-join the alternative titles with the basic title info on the shared
# title id (`titleId` in title.akas, `tconst` in title.basics).
merged_df = pd.merge(title_akas, title_basics, left_on="titleId", right_on="tconst")

# Keep only the columns used downstream. `titleId` is left out because it is
# identical to `tconst` after the join.
imdb_movies = merged_df[
    ["tconst", "title", "primaryTitle", "region", "titleType", "startYear", "genres"]
]

In [3]:
# Sanity check: (rows, columns) of the merged akas/basics table.
imdb_movies.shape

(50354637, 7)

In [3]:
# Region codes for the Soviet Union ("SU") and the countries that succeeded it.
regions = [
    "SU",
    "RU",
    "UA",
    "BY",
    "KZ",
    "UZ",
    "GE",
    "AM",
    "AZ",
    "LT",
    "LV",
    "EE",
    "TM",
    "KG",
    "TJ",
    "MD",
]

# Case-insensitive *substring* regex match, so longer codes that contain one of
# the entries (e.g. "SUHH", seen in the displayed rows) are selected as well.
# `na=False` makes rows without a region count as "no match" without having to
# build a filled copy of the whole column first.
is_soviet_region = imdb_movies["region"].str.contains(
    "|".join(regions), case=False, na=False
)
soviet_movies = imdb_movies[is_soviet_region]

soviet_movies = soviet_movies[soviet_movies["titleType"] == "movie"]

# Compare years as numbers rather than as strings: non-numeric entries such as
# `\N` become NaN and are dropped by the range test, and the filter no longer
# depends on the column holding only 4-character strings.
start_year = pd.to_numeric(soviet_movies["startYear"], errors="coerce")

# Keep 1901-1990 inclusive (same range as `> 1900` and `< 1991`).
# NOTE(review): de-duplicating on `primaryTitle` removes the repeated akas rows
# of one title, but it also collapses *different* titles that share a name.
# Switch the subset to "tconst" if that is not intended.
soviet_movies = soviet_movies[start_year.between(1901, 1990)].drop_duplicates(
    subset="primaryTitle", keep="first"
)

In [5]:
# Display the filtered frame (pandas elides the middle rows of a long frame).
soviet_movies

Unnamed: 0,tconst,title,primaryTitle,region,titleType,startYear,genres
4507,tt0001338,Ночь в мае,A Night in May,RU,movie,1910,Drama
4880,tt0001475,Роковая любовь,Amor fatal,SUHH,movie,1911,"Drama,Romance"
5977,tt0001892,Чёрный сон,Den sorte drøm,RU,movie,1911,Drama
6183,tt0001964,Предательница,The Traitress,RU,movie,1911,Drama
6329,tt0002022,Анфиса,Anfisa,RU,movie,1912,Drama
...,...,...,...,...,...,...,...
50135454,tt9815160,Радуга над островом Цзиньмынь,Kinmontô ni kakeru hashi,SUHH,movie,1962,"Action,War"
50219920,tt9856080,Мезозойская история,Mezozoy Ahvalati,RU,movie,1976,"Drama,History"
50297575,tt9890850,Сказание о любви,Leyli va Macnun,RU,movie,1961,"Drama,History,Romance"
50327050,tt9905224,Стрелок,The Shooter,SUHH,movie,1975,"Adventure,Drama"


In [4]:
from imdb import IMDb

# Module-level IMDb client; `get_plot_summary` looks titles up through it.
ia = IMDb()


def get_plot_summary(tconst):
    """Return the first plot summary IMDb has for a title, or ``None``.

    Args:
        tconst: IMDb title identifier such as ``"tt0001338"``. The first two
            characters are dropped and the remainder is converted to ``int``
            before being passed to ``ia.get_movie``.

    Returns:
        The first element of the movie's ``"plot"`` entry, or ``None`` when
        the movie has no plot or the lookup fails for any reason.

    The lookup is deliberately best-effort so that one bad title cannot abort
    a bulk download. Failures are logged with the offending ``tconst`` so they
    can be told apart from titles that simply have no plot, and retried later.
    """
    import logging  # local import keeps this cell runnable on its own

    try:
        movie = ia.get_movie(int(str(tconst)[2:]))
        plots = movie.get("plot")
        # An empty or missing plot list means "no summary available".
        return plots[0] if plots else None
    except Exception as exc:
        logging.getLogger(__name__).warning(
            "Plot lookup failed for %s: %r", tconst, exc
        )
        return None

In [19]:
# Preview one 2000-row window of `soviet_movies`, counted back from the end:
# i = 0 would select the last 2000 rows, i = 1 the 2000 rows before those.
i = 1
nrows = soviet_movies.shape[0]
soviet_movies.iloc[nrows - (i + 1) * 2000 : nrows - i * 2000]

Unnamed: 0,tconst,title,primaryTitle,region,titleType,startYear,genres
1776257,tt0314245,К Черному морю,K Chyornomu moryu,SUHH,movie,1958,"Comedy,Romance"
1776278,tt0314249,"Какое оно, море?","Kakoe ono, more?",SUHH,movie,1965,"Drama,Family"
1776290,tt0314254,Преступление Сиро Камисаки,Called to the Stand,SUHH,movie,1956,\N
1776295,tt0314256,Капитан «Старой черепахи»,Kapitan 'Staroy cherepakhi',RU,movie,1956,"Adventure,Mystery"
1776359,tt0314279,Ключи от неба,Klyuchi ot neba,SUHH,movie,1965,"Comedy,Romance"
...,...,...,...,...,...,...,...
22064435,tt18352692,Рапсодия любви,A Rhapsody of Love,RU,movie,1931,"Biography,Drama"
22067109,tt18358148,Король пещеры,Count Monte Christopher,RU,movie,1932,Drama
22079700,tt18363564,Как улучшить жизнь,Getting Better Life,RU,movie,1933,Drama
22080198,tt18363948,Прекрасная жертва,Beautiful Devotion,RU,movie,1933,"Drama,Romance"


In [None]:
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import time

from pathlib import Path

# NOTE(review): the purpose of this pause is not evident from the cell; kept as is.
time.sleep(1)

# Create the output folder up front, so a missing directory is detected before
# the hours-long download instead of at the final write.
output_path = Path("data/preprocessed/soviet_movies.tsv")
output_path.parent.mkdir(parents=True, exist_ok=True)

# Look the plots up with 4 worker threads. `executor.map` yields results in
# input order, so the resulting list lines up with the rows of `soviet_movies`.
with ThreadPoolExecutor(max_workers=4) as executor:

    soviet_movies["plot"] = list(
        tqdm(
            executor.map(get_plot_summary, soviet_movies["tconst"]),
            total=len(soviet_movies),
        )
    )


# The DataFrame index is written as the first (unnamed) column of the TSV.
soviet_movies.to_csv(output_path, sep="\t")

 43%|████▎     | 12138/28266 [2:40:46<3:45:41,  1.19it/s]2024-11-14 20:17:42,802 CRITICAL [imdbpy] c:\Users\karim\AppData\Local\Programs\Python\Python311\Lib\site-packages\imdb\_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt0071275/reference', 'proxy': '', 'exception type': 'IOError', 'original exception': TimeoutError('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "c:\Users\karim\AppData\Local\Programs\Python\Python311\Lib\site-packages\imdb\parser\http\__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
               ^^^^^^^^^^^^^^^^^
  File "c:\Users\karim\AppData\Local\Programs\Python\Python311\Lib\urllib\request.py", line 519, in open
    response = self._open(req, data)
               ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\karim\AppData\Local\Programs\Python\Python311\Lib\urllib\request.py", line 536, in _open
    result = sel