Download `title.akas.tsv` and `title.basics.tsv` from https://datasets.imdbws.com/ (the site serves them gzip-compressed, so decompress them first) and place them in the `data/` folder.

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the two IMDb TSV dumps, keeping only the columns used below.
#   dtype=str             - every column stays textual, so pandas never infers a
#                           mixed int/str column (e.g. startYear) chunk by chunk.
#   keep_default_na=False - values such as the region code "NA" or a title
#                           "None" are kept as text instead of becoming NaN;
#                           IMDb's own missing-value marker "\N" is left as-is.
#   quoting=3             - csv.QUOTE_NONE: the dumps are not CSV-quoted, so a
#                           double quote inside a title is an ordinary character
#                           and must not start a multi-line quoted field.
title_akas = pd.read_csv(
    "data/title.akas.tsv",
    sep="\t",
    usecols=["titleId", "title", "region"],
    dtype=str,
    keep_default_na=False,
    quoting=3,
)
title_basics = pd.read_csv(
    "data/title.basics.tsv",
    sep="\t",
    usecols=["tconst", "primaryTitle", "titleType", "startYear", "genres"],
    dtype=str,
    keep_default_na=False,
    quoting=3,
)

# Inner-join localized titles to title metadata: `titleId` in title.akas holds
# the same identifier as `tconst` in title.basics. The result has one row per
# (title, alternative name) pair.
merged_df = pd.merge(title_akas, title_basics, left_on="titleId", right_on="tconst")

# Keep the analysis columns; `titleId` is dropped because it duplicates `tconst`.
imdb_movies = merged_df[
    ["tconst", "title", "primaryTitle", "region", "titleType", "startYear", "genres"]
]

In [3]:
# Region codes for the Soviet Union and the fifteen post-Soviet states.
regions = [
    "SU",
    "RU",
    "UA",
    "BY",
    "KZ",
    "UZ",
    "GE",
    "AM",
    "AZ",
    "LT",
    "LV",
    "EE",
    "TM",
    "KG",
    "TJ",
    "MD",
]

# Match region codes exactly instead of as regex substrings. "SUHH" (the code
# that appears on Soviet-era rows of this dataset) is listed explicitly because
# the previous substring search only caught it through its "SU" prefix.
soviet_region_codes = {code.upper() for code in regions} | {"SUHH"}
region_code = imdb_movies["region"].fillna("").str.upper()

# Compare years numerically; anything that is not a number (e.g. IMDb's "\N"
# placeholder) becomes NaN and fails both bounds, whatever dtype was loaded.
start_year = pd.to_numeric(imdb_movies["startYear"], errors="coerce")

is_soviet_movie = (
    region_code.isin(soviet_region_codes)
    & (imdb_movies["titleType"] == "movie")
    & start_year.gt(1900)
    & start_year.lt(1991)
)

# One row per film: deduplicate on the IMDb identifier rather than on the
# title, so distinct films that share a primaryTitle are both retained.
soviet_movies = imdb_movies[is_soviet_movie].drop_duplicates(
    subset="tconst", keep="first"
)

In [4]:
# Bare last expression: renders the filtered frame as this cell's output.
soviet_movies

Unnamed: 0,tconst,title,primaryTitle,region,titleType,startYear,genres
4507,tt0001338,Ночь в мае,A Night in May,RU,movie,1910,Drama
4880,tt0001475,Роковая любовь,Amor fatal,SUHH,movie,1911,"Drama,Romance"
5977,tt0001892,Чёрный сон,Den sorte drøm,RU,movie,1911,Drama
6183,tt0001964,Предательница,The Traitress,RU,movie,1911,Drama
6329,tt0002022,Анфиса,Anfisa,RU,movie,1912,Drama
...,...,...,...,...,...,...,...
50135454,tt9815160,Радуга над островом Цзиньмынь,Kinmontô ni kakeru hashi,SUHH,movie,1962,"Action,War"
50219920,tt9856080,Мезозойская история,Mezozoy Ahvalati,RU,movie,1976,"Drama,History"
50297575,tt9890850,Сказание о любви,Leyli va Macnun,RU,movie,1961,"Drama,History,Romance"
50327050,tt9905224,Стрелок,The Shooter,SUHH,movie,1975,"Adventure,Drama"


In [None]:
from imdb import IMDb

# Module-level client from the `imdb` package; get_plot_summary below calls
# its get_movie() method for every lookup.
ia = IMDb()


def get_plot_summary(tconst):
    """Return the first plot summary available for an IMDb title, or None.

    Parameters
    ----------
    tconst : str
        IMDb title identifier such as ``"tt0001338"``. The first two
        characters are stripped and the remainder is converted to ``int``
        before being passed to ``ia.get_movie``.

    Returns
    -------
    object or None
        The first element of the movie's ``"plot"`` entry, or ``None`` when
        the entry is missing/empty or when the lookup fails for any reason.

    Notes
    -----
    The lookup is deliberately best-effort so one bad title cannot abort a
    whole ``apply`` run, but failures are logged instead of being discarded
    silently.
    """
    import logging

    try:
        movie = ia.get_movie(int(str(tconst)[2:]))
        plot = movie.get("plot")
        # An absent or empty plot list means "no summary", not an error.
        return plot[0] if plot else None
    except Exception as exc:
        logging.getLogger(__name__).warning(
            "Plot lookup failed for %r: %s", tconst, exc
        )
        return None

In [6]:
import time
from tqdm import tqdm

tqdm.pandas()  # adds Series.progress_apply


def _get_plot_summary_throttled(tconst):
    """Fetch one plot summary, then pause 0.1 s so requests are spaced out."""
    plot = get_plot_summary(tconst)
    time.sleep(0.1)
    return plot


# Work on a small sample. The fixed seed makes the sample reproducible across
# kernel restarts, and min() avoids a ValueError if fewer than 20 rows exist.
soviet_movies = soviet_movies.sample(n=min(20, len(soviet_movies)), random_state=42)

soviet_movies["plot"] = soviet_movies["tconst"].progress_apply(
    _get_plot_summary_throttled
)

100%|██████████| 20/20 [01:02<00:00,  3.12s/it]


In [7]:
# Persist the sampled movies (including the fetched "plot" column) as CSV.
# `index` is left at its default, so the row index is written as the first,
# unnamed column.
soviet_movies.to_csv("data/soviet_movies.csv")