In [29]:
from imdb import Cinemagoer
import json
from time import time
from multiprocessing import Pool
import pandas as pd

# Update ratings

In [2]:
import json
# Load the movie records (a JSON object keyed by movie id) into `movies`.
# The encoding is pinned to UTF-8: without it, open() decodes with the
# platform's locale encoding, which can mis-read non-ASCII text.
with open("movies_with_keywords_embeddings.json", "r", encoding="utf-8") as infile:
    movies = json.load(infile)

In [9]:
# Pair each movie id with its IMDb id stripped of the "tt" marker.
# Records whose imdb_id is None are left out.
imdb_ids = [
    (key, raw_imdb_id.replace("tt", ""))
    for key, raw_imdb_id in ((key, record["imdb_id"]) for key, record in movies.items())
    if raw_imdb_id is not None
]

## With Cinemagoer as proxy (slow)

In [26]:
def update_movie_ratings(movie_tuple):
    """Refresh one movie's rating and vote count through Cinemagoer.

    Relies on two module-level names: ``ia`` (the client whose ``get_movie``
    is called) and ``movies`` (the dict of movie records keyed by movie id).

    Args:
        movie_tuple: ``(movie_id, imdb_id)`` pair; ``imdb_id`` is the value
            handed to ``ia.get_movie``.

    Returns:
        ``(movie_id, new_rating, new_vote_count)`` when the lookup succeeded,
        ``None`` when it failed. The values are returned because, when this
        function runs in a ``multiprocessing`` worker, the assignment to
        ``movies`` below only changes that worker's copy of the dict; the
        parent process can only see the update through the return value.
    """
    try:
        movie_id, imdb_id = movie_tuple
        print(f"before {movies[movie_id]['vote_average']}")
        movie = ia.get_movie(imdb_id)
        new_rating = movie.get("rating")
        print(f"after {new_rating}")
        new_vote_count = movie.get("votes")
        movies[movie_id]["vote_average"] = new_rating
        movies[movie_id]["vote_count"] = new_vote_count
    except Exception as exc:
        # Best effort: one failed lookup must not abort the whole batch, but
        # it should be visible. `Exception` (not a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        print(f"failed to update {movie_tuple!r}: {exc!r}")
        return None
    return movie_id, new_rating, new_vote_count

In [29]:
ia = Cinemagoer()
setup_start_time = time()
# Pool workers are separate processes: anything they assign into `movies`
# stays in the worker and never reaches this process. Only the values the
# worker function returns come back, via `results`.
with Pool() as pool:
    results = pool.map(update_movie_ratings, imdb_ids)
setup_end_time = time()
print(f"setup latency = {setup_end_time - setup_start_time:.4f}s")

# Apply any (movie_id, rating, votes) updates returned by the workers to the
# parent's `movies`. Entries that are None carry no update and are skipped.
for result in results:
    if result is None:
        continue
    updated_movie_id, updated_rating, updated_votes = result
    movies[updated_movie_id]["vote_average"] = updated_rating
    movies[updated_movie_id]["vote_count"] = updated_votes

before 7.8
after 8.2
setup latency = 5.1764s


In [10]:
# Number of (movie_id, imdb_id) pairs collected for the Cinemagoer lookup.
len(imdb_ids)

44476

## With IMDB dataset

In [12]:
# IMDb id of every movie record, in dict order (entries may be None).
all_ids = list(map(lambda record: record["imdb_id"], movies.values()))

In [40]:
# Tab-separated ratings table; data from https://datasets.imdbws.com/
ratings_source = "../../data.tsv"
ratings_df = pd.read_csv(ratings_source, delimiter="\t")

In [41]:
# Row count of the ratings table.
len(ratings_df)

1295778

In [45]:
# Summary statistics of the averageRating column.
ratings_df.averageRating.describe()

count    1.295778e+06
mean     6.949702e+00
std      1.384102e+00
min      1.000000e+00
25%      6.200000e+00
50%      7.100000e+00
75%      7.900000e+00
max      1.000000e+01
Name: averageRating, dtype: float64

In [19]:
# Keep only the rating rows whose tconst appears among the movies' IMDb ids.
tconst_in_movies = ratings_df["tconst"].isin(all_ids)
ratings_df = ratings_df.loc[tconst_in_movies]

In [20]:
# Row count of ratings_df (after the tconst filter when cells run in order).
len(ratings_df)

44414

In [25]:
# Re-key the ratings rows by tconst:
#   {tconst: {"rating": averageRating, "votes": numVotes}}
ratings2dict = ratings_df.to_dict(orient="index")
ratings_dict = {
    row["tconst"]: {"rating": row["averageRating"], "votes": row["numVotes"]}
    for row in ratings2dict.values()
}

In [27]:
# Build the reverse lookup: IMDb id -> movie id.
# Raises ValueError if a record has no "imdb_id" entry or if two records
# share the same IMDb id.
imdb_id_to_movies_id = {}
# Movie ids deliberately excluded from the mapping.
# NOTE(review): presumably these collide with another record's IMDb id; confirm.
skip_ids = ["219160", "33753", "36955"]
for movie_id, movie_data in movies.items():
    if movie_id in skip_ids:
        continue
    try:
        imdb_id = movie_data["imdb_id"]
    except (KeyError, TypeError) as exc:
        # Only lookup failures are translated; a bare except would also
        # swallow KeyboardInterrupt and report it as a data problem.
        id_1 = movie_id  # left in the namespace for inspection after the error
        raise ValueError(f"not imdb id for {movie_id}") from exc
    if imdb_id in imdb_id_to_movies_id:
        # Expose both colliding movie ids for inspection after the error.
        id_1 = imdb_id_to_movies_id[imdb_id]
        id_2 = movie_id
        raise ValueError(f"alredy defined {imdb_id}: defined for {imdb_id_to_movies_id[imdb_id]} and {movie_id}")
    imdb_id_to_movies_id[imdb_id] = movie_id

In [35]:
# Copy rating and vote count from the ratings dataset into `movies`, but only
# when the record has no vote count yet or the dataset has more votes.
for imdb_id, rating_dict in ratings_dict.items():
    # An IMDb id with no entry in the mapping cannot be attributed to a
    # record; skip it instead of aborting the whole pass with a KeyError.
    movie_id = imdb_id_to_movies_id.get(imdb_id)
    if movie_id is None:
        continue
    movie_record = movies[movie_id]
    current_votes = movie_record.get("vote_count")  # None if missing or null
    if current_votes is None or current_votes < rating_dict["votes"]:
        movie_record["vote_average"] = rating_dict["rating"]
        movie_record["vote_count"] = rating_dict["votes"]

In [46]:
# Show the first movie record in the dict's iteration order.
movies[next(iter(movies))]

{'adult': 'False',
 'belongs_to_collection': "{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg', 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}",
 'budget': '30000000',
 'genres': "[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]",
 'homepage': 'http://toystory.disney.com/toy-story',
 'id': '862',
 'imdb_id': 'tt0114709',
 'original_language': 'en',
 'original_title': 'Toy Story',
 'overview': "Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.",
 'popularity': 21.946943,
 'poster_path': '/rhIRbceoE9lR4veEXuwCC2wARtG.jpg',
 'production_companies': "[{'name': 'Pixar Animation Studios', 'id': 3}]",
 'production_countries': "[{'iso_3166_1': '

In [57]:
# Write the movie records to a dated JSON file.
output_path = "movies_with_keywords_embeddings_31_03_2023.json"
with open(output_path, mode="w") as outfile:
    json.dump(movies, fp=outfile)