# Importing data

In [1]:
import csv
import datetime
import json
import os

import numpy as np
import pandas as pd
import requests as r
from many_requests import ManyRequests

In [2]:
# Files to be imported
path_links = "./links.csv"
path_imdb_movies = "./movie_ids.json"
path_ratings = "./ratings.csv"

# Files to be created
path_relevant_ratings = "./relevant_ratings_comp.csv"
path_movies = "./movies.csv"
path_crew_cast = "./crew_cast.csv"

# Maximum number of movies whose metadata / credits are requested from TMDB
NMOVIES = 50000

# The TMDB API key is a credential and must not be stored in the notebook.
# Export it before starting Jupyter, e.g. `export TMDB_API_KEY=...`.
# Any key that was previously committed in this file should be revoked.
API_KEY = os.environ.get("TMDB_API_KEY", "")
if not API_KEY:
    print("WARNING: TMDB_API_KEY is not set; requests to the TMDB API will be rejected.")

## Importing movies and movie IDs

### Movies

In [3]:
# Read the line-delimited JSON export of movie ids and expose its "id"
# column under the name "tmdb_id".
df_movies = (
    pd.read_json(path_imdb_movies, lines=True)
    .rename(columns={"id": "tmdb_id"})
)
df_movies.head()

Unnamed: 0,adult,tmdb_id,original_title,popularity,video
0,False,3924,Blondie,2.425,False
1,False,6124,Der Mann ohne Namen,0.961,False
2,False,8773,L'Amour à vingt ans,2.746,False
3,False,25449,New World Disorder 9: Never Enough,1.562,False
4,False,31975,Sesame Street: Elmo Loves You!,1.4,True


### Movie IDs

In [4]:
# Load the movieId / imdbId / tmdbId link table from the path set in the
# configuration cell (instead of repeating the literal file name here).
df_links = pd.read_csv(path_links)
df_links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [5]:
# Remove rows containing nulls, cast tmdbId to integer, and keep only the
# first row for each tmdbId.
df_links = (
    df_links
    .dropna()
    .astype({"tmdbId": int})
    .drop_duplicates(subset=["tmdbId"])
)
df_links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862
1,2,113497,8844
2,3,113228,15602
3,4,114885,31357
4,5,113041,11862


In [6]:
# Keep only the links whose tmdbId also appears in the movie id export.
is_known_tmdb_id = df_links["tmdbId"].isin(df_movies["tmdb_id"])
df_valid_links = df_links[is_known_tmdb_id]
df_valid_links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862
1,2,113497,8844
2,3,113228,15602
3,4,114885,31357
4,5,113041,11862


In [7]:
# First NMOVIES valid links, as two NumPy arrays taken from the same rows.
tmdb_ids, movie_ids = (
    df_valid_links[column].head(NMOVIES).to_numpy()
    for column in ("tmdbId", "movieId")
)

In [8]:
# Show the link row for TMDB id 470358.
df_valid_links.loc[df_valid_links["tmdbId"].eq(470358)]

Unnamed: 0,movieId,imdbId,tmdbId
18775,92783,1541777,470358


## Importing movie metadata, crew and cast, ratings

### Metadata

In [9]:
%%time

responses_metadata = ManyRequests(n_workers=50, n_connections=30, json=True)(
    method='GET',
    url=[f"https://api.themoviedb.org/3/movie/{ID}?api_key={API_KEY}&language=en-US" for ID in tmdb_ids])

  0%|          | 0/50000 [00:00<?, ?it/s]

  class ExceptionGroup(BaseExceptionGroup, trio.MultiError):


CPU times: total: 15min 46s
Wall time: 21min 1s


In [11]:
# Inspect the first metadata response to see which fields are available.
responses_metadata[0]

{'adult': False,
 'backdrop_path': '/3Rfvhy1Nl6sSGJwyjb0QiZzZYlB.jpg',
 'belongs_to_collection': {'id': 10194,
  'name': 'Toy Story Collection',
  'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg',
  'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'},
 'budget': 30000000,
 'genres': [{'id': 16, 'name': 'Animation'},
  {'id': 12, 'name': 'Adventure'},
  {'id': 10751, 'name': 'Family'},
  {'id': 35, 'name': 'Comedy'}],
 'homepage': 'http://toystory.disney.com/toy-story',
 'id': 862,
 'imdb_id': 'tt0114709',
 'original_language': 'en',
 'original_title': 'Toy Story',
 'overview': "Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.",
 'popularity': 113.301,
 'poster_path': '/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg',
 'production_companies': [{'id'

In [24]:
# Manual patch: overwrite the release date of the response at position 20273.
# NOTE(review): presumably this entry came back with a missing or unusable
# release_date - confirm the reason and the chosen date, and prefer locating
# the entry by its TMDB id, since the position depends on NMOVIES and on the
# order of tmdb_ids.
responses_metadata[20273]["release_date"] = "1995-10-30"


In [35]:
# Display the first metadata response again (same expression as the earlier inspection cell).
responses_metadata[0]

    

{'adult': False,
 'backdrop_path': '/3Rfvhy1Nl6sSGJwyjb0QiZzZYlB.jpg',
 'belongs_to_collection': {'id': 10194,
  'name': 'Toy Story Collection',
  'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg',
  'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'},
 'budget': 30000000,
 'genres': [{'id': 16, 'name': 'Animation'},
  {'id': 12, 'name': 'Adventure'},
  {'id': 10751, 'name': 'Family'},
  {'id': 35, 'name': 'Comedy'}],
 'homepage': 'http://toystory.disney.com/toy-story',
 'id': 862,
 'imdb_id': 'tt0114709',
 'original_language': 'en',
 'original_title': 'Toy Story',
 'overview': "Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.",
 'popularity': 113.301,
 'poster_path': '/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg',
 'production_companies': [{'id'

In [19]:
# Sanity check: a "YYYY-MM-DD" string parses with strptime and exposes the
# year. `datetime` is imported in the imports cell at the top of the notebook.
datetime.datetime.strptime('1995-10-30', "%Y-%m-%d").year

1995

In [20]:
# Inspect the genre entries of the first response (a list of {'id', 'name'} dicts).
responses_metadata[0]["genres"]

[{'id': 16, 'name': 'Animation'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 10751, 'name': 'Family'},
 {'id': 35, 'name': 'Comedy'}]

In [31]:
# Flatten the TMDB metadata responses into one list per attribute and build
# the movies table from them. This replaces the df_movies frame loaded from
# the JSON export earlier in the notebook.
# NOTE(review): the lists are combined with tmdb_ids / movie_ids by position,
# which assumes the responses come back in the same order as the requested
# URLs - confirm against the ManyRequests documentation.
def _field(key):
    """Return response[key] for every metadata response, in order."""
    return [response[key] for response in responses_metadata]


def _nested(key, subkey):
    """Return, for every response, the list of item[subkey] over response[key]."""
    return [[item[subkey] for item in response[key]] for response in responses_metadata]


movie_genres = _nested("genres", "name")
release_date = _field("release_date")
titles = _field("original_title")
popularities = _field("popularity")
budget = _field("budget")
revenue = _field("revenue")
vote_average = _field("vote_average")
vote_count = _field("vote_count")
countries = _nested("production_countries", "iso_3166_1")
duration = _field("runtime")
# Production company names are collected but not stored in df_movies below.
studios = _nested("production_companies", "name")

df_movies = pd.DataFrame()
for column_name, column_values in (
    ("tmdb_id", tmdb_ids),
    ("movie_id", movie_ids),
    ("popularity", popularities),
    ("original_title", titles),
    ("genres", movie_genres),
    ("release_date", release_date),
    ("budget", budget),
    ("revenue", revenue),
    ("country", countries),
    ("duration", duration),
    ("vote_count", vote_count),
    ("vote_average", vote_average),
):
    df_movies[column_name] = column_values


In [36]:
# Collect every distinct production country as [ISO 3166-1 code, name],
# keeping the order in which the codes are first encountered.
country_names = {}
for response in responses_metadata:
    for country in response["production_countries"]:
        code = country["iso_3166_1"]
        if code in country_names:
            continue
        country_names[code] = country["name"]

list_countries = [[code, name] for code, name in country_names.items()]
set_countries = set(country_names)

In [32]:
# Preview the first rows of the assembled movies table.
df_movies.head()

Unnamed: 0,tmdb_id,movie_id,popularity,original_title,genres,release_date,budget,revenue,country,duration,vote_count,vote_average
0,862,1,113.301,Toy Story,"[Animation, Adventure, Family, Comedy]",1995-10-30,30000000,373554033,[US],81,16204,7.965
1,8844,2,20.077,Jumanji,"[Adventure, Fantasy, Family]",1995-12-15,65000000,262821940,[US],104,9367,7.235
2,15602,3,13.274,Grumpier Old Men,"[Romance, Comedy]",1995-12-22,25000000,71500000,[US],101,316,6.441
3,31357,4,10.773,Waiting to Exhale,"[Comedy, Drama, Romance]",1995-12-22,16000000,81452156,[US],127,128,6.3
4,11862,5,13.333,Father of the Bride Part II,"[Comedy, Family]",1995-12-08,0,76594107,[US],106,621,6.228


### Crew and cast

In [14]:
%%time

# index = 18565 problem -> no credits

responses_crew_cast = ManyRequests(n_workers=30, n_connections=30, json=True)(
        method='GET',
        url=[f"https://api.themoviedb.org/3/movie/{ID}/credits?api_key={API_KEY}&language=en-US" 
         for ID in tmdb_ids])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=50000.0), HTML(value='')))






CPU times: user 6min 35s, sys: 25.7 s, total: 7min 1s
Wall time: 12min 44s


In [62]:
# Keep a shallow copy of the raw responses before filtering.
responses_crew_cast_copy = responses_crew_cast.copy()

# Drop every response that does not carry credits (no "cast"/"crew" payload).
# Filtering by content instead of deleting a hard-coded position makes the
# cell safe to re-run: the previous `del responses_crew_cast[18565]` removed
# one more (valid) movie on each additional execution.
# The list is updated in place so existing references keep pointing at it.
responses_crew_cast[:] = [
    response
    for response in responses_crew_cast
    if isinstance(response, dict) and "cast" in response and "crew" in response
]

In [64]:
# Build one record per cast member ("Actor") and per crew member whose job is
# "Director"; other crew jobs are ignored. Each record carries the movie id,
# the job label and the person fields listed in `keys`.
keys = ["id", "name", "original_name", "gender", "popularity"]
attendees = []
for credits in responses_crew_cast:
    attendees.extend(
        {
            "tmdb_movie_id": credits["id"],
            "job": "Actor",
            **{key: actor[key] for key in keys},
        }
        for actor in credits["cast"]
    )
    attendees.extend(
        {
            "tmdb_movie_id": credits["id"],
            "job": "Director",
            **{key: crew_member[key] for key in keys},
        }
        for crew_member in credits["crew"]
        if crew_member["job"] == "Director"
    )





In [65]:
# Turn the list of attendee records (one dict per actor/director credit) into
# a DataFrame; the dict keys become the columns.
df_crew_cast = pd.DataFrame(attendees)
df_crew_cast.head()

Unnamed: 0,tmdb_movie_id,job,id,name,original_name,gender,popularity
0,862,Actor,31,Tom Hanks,Tom Hanks,2,99.612
1,862,Actor,12898,Tim Allen,Tim Allen,2,27.736
2,862,Actor,7167,Don Rickles,Don Rickles,2,12.536
3,862,Actor,12899,Jim Varney,Jim Varney,2,15.309
4,862,Actor,12900,Wallace Shawn,Wallace Shawn,2,22.432


### Ratings

In [17]:
# Stream the ratings file and keep only the ratings of the selected movies.
# The input is never loaded fully into memory; the first three fields of each
# matching row are copied under the header user_id, movie_id, rating.
movie_ids_str = list(map(str, movie_ids))
# Set lookup is O(1) per row; testing membership in the list scanned up to
# NMOVIES entries for every rating row.
movie_ids_lookup = set(movie_ids_str)

# newline="" is what the csv module expects for files it reads or writes;
# without it, extra blank lines appear in the output on Windows.
with open(path_ratings, 'r', encoding="utf8", newline="") as inp, \
     open(path_relevant_ratings, 'w', encoding="utf8", newline="") as out:
    writer = csv.writer(out)
    writer.writerow(["user_id", "movie_id", "rating"])
    for row in csv.reader(inp):
        # Rows with fewer than two fields (e.g. blank lines) are skipped
        # instead of raising IndexError. The input header row never matches
        # because its second field is not a numeric id.
        if len(row) > 1 and row[1] in movie_ids_lookup:
            writer.writerow(row[:3])

## To csv

In [34]:
# Write the movies and crew/cast tables to the configured CSV paths.
# to_csv is called without index=False, so the row index is written as the
# first (unnamed) column. The splitting cells further down read crew_cast.csv
# by position (row[1] = tmdb_movie_id, row[2] = job, row[3] = person id) and
# therefore depend on that extra leading column.
df_movies.to_csv(path_movies)
df_crew_cast.to_csv(path_crew_cast)

In [39]:
# Write the distinct production countries as (id, name) rows.
# newline="" prevents blank lines between rows on Windows, and the explicit
# encoding keeps the output independent of the machine's locale.
with open("countries.csv", "w", encoding="utf8", newline="") as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(["id", "name"])
    csv_writer.writerows(list_countries)

## Checking data

In [3]:
# Load only the second column (the movie id) of the filtered ratings as a Series.
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in
# 2.0; squeezing the one-column frame afterwards is the documented replacement.
# NOTE(review): this reads "./relevant_ratings.csv" without a header row,
# whereas the ratings cell writes path_relevant_ratings
# ("./relevant_ratings_comp.csv") with a header - confirm which file is meant.
df_id_rel_ratings = pd.read_csv(
    "./relevant_ratings.csv", usecols=[1], header=None
).squeeze("columns")

In [5]:
# Wrap the rated movie ids in a one-column frame named "id".
d = pd.DataFrame({"id": df_id_rel_ratings})

In [131]:
# Check that every id found in the ratings is one of the selected movie_ids
bool(d["id"].isin(movie_ids).all())

True

In [132]:
# Selected movies that never appear in the filtered ratings.
d2 = pd.DataFrame()
d2["id"] = movie_ids

ids_not_rated = d2[~d2["id"].isin(d["id"])]
# The original cell evaluated len(d3), a name that is not defined anywhere in
# the notebook; the frame computed just above is the one being counted.
len(ids_not_rated) # number of movies not rated

3580

## Splitting data

In [18]:
# Split the actor rows of the crew/cast export into
#   actors.csv     - one row per distinct actor id (first occurrence wins)
#   links_cast.csv - one (movie, actor) row per acting credit
# Columns are read by position: row[1] = tmdb_movie_id, row[2] = job,
# row[3] = person id, row[3:] = id, name, original_name, gender, popularity.
# pandas writes the export as UTF-8, so all files are opened with an explicit
# utf8 encoding; the locale default would garble or reject non-ASCII names on
# some platforms. The input path comes from the configuration cell.
with open(path_crew_cast, "r", encoding="utf8", newline="") as f, \
     open("actors.csv", "w", encoding="utf8", newline="") as actors, \
     open("links_cast.csv", "w", encoding="utf8", newline="") as links:
    csv_reader = csv.reader(f)
    next(csv_reader)  # skip the header row

    writer_actors = csv.writer(actors)
    writer_actors.writerow(["actor_id", "name", "original_name", "gender", "popularity"])

    writer_links = csv.writer(links)
    writer_links.writerow(["tmdb_movie_id", "actor_id"])

    ids = set()
    for row in csv_reader:
        if row[2] != "Actor":
            continue
        actor_id = int(row[3])
        if actor_id not in ids:
            ids.add(actor_id)
            writer_actors.writerow(row[3:])
        writer_links.writerow([row[1], row[3]])
        

In [19]:
# Split the director rows of the crew/cast export into
#   directors.csv  - one row per distinct director id (first occurrence wins)
#   links_crew.csv - one (movie, director) row per directing credit
# Columns are read by position: row[1] = tmdb_movie_id, row[2] = job,
# row[3] = person id, row[3:] = id, name, original_name, gender, popularity.
# pandas writes the export as UTF-8, so all files are opened with an explicit
# utf8 encoding; the locale default would garble or reject non-ASCII names on
# some platforms. The input path comes from the configuration cell.
with open(path_crew_cast, "r", encoding="utf8", newline="") as f, \
     open("directors.csv", "w", encoding="utf8", newline="") as directors, \
     open("links_crew.csv", "w", encoding="utf8", newline="") as links:
    csv_reader = csv.reader(f)
    next(csv_reader)  # skip the header row

    writer_directors = csv.writer(directors)
    writer_directors.writerow(["director_id", "name", "original_name", "gender", "popularity"])

    writer_links = csv.writer(links)
    writer_links.writerow(["tmdb_movie_id", "director_id"])

    ids = set()
    for row in csv_reader:
        if row[2] != "Director":
            continue
        director_id = int(row[3])
        if director_id not in ids:
            ids.add(director_id)
            writer_directors.writerow(row[3:])
        writer_links.writerow([row[1], row[3]])
        

In [43]:
# Write one (movie_id, genre) row per non-empty genre of every movie.
with open("./genres.csv", "w", newline="") as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(['movie_id', 'genre'])
    csv_writer.writerows(
        [movie_id, genre]
        for movie_id, genres in zip(df_movies['movie_id'], df_movies['genres'])
        for genre in genres
        if genre != ""
    )

In [45]:
# Write one (movie_id, country) row per non-empty country code of every movie.
with open("./movie_countries.csv", "w", newline="") as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(['movie_id', 'country'])
    csv_writer.writerows(
        [movie_id, country]
        for movie_id, movie_countries in zip(df_movies['movie_id'], df_movies['country'])
        for country in movie_countries
        if country != ""
    )