# Importing data

In [1]:
import csv
import json
import os

import numpy as np
import pandas as pd
import requests as r
from many_requests import ManyRequests

In [2]:
# Input files read by this notebook
path_links = "./Data/links.csv"
path_imdb_movies = "./Data/movie_ids.json"
path_ratings = "./Data/ratings.csv"

# Output files written by this notebook
path_relevant_ratings = "./created/relevant_ratings_comp.csv"
path_movies = "./created/movies.csv"
path_crew_cast = "./created/crew_cast.csv"

# Maximum number of movies for which TMDB data is requested
NMOVIES = 50000

# The TMDB API key is a credential and must not be stored in the notebook.
# It is read from the TMDB_API_KEY environment variable instead.
# NOTE(review): the key that used to be hard-coded here has been published with
# the notebook and should be treated as compromised (revoke / rotate it).
API_KEY = os.environ.get("TMDB_API_KEY", "")
if not API_KEY:
    print("WARNING: TMDB_API_KEY is not set; set it before running the TMDB request cells.")

## Importing movies and movie IDs

### Movies

In [3]:
# Read the line-delimited JSON file and expose its "id" column as "tmdb_id"
df_movies = pd.read_json(path_imdb_movies, lines=True).rename(columns={"id": "tmdb_id"})
df_movies.head()

Unnamed: 0,adult,tmdb_id,original_title,popularity,video
0,False,3924,Blondie,2.425,False
1,False,6124,Der Mann ohne Namen,0.961,False
2,False,8773,L'Amour à vingt ans,2.746,False
3,False,25449,New World Disorder 9: Never Enough,1.562,False
4,False,31975,Sesame Street: Elmo Loves You!,1.4,True


In [4]:
# TMDB id 470358 causes an error during the HTTP requests, so it is excluded.
# A boolean filter is used instead of drop(...index[0]) so that the cell can be
# re-run and does not raise IndexError when the id is not present.
df_movies = df_movies[df_movies["tmdb_id"] != 470358]

### Movie IDs

In [5]:
# Load the table linking movieId, imdbId and tmdbId
df_links = pd.read_csv(path_links)
df_links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [6]:
# Remove rows with missing values, cast tmdbId to integer,
# and keep only the first row for each tmdbId
df_links = (
    df_links
    .dropna()
    .astype({"tmdbId": int})
    .drop_duplicates(subset=["tmdbId"])
)
df_links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862
1,2,113497,8844
2,3,113228,15602
3,4,114885,31357
4,5,113041,11862


In [7]:
# Keep only the links whose tmdbId is also present in df_movies
has_known_tmdb_id = df_links["tmdbId"].isin(df_movies["tmdb_id"])
df_valid_links = df_links[has_known_tmdb_id]
df_valid_links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862
1,2,113497,8844
2,3,113228,15602
3,4,114885,31357
4,5,113041,11862


In [8]:
# First NMOVIES ids of each kind as NumPy arrays; both follow the same row order
tmdb_ids, movie_ids = (
    df_valid_links[column].head(NMOVIES).to_numpy()
    for column in ("tmdbId", "movieId")
)

In [9]:
# Number of TMDB ids that were selected
len(tmdb_ids)

50000

## Importing movie metadata, crew and cast, ratings

### Metadata

In [11]:
%%time

responses_metadata = ManyRequests(n_workers=30, n_connections=30, json=True)(
    method='GET',
    url=[f"https://api.themoviedb.org/3/movie/{ID}?api_key={API_KEY}&language=en-US" for ID in tmdb_ids])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=50000.0), HTML(value='')))

  class ExceptionGroup(BaseExceptionGroup, trio.MultiError):


CPU times: user 6min 6s, sys: 20.6 s, total: 6min 27s
Wall time: 11min 22s


In [12]:
# Extract the fields of interest from every metadata response (one entry per movie)
movie_genres = [[genre["name"] for genre in movie["genres"]]
                for movie in responses_metadata]
movie_release_dates = [movie["release_date"] for movie in responses_metadata]
titles = [movie["original_title"] for movie in responses_metadata]
popularities = [movie["popularity"] for movie in responses_metadata]
duration = [movie["runtime"] for movie in responses_metadata]
studios = [[company["name"] for company in movie["production_companies"]]
           for movie in responses_metadata]

# Assemble the movie table; this replaces the df_movies frame loaded earlier.
# The responses are matched to tmdb_ids / movie_ids by position.
df_movies = pd.DataFrame({
    "tmdb_id": tmdb_ids,
    "movie_id": movie_ids,
    "popularity": popularities,
    "original_title": titles,
    "genres": movie_genres,
    "release_date": movie_release_dates,
    "studios": studios,
    "duration": duration,
})




In [13]:
# Preview the assembled movie table
df_movies.head()

Unnamed: 0,tmdb_id,movie_id,popularity,original_title,genres,release_date,studios,duration
0,862,1,121.584,Toy Story,"[Animation, Adventure, Family, Comedy]",1995-10-30,[Pixar],81
1,8844,2,15.254,Jumanji,"[Adventure, Fantasy, Family]",1995-12-15,"[TriStar Pictures, PolyGram Filmed Entertainme...",104
2,15602,3,12.317,Grumpier Old Men,"[Romance, Comedy]",1995-12-22,"[Warner Bros. Pictures, Lancaster Gate]",101
3,31357,4,14.013,Waiting to Exhale,"[Comedy, Drama, Romance]",1995-12-22,[20th Century Fox],127
4,11862,5,16.076,Father of the Bride Part II,"[Comedy, Family]",1995-12-08,"[Sandollar Productions, Touchstone Pictures]",106


### Crew and cast

In [14]:
%%time

# index = 18565 problem -> no credits

responses_crew_cast = ManyRequests(n_workers=30, n_connections=30, json=True)(
        method='GET',
        url=[f"https://api.themoviedb.org/3/movie/{ID}/credits?api_key={API_KEY}&language=en-US" 
         for ID in tmdb_ids])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=50000.0), HTML(value='')))

CPU times: user 6min 32s, sys: 26.9 s, total: 6min 59s
Wall time: 12min 38s


In [15]:
# Keep a shallow copy of the response list before filtering it.
responses_crew_cast_copy = responses_crew_cast.copy()

# Discard every response that carries no credits (no "cast" / "crew" entry),
# e.g. the one previously removed by position 18565. Filtering on content
# instead of a hard-coded index keeps this cell safe to re-run (a positional
# `del` would remove a valid movie on the second run) and independent of
# NMOVIES and of the order of the responses.
# NOTE(review): assumes a failed lookup comes back either as a non-dict object
# or as a dict without these keys — confirm against the actual payload.
responses_crew_cast = [
    response for response in responses_crew_cast_copy
    if isinstance(response, dict) and "cast" in response and "crew" in response
]

In [16]:
# Person fields copied as-is from each cast / crew entry
keys = ["id", "name", "original_name", "gender", "popularity"]

# One record per (movie, actor) and per (movie, director);
# crew members with any other job are ignored.
attendees = []
for movie_credits in responses_crew_cast:
    attendees.extend(
        {"tmdb_movie_id": movie_credits["id"], "job": "Actor",
         **{key: actor[key] for key in keys}}
        for actor in movie_credits["cast"]
    )
    attendees.extend(
        {"tmdb_movie_id": movie_credits["id"], "job": "Director",
         **{key: crew_member[key] for key in keys}}
        for crew_member in movie_credits["crew"]
        if crew_member["job"] == "Director"
    )

In [17]:
# Build the crew/cast table from the list of attendee records (one row per record)
df_crew_cast = pd.DataFrame(attendees)
df_crew_cast.head()

Unnamed: 0,tmdb_movie_id,job,id,name,original_name,gender,popularity
0,862,Actor,31,Tom Hanks,Tom Hanks,2,93.816
1,862,Actor,12898,Tim Allen,Tim Allen,2,21.325
2,862,Actor,7167,Don Rickles,Don Rickles,2,9.493
3,862,Actor,12899,Jim Varney,Jim Varney,2,12.994
4,862,Actor,12900,Wallace Shawn,Wallace Shawn,2,16.509


### Ratings

In [18]:
movie_ids_str = list(map(str, movie_ids))
# Membership is tested once per ratings row, so use a set (constant-time lookup)
# rather than scanning the whole list for every row.
wanted_movie_ids = set(movie_ids_str)

# Stream the ratings file and keep the first three fields of every row whose
# second field is one of the selected movie ids. Any header row of the input is
# dropped by the same test as long as its second field is not a selected id.
# newline="" is what the csv module expects for files it reads or writes
# (avoids doubled line endings on Windows).
with open(path_ratings, 'r', encoding="utf8", newline="") as inp, \
        open(path_relevant_ratings, 'w', encoding="utf8", newline="") as out:
    writer = csv.writer(out)
    writer.writerow(["user_id", "movie_id", "rating"])
    for row in csv.reader(inp):
        # len() guard: blank lines yield an empty row and would raise IndexError
        if len(row) > 1 and row[1] in wanted_movie_ids:
            writer.writerow(row[:3])

## To csv

In [77]:
# Persist both tables. movies.csv is written without the index column.
df_movies.to_csv(path_movies, index = False)
# crew_cast.csv keeps the DataFrame index as its first column: the csv readers
# in the splitting cells address fields by position (row[1], row[2], row[3])
# and rely on that leading column being present.
df_crew_cast.to_csv(path_crew_cast)

## Checking data

In [39]:
# Reload the filtered ratings file from disk for validation
df_id_rel_ratings = pd.read_csv(path_relevant_ratings)

In [40]:
# Display the reloaded ratings (pandas shows a truncated view of large frames)
df_id_rel_ratings

Unnamed: 0,user_id,movie_id,rating
0,1,307,3.5
1,1,481,3.5
2,1,1091,1.5
3,1,1257,4.5
4,1,1449,4.5
...,...,...,...
27614754,283228,8542,4.5
27614755,283228,8712,4.5
27614756,283228,34405,4.5
27614757,283228,44761,4.5


In [41]:
# Single-column frame holding the movie id of every rating
d = pd.DataFrame({"id": df_id_rel_ratings["movie_id"]})

In [42]:
# Checking that every movie id found in the ratings is one of movie_ids
bool(d["id"].isin(movie_ids).all())

True

In [43]:
# Single-column frame holding the selected movie ids
d2 = pd.DataFrame({"id": movie_ids})

# Selected movies that never appear in the ratings
is_rated = d2["id"].isin(d["id"])
ids_not_rated = d2.loc[~is_rated]
ids_not_rated.shape[0]  # number of movies not rated

3580

## Splitting data

In [44]:
# Split the actor rows of crew_cast.csv into:
#   - actors.csv:     one row per distinct actor id (first occurrence wins)
#   - links_cast.csv: one row per (movie, actor) pair
# Field positions in crew_cast.csv: 0 = DataFrame index, 1 = tmdb_movie_id,
# 2 = job, 3 = person id, 4.. = name, original_name, gender, popularity.
# encoding="utf-8" matches the pandas to_csv default used to write the input;
# without it open() falls back to the locale encoding, which can corrupt or
# reject non-ASCII names. newline="" is what the csv module expects on read too.
with open(path_crew_cast, "r", encoding="utf-8", newline="") as f, \
        open("./created/actors.csv", "w", encoding="utf-8", newline="") as actors, \
        open("./created/links_cast.csv", "w", encoding="utf-8", newline="") as links:
    csv_reader = csv.reader(f)
    next(csv_reader, None)  # skip the header row (tolerates an empty file)

    writer_actors = csv.writer(actors)
    writer_actors.writerow(["actor_id", "name", "original_name", "gender", "popularity"])

    writer_links = csv.writer(links)
    writer_links.writerow(["tmdb_movie_id", "actor_id"])

    ids = set()  # actor ids already written to actors.csv
    for row in csv_reader:
        if row[2] != "Actor":
            continue
        actor_id = int(row[3])
        if actor_id not in ids:
            ids.add(actor_id)
            writer_actors.writerow(row[3:])
        writer_links.writerow([row[1], row[3]])
        

In [45]:
# Split the director rows of crew_cast.csv into:
#   - directors.csv:  one row per distinct director id (first occurrence wins)
#   - links_crew.csv: one row per (movie, director) pair
# Field positions in crew_cast.csv: 0 = DataFrame index, 1 = tmdb_movie_id,
# 2 = job, 3 = person id, 4.. = name, original_name, gender, popularity.
# encoding="utf-8" matches the pandas to_csv default used to write the input;
# without it open() falls back to the locale encoding, which can corrupt or
# reject non-ASCII names. newline="" is what the csv module expects on read too.
with open(path_crew_cast, "r", encoding="utf-8", newline="") as f, \
        open("./created/directors.csv", "w", encoding="utf-8", newline="") as directors, \
        open("./created/links_crew.csv", "w", encoding="utf-8", newline="") as links:
    csv_reader = csv.reader(f)
    next(csv_reader, None)  # skip the header row (tolerates an empty file)

    writer_directors = csv.writer(directors)
    writer_directors.writerow(["director_id", "name", "original_name", "gender", "popularity"])

    writer_links = csv.writer(links)
    writer_links.writerow(["tmdb_movie_id", "director_id"])

    ids = set()  # director ids already written to directors.csv
    for row in csv_reader:
        if row[2] != "Director":
            continue
        director_id = int(row[3])
        if director_id not in ids:
            ids.add(director_id)
            writer_directors.writerow(row[3:])
        writer_links.writerow([row[1], row[3]])
        