In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

## Netflix

In [2]:
# Read netflix.csv into a DataFrame.
# To inspect the schema, run netflix_df.info() or netflix_df.columns.tolist().
csv_file = "netflix.csv"
netflix_df = pd.read_csv(filepath_or_buffer=csv_file)

In [3]:
# Keep only rows that are movies and whose "Country Availability" text mentions
# "United States"; rows with a missing availability value are excluded (na=False).
netflix_is_movie = netflix_df["Series or Movie"] == "Movie"
netflix_in_us = netflix_df["Country Availability"].str.contains("United States", na=False)
# .copy() turns the filtered selection into an independent frame, so the column
# assignments made on netflix_df in the following cells do not raise
# SettingWithCopyWarning or risk writing to a temporary slice.
netflix_df = netflix_df.loc[netflix_is_movie & netflix_in_us].copy()

In [4]:
# Parse "Release Date" (day, abbreviated month name, year) and keep only the year.
# Rows without a date become year 0 so the column can be stored as integers.
netflix_release_dates = pd.to_datetime(netflix_df["Release Date"], format="%d %b %Y")
netflix_release_years = netflix_release_dates.dt.year
netflix_df["Release Year"] = netflix_release_years.fillna(0).astype(int)

In [5]:
# Reduce the comma-separated "Genre" text to the first genre listed.
netflix_genre_parts = netflix_df["Genre"].str.split(",")
netflix_df["Genre"] = netflix_genre_parts.str.get(0)

In [6]:
# Netflix movie table: title, release year and primary genre, ordered by title
# with a fresh 0..n-1 index. Remaining missing values are replaced by the string
# "None", and every row is tagged with its source.
netflix_movies = (
    netflix_df.loc[:, ["Title", "Release Year", "Genre"]]
    .sort_values("Title")
    .reset_index(drop=True)
    .fillna("None")
    .assign(Source="Netflix")
)

In [7]:
# Netflix ratings table: title plus the three score columns, ordered by title.
# Missing scores are replaced by the string "None", and every "... Score" column
# label is renamed to "... Rating".
netflix_ratings = (
    netflix_df.loc[:, ["Title", "IMDb Score", "Rotten Tomatoes Score", "Metacritic Score"]]
    .sort_values("Title")
    .reset_index(drop=True)
    .fillna("None")
    .rename(columns=lambda label: label.replace("Score", "Rating"))
)

In [8]:
# netflix_movies.to_csv("output_data/netflix_movies.csv", index_label="Movie_ID", encoding="utf-8-sig")

## Prime Video

In [9]:
# Read prime_video.csv into a DataFrame.
# To inspect the schema, run prime_video_df.info() or prime_video_df.columns.tolist().
csv_file = "prime_video.csv"
prime_video_df = pd.read_csv(filepath_or_buffer=csv_file)

In [10]:
# Restrict the Prime Video data to rows whose "Language" is exactly "English".
prime_is_english = prime_video_df["Language"].eq("English")
prime_video_df = prime_video_df[prime_is_english]

In [11]:
# Prime Video movie table: title and release year, ordered by title with a fresh
# 0..n-1 index, and every row tagged with its source.
prime_video_movies = (
    prime_video_df[["Movie Name", "Year of Release"]]
    .sort_values(by=["Movie Name"])
    .reset_index(drop=True)
    .rename(columns={"Movie Name": "Title", "Year of Release": "Release Year"})
)
# Parse the year with pd.to_numeric(errors="coerce") instead of a string replace:
# the "None" placeholder, genuinely missing values, and a column that read_csv
# already parsed as numeric (the .str accessor would fail on it) all end up as
# integers, with 0 standing in for "unknown" like the other sources here.
prime_video_movies["Release Year"] = (
    pd.to_numeric(prime_video_movies["Release Year"], errors="coerce")
    .fillna(0)
    .astype(int)
)
prime_video_movies["Source"] = "Prime Video"

In [12]:
# Prime Video ratings table: title and IMDb rating, ordered by title with a
# fresh 0..n-1 index.
prime_video_ratings = (
    prime_video_df.loc[:, ["Movie Name", "IMDb Rating"]]
    .sort_values("Movie Name")
    .reset_index(drop=True)
    .rename(columns={"Movie Name": "Title"})
)

## IMDb

In [13]:
# Read imdb.csv into a DataFrame, forcing the "year" column to be read as text
# (it is validated and converted to integers in the following cells).
csv_file = "imdb.csv"
imdb_df = pd.read_csv(filepath_or_buffer=csv_file, dtype={"year": str})

In [14]:
# Keep only rows whose "year" text is exactly four digits, then renumber the index.
# A bare length-4 check would also accept four-character non-numeric text
# (e.g. "None"), which would make the integer cast in the next cell raise.
# Rows with a missing year are dropped (na=False).
imdb_has_valid_year = imdb_df["year"].str.fullmatch(r"[0-9]{4}", na=False)
imdb_df = imdb_df[imdb_has_valid_year].reset_index(drop=True)

In [15]:
# Integer version of the textual "year" column, stored as "Release Year".
imdb_year_text = imdb_df["year"]
imdb_df["Release Year"] = imdb_year_text.astype(int)

In [16]:
# Keep only rows whose "country" text mentions "USA"; rows with a missing
# country are excluded (na=False).
imdb_is_usa = imdb_df["country"].str.contains("USA", na=False)
# .copy() turns the filtered selection into an independent frame, so the "Genre"
# column assigned on imdb_df in the next cell does not raise
# SettingWithCopyWarning or risk writing to a temporary slice.
imdb_df = imdb_df.loc[imdb_is_usa].copy()

In [17]:
# Derive "Genre" as the first entry of the comma-separated "genre" text.
imdb_genre_parts = imdb_df["genre"].str.split(",")
imdb_df["Genre"] = imdb_genre_parts.str.get(0)

In [18]:
# IMDb movie table: original title, release year, primary genre and duration,
# ordered by title with a fresh 0..n-1 index and column names aligned with the
# other sources.
imdb_movies = (
    imdb_df.loc[:, ["original_title", "Release Year", "Genre", "duration"]]
    .sort_values("original_title")
    .reset_index(drop=True)
    .rename(columns={"original_title": "Title", "duration": "Duration"})
)

In [19]:
# IMDb ratings table: original title and average vote, ordered by title with a
# fresh 0..n-1 index; the vote column is exposed as "IMDb Rating".
imdb_ratings = (
    imdb_df.loc[:, ["original_title", "avg_vote"]]
    .sort_values("original_title")
    .reset_index(drop=True)
    .rename(columns={"original_title": "Title", "avg_vote": "IMDb Rating"})
)

## Rotten Tomatoes

In [20]:
# Read rotten_tomatoes.csv into a DataFrame.
csv_file = "rotten_tomatoes.csv"
tomatoes_df = pd.read_csv(filepath_or_buffer=csv_file)

In [21]:
# Parse "original_release_date" (YYYY-MM-DD) and keep only the year.
# Rows without a date become year 0 so the column can be stored as integers.
tomatoes_release_dates = pd.to_datetime(tomatoes_df["original_release_date"], format="%Y-%m-%d")
tomatoes_release_years = tomatoes_release_dates.dt.year
tomatoes_df["Release Year"] = tomatoes_release_years.fillna(0).astype(int)

In [22]:
# Primary genre: the first comma-separated entry of "genres", cut at any " &".
tomatoes_first_genre = tomatoes_df["genres"].str.split(",").str.get(0)
tomatoes_df["Genre"] = tomatoes_first_genre.str.split(" &").str.get(0)

# Rotten Tomatoes movie table: title, release year, primary genre and runtime,
# ordered by title with a fresh 0..n-1 index. Missing values are replaced by the
# string "None", "Science Fiction" is relabelled "Sci-Fi", and column names are
# aligned with the other sources.
tomatoes_movies = (
    tomatoes_df.loc[:, ["movie_title", "Release Year", "Genre", "runtime"]]
    .sort_values("movie_title")
    .reset_index(drop=True)
    .fillna("None")
    .assign(Genre=lambda frame: frame["Genre"].str.replace("Science Fiction", "Sci-Fi"))
    .rename(columns={"movie_title": "Title", "runtime": "Duration"})
)

In [23]:
# Rotten Tomatoes ratings table: title plus critic and audience ratings, ordered
# by title with a fresh 0..n-1 index. Missing ratings become 0 and both rating
# columns are stored as integers.
rt_rating_columns = ["RT Critic Rating", "RT Audience Rating"]
tomatoes_ratings = (
    tomatoes_df.loc[:, ["movie_title", "tomatometer_rating", "audience_rating"]]
    .sort_values("movie_title")
    .reset_index(drop=True)
    .rename(
        columns={
            "movie_title": "Title",
            "tomatometer_rating": "RT Critic Rating",
            "audience_rating": "RT Audience Rating",
        }
    )
    .fillna(dict.fromkeys(rt_rating_columns, 0))
    .astype(dict.fromkeys(rt_rating_columns, int))
)

In [24]:
# Sorted array of the distinct primary genres in the IMDb movie table.
imdb_genres = np.sort(pd.unique(imdb_movies["Genre"]))
imdb_genres

array(['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir',
       'History', 'Horror', 'Music', 'Musical', 'Mystery', 'Romance',
       'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western'], dtype=object)

In [25]:
# Sorted array of the distinct primary genres in the Netflix movie table.
netflix_genres = np.sort(pd.unique(netflix_movies["Genre"]))
netflix_genres

array(['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir',
       'History', 'Horror', 'Music', 'Musical', 'Mystery', 'None',
       'Reality-TV', 'Romance', 'Sci-Fi', 'Short', 'Sport', 'Thriller',
       'Western'], dtype=object)

In [26]:
# Sorted array of the distinct primary genres in the Rotten Tomatoes movie table.
tomatoes_genres = np.sort(pd.unique(tomatoes_movies["Genre"]))
tomatoes_genres

array(['Action', 'Animation', 'Art House', 'Classics', 'Comedy',
       'Cult Movies', 'Documentary', 'Drama', 'Horror', 'Kids', 'Musical',
       'Mystery', 'None', 'Romance', 'Sci-Fi', 'Special Interest',
       'Television', 'Western'], dtype=object)