We start by enriching our dataset: we add features to movies that already exist in the [CMU movie dataset](http://www.cs.cmu.edu/~ark/personas/), and we also add further movies from the [TMDB dataset](https://www.kaggle.com/datasets/asaniczka/tmdb-movies-dataset-2023-930k-movies/data).

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import json

In [5]:
# Directory (with trailing slash) that holds every raw input file.
PATH = 'cmu/'

# Raw TMDB dump: comma-separated, first row is the header.
tmdb_dataset = pd.read_csv(f'{PATH}TMDB_movie_dataset_v11.csv', sep=',', header=0)

# Raw CMU movie metadata: tab-separated, no header row, so names are set by hand.
movies = pd.read_csv(f'{PATH}movie.metadata.tsv', sep='\t', header=None)
movies.columns = [
    'Wikipedia movie ID',
    'Freebase movie ID',
    'Movie name',
    'Movie release date',
    'Movie box office revenue',
    'Movie runtime',
    'Movie languages',
    'Movie countries',
    'Movie genres',
]
# Discard untitled movies, then lowercase the titles.
movies = movies.dropna(subset=['Movie name']).assign(
    **{'Movie name': lambda frame: frame['Movie name'].str.lower()}
)

# Raw CMU character metadata: tab-separated, no header row.
characters = pd.read_csv(f'{PATH}character.metadata.tsv', sep='\t', header=None)
characters.columns = [
    'Wikipedia movie ID',
    'Freebase movie ID',
    'Movie release date',
    'Character Name',
    'Actor DOB',
    'Actor gender',
    'Actor height',
    'Actor ethnicity',
    'Actor Name',
    'Actor age at movie release',
    'Freebase character map',
    'Freebase character ID',
    'Freebase actor ID',
]

# Raw CMU character-name clusters: two tab-separated columns, no header row.
names = pd.read_csv(f'{PATH}name.clusters.txt', sep='\t', header=None)
names.columns = ['Character Name', 'Freebase actor ID']

# Raw CMU TV Tropes clusters: two tab-separated columns, no header row.
tvtropes = pd.read_csv(f'{PATH}tvtropes.clusters.txt', sep='\t', header=None)
tvtropes.columns = ['Trope', 'Movie name']

In [6]:
# Clean the raw TMDB frame in one pass:
#   1. keep only rows that have no missing value in any column,
#   2. discard the image/URL columns that are not used afterwards,
#   3. rename the title column so it matches the CMU frames.
# NOTE(review): rows are filtered on NaN *before* the unused columns are removed,
# so a row missing only e.g. 'homepage' is discarded too - confirm this is intended.
tmdb = (
    tmdb_dataset
    .dropna()
    .drop(columns=['backdrop_path', 'homepage', 'poster_path'])
    .rename(columns={'original_title': 'Movie name'})
)

In [24]:
def _split_csv_field(value):
    """Split a comma-separated string into a list of whitespace-stripped tokens.

    Non-string values (for example NaN for a missing cell) yield an empty list,
    so one missing cell does not abort the whole enrichment.
    """
    if not isinstance(value, str):
        return []
    return [token.strip() for token in value.split(',')]


def enriche_dataset(tmdb, movies):
    """Enrich the CMU movie list with TMDB data and build exploded views.

    TMDB rows whose lowercased title also appears in the CMU dataset are placed
    first, followed by every remaining TMDB row. Two derived columns are added
    ('net revenue' and 'revenue/budget') and the comma-separated multi-value
    columns are turned into lists. Neither input frame is modified.

    Args:
        tmdb (pd.DataFrame): The TMDB dataset. Must contain the columns
            'Movie name', 'revenue', 'budget', 'spoken_languages', 'genres',
            'production_companies', 'production_countries' and 'keywords'.
        movies (pd.DataFrame): The CMU movie dataset. Only its 'Movie name'
            column is read.

    Returns:
        tuple[pd.DataFrame, ...]: Six frames, in this order:
            the enriched dataset (multi-value columns hold lists), then that
            dataset exploded to one row per spoken language, per genre, per
            production company, per production country and per keyword.
    """
    # Order matters: the exploded frames are returned in this same order.
    list_columns = (
        'spoken_languages',
        'genres',
        'production_companies',
        'production_countries',
        'keywords',
    )

    tmdb_lower = tmdb.copy()
    tmdb_lower['Movie name'] = tmdb_lower['Movie name'].str.lower()
    cmu_titles = movies['Movie name'].str.lower()

    # Movies already present in the CMU dataset come first (the original author
    # noted 2821 such movies), followed by the additional TMDB-only movies.
    in_cmu = tmdb_lower['Movie name'].isin(cmu_titles)
    res = pd.concat([tmdb_lower[in_cmu], tmdb_lower[~in_cmu]])

    # Derive from `res` itself rather than from `tmdb`: the values are the same,
    # but no index alignment is needed, so duplicate index labels cannot break
    # the assignment after the rows have been reordered.
    res['net revenue'] = res['revenue'] - res['budget']
    # A zero budget is replaced by 1 so the ratio falls back to the raw revenue
    # instead of dividing by zero.
    safe_budget = res['budget'].where(res['budget'] != 0, 1)
    res['revenue/budget'] = res['revenue'] / safe_budget

    # Turn each comma-separated multi-value column into a list of tokens.
    for column in list_columns:
        res[column] = res[column].apply(_split_csv_field)

    # One exploded frame per multi-value column (one row per list element).
    exploded_frames = [res.explode(column=column) for column in list_columns]
    return (res, *exploded_frames)

In [26]:
# Build the enriched frame plus one exploded frame per multi-value column.
(
    enriched,
    langugaes_exploded,
    genres_exploded,
    production_companies_exploded,
    production_countries_exploded,
    keywords_exploded,
) = enriche_dataset(tmdb, movies)

# Last expression of the cell: the notebook renders this frame as the cell output.
keywords_exploded


Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,budget,...,overview,popularity,tagline,genres,production_companies,production_countries,spoken_languages,keywords,net revenue,revenue/budget
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,160000000,...,"Cobb, a skilled thief who commits corporate es...",83.952,Your mind is the scene of the crime.,"[Action, Science Fiction, Adventure]","[Legendary Pictures, Syncopy, Warner Bros. Pic...","[United Kingdom, United States of America]","[English, French, Japanese, Swahili]",rescue,665532764,5.15958
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,160000000,...,"Cobb, a skilled thief who commits corporate es...",83.952,Your mind is the scene of the crime.,"[Action, Science Fiction, Adventure]","[Legendary Pictures, Syncopy, Warner Bros. Pic...","[United Kingdom, United States of America]","[English, French, Japanese, Swahili]",mission,665532764,5.15958
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,160000000,...,"Cobb, a skilled thief who commits corporate es...",83.952,Your mind is the scene of the crime.,"[Action, Science Fiction, Adventure]","[Legendary Pictures, Syncopy, Warner Bros. Pic...","[United Kingdom, United States of America]","[English, French, Japanese, Swahili]",dream,665532764,5.15958
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,160000000,...,"Cobb, a skilled thief who commits corporate es...",83.952,Your mind is the scene of the crime.,"[Action, Science Fiction, Adventure]","[Legendary Pictures, Syncopy, Warner Bros. Pic...","[United Kingdom, United States of America]","[English, French, Japanese, Swahili]",airplane,665532764,5.15958
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,160000000,...,"Cobb, a skilled thief who commits corporate es...",83.952,Your mind is the scene of the crime.,"[Action, Science Fiction, Adventure]","[Legendary Pictures, Syncopy, Warner Bros. Pic...","[United Kingdom, United States of America]","[English, French, Japanese, Swahili]",paris,665532764,5.15958
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1128807,730212,Kilometer 147,0.000,0,Released,2018-01-18,0,20,False,0,...,A camel's body lies in the middle of a desert ...,0.600,A Desert Story,[Comedy],[Golden Cinema],[Israel],[Hebrew],desert,0,0.00000
1129211,729492,The Float,0.000,0,Released,2019-05-21,0,35,False,26000,...,A dying man attempts to reconcile his fragment...,0.600,"This is the story of a man, a girl and a grave...","[Drama, Thriller]",[Project Violet Productions],[United Kingdom],[English],darkness,-26000,0.00000
1129211,729492,The Float,0.000,0,Released,2019-05-21,0,35,False,26000,...,A dying man attempts to reconcile his fragment...,0.600,"This is the story of a man, a girl and a grave...","[Drama, Thriller]",[Project Violet Productions],[United Kingdom],[English],breaking the fourth wall,-26000,0.00000
1129211,729492,The Float,0.000,0,Released,2019-05-21,0,35,False,26000,...,A dying man attempts to reconcile his fragment...,0.600,"This is the story of a man, a girl and a grave...","[Drama, Thriller]",[Project Violet Productions],[United Kingdom],[English],family drama,-26000,0.00000
