We first Start by enriching our Dataset by adding some features to some of the already existing movies in the [CMU movie dataset](http://www.cs.cmu.edu/~ark/personas/), and also adding other movies from the [TMDB dataset](https://www.kaggle.com/datasets/asaniczka/tmdb-movies-dataset-2023-930k-movies/data)

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import statsmodels.formula.api as smf
import json

In [18]:
PATH = 'cmu/'
tmdb_dataset = pd.read_csv(PATH + 'TMDB_movie_dataset_v11.csv', sep=',', header=0)   # Load the raw TMDB dataset


movies = pd.read_csv(PATH + 'movie.metadata.tsv', sep='\t', header=None) # Load the raw cmu movie dataset
movies.columns = ['Wikipedia movie ID', 'Freebase movie ID', 'Movie name', 'Movie release date', 'Movie box office revenue', 'Movie runtime', 'Movie languages', 'Movie countries', 'Movie genres']
movies = movies.dropna(subset=['Movie name'])
movies['Movie name'] = movies['Movie name'].str.lower()

characters = pd.read_csv(PATH + 'character.metadata.tsv', sep='\t', header=None) # Load the raw cmu character dataset
characters.columns = ['Wikipedia movie ID', 'Freebase movie ID','Movie release date', 'Character Name', 'Actor DOB', 'Actor gender', 'Actor height', 'Actor ethnicity', 'Actor Name', 'Actor age at movie release', 'Freebase character map', 'Freebase character ID', 'Freebase actor ID']

names = pd.read_csv(PATH + 'name.clusters.txt', sep='\t', header=None) # Load the raw cmu name dataset
names.columns = ['Character Name', 'Freebase actor ID']

tvtropes = pd.read_csv(PATH + 'tvtropes.clusters.txt', sep='\t', header=None) # Load the raw cmu tvtropes dataset
tvtropes.columns = ['Trope', 'Movie name']

In [19]:
tmdb = tmdb_dataset.dropna()  # Drop rows with NaN values
tmdb = tmdb.drop(columns=['backdrop_path','homepage', 'poster_path']) # Drop columns that are not needed
tmdb = tmdb.rename(columns={'original_title': 'Movie name'}) # Rename column to match other datasets

In [41]:
def set_up_unique_classes(dataset, column):
    dataset = dataset.copy()
    unique_classes = dataset[column].unique()
    unique_classes_dict = {}
    for i, unique_class in enumerate(unique_classes):
        unique_classes_dict[unique_class] = i
    dataset[column + "_unique"] = dataset[column].map(lambda x: unique_classes_dict[x])
    return dataset

In [48]:
def enriche_dataset(tmdb, movies):
    """This function enriches the cmu dataset with the TMDB dataset, adding information about the movies that are in both datasets. and adding other movies that are not in the cmu movie dataset.

    Args:
        tmdb (pd.DataFrame): The TMDB dataset
        movies (pd.DataFrame): The CMU movie dataset

    Returns:
        pd.DataFrame: The enriched dataset
    """
    df1 = tmdb.copy()
    df1['Movie name'] = df1['Movie name'].str.lower()
    df2 = movies['Movie name'].str.lower()
    #Our original dataset has 2821 movies with enriched data from TMDB
    inclueded = df1[df1['Movie name'].isin(df2)]
    #We add other movies to enrich the dataset
    not_inclueded = df1[~df1['Movie name'].isin(df2)]
    #Concatenate both datasets
    res = pd.concat([inclueded, not_inclueded])
    res['net_revenue'] = tmdb['revenue'] - tmdb['budget']
    n_budget = tmdb['budget'].apply(lambda x: 1 if x == 0 else x)
    res['revenue/budget'] = tmdb['revenue'] / n_budget
    
    #Set up a list of ordered languages
    res = set_up_unique_classes(res, 'original_language')    
    #Get the release year
    res['release_date'] = res['release_date'].apply(lambda x: int(x[:4]))
    #Separate the spoken languages into an array
    res['spoken_languages'] = res['spoken_languages'].apply(lambda x: [y.strip() for y in x.split(',')])
    #Add the number of languages
    res['n_languages'] = res['spoken_languages'].apply(lambda x: len(x))
    #Separate the genres into an array
    res['genres'] = res['genres'].apply(lambda x: [y.strip() for y in x.split(',')])
    #Separate the production companies into an array
    res['production_companies'] = res['production_companies'].apply(lambda x: [y.strip() for y in x.split(',')])
    #Separate the production countries into an array
    res['production_countries'] = res['production_countries'].apply(lambda x: [y.strip() for y in x.split(',')])
    #Separate the keywords into an array
    res['keywords'] = res['keywords'].apply(lambda x: [y.strip() for y in x.split(',')])
    #Explode the dataset
    exploded = res.copy()
    langugaes_exploded = exploded.explode(column='spoken_languages')
    genres_exploded = exploded.explode(column='genres')
    production_companies_exploded = exploded.explode(column='production_companies')
    production_countries_exploded = exploded.explode(column='production_countries')
    keywords_exploded = exploded.explode(column='keywords')
    
    return res, langugaes_exploded, genres_exploded, production_companies_exploded, production_countries_exploded, keywords_exploded

In [49]:
enriched, langugaes_exploded, genres_exploded, production_companies_exploded, production_countries_exploded, keywords_exploded = enriche_dataset(tmdb, movies)
enriched


Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,budget,...,tagline,genres,production_companies,production_countries,spoken_languages,keywords,net_revenue,revenue/budget,original_language_unique,n_languages
0,27205,Inception,8.364,34495,Released,2010,825532764,148,False,160000000,...,Your mind is the scene of the crime.,"[Action, Science Fiction, Adventure]","[Legendary Pictures, Syncopy, Warner Bros. Pic...","[United Kingdom, United States of America]","[English, French, Japanese, Swahili]","[rescue, mission, dream, airplane, paris, fran...",665532764,5.159580,0,4
1,157336,Interstellar,8.417,32571,Released,2014,701729206,169,False,165000000,...,Mankind was born on Earth. It was never meant ...,"[Adventure, Drama, Science Fiction]","[Legendary Pictures, Syncopy, Lynda Obst Produ...","[United Kingdom, United States of America]",[English],"[rescue, future, spacecraft, race against time...",536729206,4.252904,0,1
2,155,The Dark Knight,8.512,30619,Released,2008,1004558444,152,False,185000000,...,Welcome to a world without rules.,"[Drama, Action, Crime, Thriller]","[DC Comics, Legendary Pictures, Syncopy, Isobe...","[United Kingdom, United States of America]","[English, Mandarin]","[joker, sadism, chaos, secret identity, crime ...",819558444,5.430046,0,2
3,19995,Avatar,7.573,29815,Released,2009,2923706026,162,False,237000000,...,Enter the world of Pandora.,"[Action, Adventure, Fantasy, Science Fiction]","[Dune Entertainment, Lightstorm Entertainment,...","[United States of America, United Kingdom]","[English, Spanish]","[future, society, culture clash, space travel,...",2686706026,12.336312,0,2
4,24428,The Avengers,7.710,29166,Released,2012,1518815515,143,False,220000000,...,Some assembly required.,"[Science Fiction, Action, Adventure]",[Marvel Studios],[United States of America],"[English, Hindi, Russian]","[new york city, superhero, shield, based on co...",1298815515,6.903707,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1127826,728503,Tribe: The Untold Story of the Making of Vice ...,0.000,0,Released,2020,0,10,False,8000,...,Don't forget the VENGEANCE,"[Comedy, Action, Drama]","[Reel Green Pictures, Manuel Alejandro Films]",[United States of America],[English],"[gun, movie business, writing, satire, tribe, ...",-8000,0.000000,0,1
1127939,728965,Life After Death: Quantum Realms,0.000,0,Released,2017,0,60,False,1000000,...,Soul continues on a quantum level – scientists...,[Documentary],[Reality Entertainment],[United States of America],[English],"[metaphysics, new age, religion, spirtuality]",-1000000,0.000000,0,1
1128221,727782,A Helpless Love Song,0.000,0,Released,2018,0,96,False,0,...,This was supposed to be the last sex of my lif...,"[Drama, Romance]",[THE KLOCKWORX],[Japan],[Japanese],[prostitute],0,0.000000,2,1
1128807,730212,Kilometer 147,0.000,0,Released,2018,0,20,False,0,...,A Desert Story,[Comedy],[Golden Cinema],[Israel],[Hebrew],"[israel, camel, desert]",0,0.000000,38,1


In [24]:
stats.pearsonr(enriched['vote_average'], enriched['popularity'])

PearsonRResult(statistic=0.09417753824545633, pvalue=1.7488885694172428e-20)

In [61]:
stats.pearsonr(enriched['vote_average'], enriched['net_revenue'])

PearsonRResult(statistic=0.13346512275755285, pvalue=1.201312281950169e-39)

In [60]:
stats.pearsonr(enriched['runtime'], enriched['net_revenue'])

PearsonRResult(statistic=0.19954115518406365, pvalue=2.3929095362817205e-87)

In [63]:
stats.pearsonr(enriched['n_languages'], enriched['net_revenue'])

PearsonRResult(statistic=0.10720133254738899, pvalue=4.2608821593460513e-26)

In [65]:
stats.pearsonr(enriched['original_language_unique'], enriched['net_revenue'])

PearsonRResult(statistic=-0.06730680581687601, pvalue=3.529141684360817e-11)

In [40]:
def standardize(df, columns):
    """Standardize a column in a dataframe

    Args:
        df (pd.DataFrame): The dataframe we want to standardize
        columns (array): The columns we want to standardize

    Returns:
        pd.DataFrame: The dataframe with the standardized column
    """
    df = df.copy()
    for column in columns:
        df[column] = (df[column] - df[column].mean()) / df[column].std()
    return df

In [33]:
def log_columns(dataset, columns):
    dataset = dataset.copy()
    for column in columns:
        dataset[column] = dataset[column].apply(lambda x: np.log(x) if x > 0 else 0)
    return dataset

In [37]:
def pre_process_for_reg(tmdb):
    """This function preprocesses the dataset for regression analysis

    Args:
        tmdb (pd.DataFrame): The TMDB dataset
        
    Returns:
        pd.DataFrame: The preprocessed dataset
    """
    tmdb = tmdb.copy()
    to_log = ['n_languages', 'net_revenue', 'runtime', 'release_date', 'budget']
    tmdb = log_columns(tmdb, to_log)
    columns_to_standize =['vote_average', 'net_revenue', 'runtime', 'budget']
    tmdb = standardize(tmdb, columns_to_standize)
    return tmdb

In [53]:
df_tmdb = pre_process_for_reg(enriched)
df_tmdb

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,budget,...,tagline,genres,production_companies,production_countries,spoken_languages,keywords,net_revenue,revenue/budget,original_language_unique,n_languages
0,27205,Inception,1.157451,34495,Released,7.605890,825532764,0.845268,False,1.622237,...,Your mind is the scene of the crime.,"[Action, Science Fiction, Adventure]","[Legendary Pictures, Syncopy, Warner Bros. Pic...","[United Kingdom, United States of America]","[English, French, Japanese, Swahili]","[rescue, mission, dream, airplane, paris, fran...",2.052257,5.159580,0,1.386294
1,157336,Interstellar,1.182773,32571,Released,7.607878,701729206,1.040076,False,1.626174,...,Mankind was born on Earth. It was never meant ...,"[Adventure, Drama, Science Fiction]","[Legendary Pictures, Syncopy, Lynda Obst Produ...","[United Kingdom, United States of America]",[English],"[rescue, future, spacecraft, race against time...",2.024017,4.252904,0,0.000000
2,155,The Dark Knight,1.228162,30619,Released,7.604894,1004558444,0.884422,False,1.640811,...,Welcome to a world without rules.,"[Drama, Action, Crime, Thriller]","[DC Comics, Legendary Pictures, Syncopy, Isobe...","[United Kingdom, United States of America]","[English, Mandarin]","[joker, sadism, chaos, secret identity, crime ...",2.079588,5.430046,0,0.693147
3,19995,Avatar,0.779529,29815,Released,7.605392,2923706026,0.977969,False,1.672501,...,Enter the world of Pandora.,"[Action, Adventure, Fantasy, Science Fiction]","[Dune Entertainment, Lightstorm Entertainment,...","[United States of America, United Kingdom]","[English, Spanish]","[future, society, culture clash, space travel,...",2.235467,12.336312,0,0.693147
4,24428,The Avengers,0.844985,29166,Released,7.606885,1518815515,0.794810,False,1.662978,...,Some assembly required.,"[Science Fiction, Action, Adventure]",[Marvel Studios],[United States of America],"[English, Hindi, Russian]","[new york city, superhero, shield, based on co...",2.140039,6.903707,0,1.098612
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1127826,728503,Tribe: The Untold Story of the Making of Vice ...,-2.838676,0,Released,7.610853,0,-3.110942,False,0.355244,...,Don't forget the VENGEANCE,"[Comedy, Action, Drama]","[Reel Green Pictures, Manuel Alejandro Films]",[United States of America],[English],"[gun, movie business, writing, satire, tribe, ...",-0.615010,0.000000,0,0.000000
1127939,728965,Life After Death: Quantum Realms,-2.838676,0,Released,7.609367,0,-0.480308,False,0.972950,...,Soul continues on a quantum level – scientists...,[Documentary],[Reality Entertainment],[United States of America],[English],"[metaphysics, new age, religion, spirtuality]",-0.615010,0.000000,0,0.000000
1128221,727782,A Helpless Love Song,-2.838676,0,Released,7.609862,0,0.209744,False,-0.794524,...,This was supposed to be the last sex of my lif...,"[Drama, Romance]",[THE KLOCKWORX],[Japan],[Japanese],[prostitute],-0.615010,0.000000,2,0.000000
1128807,730212,Kilometer 147,-2.838676,0,Released,7.609862,0,-2.093274,False,-0.794524,...,A Desert Story,[Comedy],[Golden Cinema],[Israel],[Hebrew],"[israel, camel, desert]",-0.615010,0.000000,38,0.000000


In [56]:
mod = smf.ols(formula="net_revenue ~ budget + vote_average + vote_average+ release_date + n_languages + original_language_unique + adult + n_languages", data=df_tmdb)

print(mod.fit().summary())

                            OLS Regression Results                            
Dep. Variable:            net_revenue   R-squared:                       0.345
Model:                            OLS   Adj. R-squared:                  0.344
Method:                 Least Squares   F-statistic:                     845.8
Date:                Tue, 12 Nov 2024   Prob (F-statistic):               0.00
Time:                        17:29:08   Log-Likelihood:                -11669.
No. Observations:                9662   AIC:                         2.335e+04
Df Residuals:                    9655   BIC:                         2.340e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Intercept               