In [1]:
import pandas as pd
import numpy as np
import dataset
import statsmodels as sts
import matplotlib.pyplot as plt
import seaborn as sns
import communities
import plots
import json

In [2]:
augmented_cmu = dataset.get_augmented_cmu()
imdb_df = dataset.get_imdb_dataset()
boxofficemojo = dataset.get_boxofficemojo_dataset()
rottentomatoes = dataset.get_rottentomatoes_dataset()
awards_df = dataset.load_dataset('Output/awards.csv')

  imdb_title_basics = pd.read_csv('Data/imdb/title.basics.tsv', sep='\t')


In [3]:
imdb_df = imdb_df.drop(columns=['title_type', 'genres', 'runtime_minutes', 'movie_name', 'release_year'])

We need to use re-labeled llm features (fixed couple of issues). The code for datasets loading is too complex to change it, so let's just merge it with re-labeled llm features.

In [4]:
plot_features = pd.read_csv('plot_features.csv')
llm_columns = ['topic', 'mood', 'target_audience', 'temporal_setting', 'location_setting']

augmented_cmu = pd.merge(
    augmented_cmu.drop(columns=llm_columns), 
    plot_features[['movie_wikipedia_id'] + llm_columns],
    on='movie_wikipedia_id'
)

In [5]:
augmented_cmu[['languages', 'countries', 'genres', 'topic', 'mood', 'target_audience',
       'temporal_setting', 'location_setting']] = augmented_cmu[
            ['languages', 'countries', 'genres', 'topic', 'mood', 'target_audience',
       'temporal_setting', 'location_setting']
    ].map(json.loads, na_action='ignore')

In [6]:
augmented_cmu[['languages', 'countries', 'genres']] = \
    augmented_cmu[['languages', 'countries', 'genres']].map(
        lambda x: [y.lower() for y in x.values()], na_action='ignore')

In [7]:
set(y for x in augmented_cmu['topic'].dropna() for y in x) - \
    set(y for x in augmented_cmu['genres'].dropna() for y in x)

{'conflict', 'historical', 'romance'}

Topics are just genres. Let's merge them into genres

In [8]:
augmented_cmu['genres'] = (augmented_cmu['genres'] + augmented_cmu['topic'])\
    .map(lambda x: list(set(x)), na_action='ignore')
augmented_cmu = augmented_cmu.drop(columns='topic')

In [9]:
augmented_cmu['language'].unique()

array(['en'], dtype=object)

In [10]:
augmented_cmu = augmented_cmu.drop(columns='language')

In [11]:
augmented_cmu = augmented_cmu[
    ~augmented_cmu[['genres', 'mood', 'target_audience', 'temporal_setting', 'location_setting']]\
    .isna().any(axis=1)]

In [12]:
plot_analysis = pd.merge(
    left=augmented_cmu,
    right=imdb_df,
    how='inner', on='imdb_id')
awards_aggregated = awards_df.groupby('imdb_id').agg({'award': list,'category': list,'win': list}).reset_index()

plot_analysis = pd.merge(left=plot_analysis, right=awards_aggregated, how='left', on='imdb_id')
plot_analysis = pd.merge(left=plot_analysis, right=boxofficemojo, how='left', on='imdb_id')

In [13]:
plot_analysis.to_csv('plot_analysis.csv', index=None)

In [14]:
plot_analysis.columns

Index(['movie_wikipedia_id', 'movie_freebase_id', 'movie_name', 'release_year',
       'revenue', 'runtime', 'languages', 'countries', 'genres',
       'plot_summary', 'word_count', 'char_count', 'avg_word_length',
       'sentence_count', 'lexical_diversity', 'sentiment_polarity', 'imdb_id',
       'imdb_name', 'imdb_year', 'mood', 'target_audience', 'temporal_setting',
       'location_setting', 'is_adult', 'avg_rating', 'num_votes', 'award',
       'category', 'win', 'domestic_distributor', 'domestic_opening', 'budget',
       'releases', 'performance_domestic', 'performance_international',
       'performance_worldwide', 'metric_roi', 'percentage_domestic',
       'percentage_international'],
      dtype='object')

In [15]:
augmented_cmu.columns

Index(['movie_wikipedia_id', 'movie_freebase_id', 'movie_name', 'release_year',
       'revenue', 'runtime', 'languages', 'countries', 'genres',
       'plot_summary', 'word_count', 'char_count', 'avg_word_length',
       'sentence_count', 'lexical_diversity', 'sentiment_polarity', 'imdb_id',
       'imdb_name', 'imdb_year', 'mood', 'target_audience', 'temporal_setting',
       'location_setting'],
      dtype='object')

In [16]:
plot_analysis['performance_worldwide'].notna().sum()

6442

# Regression analysis for reviews

In [17]:
plot_analysis['avg_rating'].notna().mean()

1.0

In [18]:
from sklearn.preprocessing import MultiLabelBinarizer

In [19]:
def one_hot_encode_multilabel(df, column_name):
    """
    Applies one-hot encoding to a column with multilabel classes in a DataFrame.

    :param df: pandas DataFrame containing the column to encode.
    :param column_name: string name of the column that contains multilabel classes.
    :return: DataFrame with original column replaced by one-hot encoded columns.
    """
    df = df.reset_index(drop=True)
    mlb = MultiLabelBinarizer()
    
    encoded_data = mlb.fit_transform(df[column_name])
    encoded_df = pd.DataFrame(encoded_data, columns=[f'{column_name}_{class_}' for class_ in mlb.classes_])
    
    df = df.drop(column_name, axis=1).join(encoded_df)
    
    return df

In [20]:
plot_analysis[['release_year', 'runtime', 'countries', 'genres']]

Unnamed: 0,release_year,runtime,countries,genres
0,2001,98.0,[united states of america],"[science fiction, space western, horror, adven..."
1,1987,110.0,[united kingdom],"[psychological thriller, erotic thriller, myst..."
2,1983,106.0,[germany],[drama]
3,2002,86.0,[south africa],"[world cinema, family film, adventure, fantasy]"
4,1997,93.0,[united states of america],"[comedy, comedy-drama, drama, romance, ensembl..."
...,...,...,...,...
18208,2000,106.0,"[france, united states of america]","[drama, parody, comedy, americana]"
18209,1993,107.0,[united states of america],"[psychological thriller, erotic thriller, cour..."
18210,1994,,[india],"[drama, action, crime, comedy]"
18211,2011,120.0,[united states of america],"[drama, science fiction, mystery]"


In [21]:
plot_analysis.columns

Index(['movie_wikipedia_id', 'movie_freebase_id', 'movie_name', 'release_year',
       'revenue', 'runtime', 'languages', 'countries', 'genres',
       'plot_summary', 'word_count', 'char_count', 'avg_word_length',
       'sentence_count', 'lexical_diversity', 'sentiment_polarity', 'imdb_id',
       'imdb_name', 'imdb_year', 'mood', 'target_audience', 'temporal_setting',
       'location_setting', 'is_adult', 'avg_rating', 'num_votes', 'award',
       'category', 'win', 'domestic_distributor', 'domestic_opening', 'budget',
       'releases', 'performance_domestic', 'performance_international',
       'performance_worldwide', 'metric_roi', 'percentage_domestic',
       'percentage_international'],
      dtype='object')

In [22]:
one_hotted = one_hot_encode_multilabel(plot_analysis, 'mood')

In [23]:
import statsmodels.formula.api as smf

In [24]:
one_hotted.columns

Index(['movie_wikipedia_id', 'movie_freebase_id', 'movie_name', 'release_year',
       'revenue', 'runtime', 'languages', 'countries', 'genres',
       'plot_summary', 'word_count', 'char_count', 'avg_word_length',
       'sentence_count', 'lexical_diversity', 'sentiment_polarity', 'imdb_id',
       'imdb_name', 'imdb_year', 'target_audience', 'temporal_setting',
       'location_setting', 'is_adult', 'avg_rating', 'num_votes', 'award',
       'category', 'win', 'domestic_distributor', 'domestic_opening', 'budget',
       'releases', 'performance_domestic', 'performance_international',
       'performance_worldwide', 'metric_roi', 'percentage_domestic',
       'percentage_international', 'mood_dark', 'mood_dramatic',
       'mood_exciting', 'mood_fantastical', 'mood_inspirational',
       'mood_lighthearted', 'mood_romantic'],
      dtype='object')

In [26]:
def fit_regression_model(df, feature_columns, target_feature):
    """
    Fits a regression model using one-hot encoding for categorical features with multilabel classes.
    
    :param df: pandas DataFrame containing the data.
    :param feature_columns: list of feature column names to include in the model.
    :param target_feature: the name of the target feature for regression.
    """
    
    # Create a copy of the DataFrame to avoid modifying the original one
    df_copy = df.copy()
    
    # Process each feature column
    formula_parts = []
    for column in feature_columns:
        # Check if the column is categorical by checking if the data type is object or category
        if df_copy[column].dtype == 'O' or df_copy[column].dtype.name == 'category':
            # Apply one-hot encoding to the column
            df_copy = one_hot_encode_multilabel(df_copy, column)
            # Update formula with one-hot encoded columns
            encoded_columns = [col for col in df_copy.columns if col.startswith(f'{column}_')]
            formula_parts.extend([f'C({col})' for col in encoded_columns])
        else:
            # For numerical columns, just add the column name to the formula
            formula_parts.append(column)
    
    # Construct the formula for the regression model
    formula = f'{target_feature} ~ ' + ' + '.join(formula_parts)
    
    # Fit the regression model
    mod = smf.ols(formula=formula, data=df_copy)
    res = mod.fit()
    
    # Print the summary of the regression model
    print(res.summary())

In [28]:
fit_regression_model(plot_analysis, ['runtime', 'mood'], 'avg_rating')

                            OLS Regression Results                            
Dep. Variable:                revenue   R-squared:                       0.131
Model:                            OLS   Adj. R-squared:                  0.129
Method:                 Least Squares   F-statistic:                     79.73
Date:                Sun, 17 Dec 2023   Prob (F-statistic):          3.36e-123
Time:                        21:41:38   Log-Likelihood:                -85125.
No. Observations:                4253   AIC:                         1.703e+05
Df Residuals:                    4244   BIC:                         1.703e+05
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------
Intercept           