In [92]:
import pandas as pd
import numpy as np
import plotly
import dataset
import statsmodels as sts
import matplotlib.pyplot as plt

# Dataset Loading

Keep only necessary dataset(s)

In [75]:
augmented_cmu = dataset.get_augmented_cmu()
imdb_df = dataset.get_imdb_dataset()

  imdb_title_basics = pd.read_csv('Data/imdb/title.basics.tsv', sep='\t')


### Community Labels

In [76]:
community_dict = {0:'Romantic Movies & Social Commentary (C0)', 1:'Diverse Drama & Action (C1)', 2:'Lighthearted Entertainment (C2)', 3:'Dark & Suspenseful Fiction (C3)', 4:'Historical & Cultural Narratives (C4)'}

movie_community = pd.read_csv('Output/cmu_community_assignment.csv').drop(columns=['Unnamed: 0'])
movie_community['community_label'] = movie_community['hard_assignment'].apply(lambda x: community_dict.get(x))
cmu_community = pd.merge(left=augmented_cmu, right=movie_community, on='movie_wikipedia_id', how='inner')

Get community assignments and labels

### Customize Dataset

To prevent merge errors later, we should always initialize the dataset used in this way. Then, everything below this cell is "guaranteed" to run.

##### Box Office Mojo Revenue Data

Source: scraped

In [77]:
boxofficemojo = dataset.get_boxofficemojo_dataset()
boxofficemojo = boxofficemojo[['imdb_id', 'budget', 'performance_worldwide']]
boxofficemojo.columns = ['imdb_id', 'budget', 'revenue']
boxofficemojo = boxofficemojo[(boxofficemojo['budget'].notnull()) | (boxofficemojo['revenue'].notnull())].reset_index(drop=True)
boxofficemojo = boxofficemojo.replace(np.nan, 0.0)

##### Kaggle Movie Revenue Data

Source: https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset

In [78]:
kaggle_movie = pd.read_csv('Data/kaggle_movies_dataset/movies_metadata.csv')
kaggle_movie = kaggle_movie[['imdb_id', 'budget', 'revenue']]
kaggle_movie['budget'] = kaggle_movie['budget'].apply(lambda x: float(x) if isinstance(x, int) or isinstance(x, float) or x.isdigit() else 0)
kaggle_movie['revenue'] = kaggle_movie['revenue'].apply(lambda x: float(x) if isinstance(x, int) or isinstance(x, float) or x.isdigit() else 0)
kaggle_movie = kaggle_movie[(kaggle_movie['budget'] > 0.0) |(kaggle_movie['revenue'] > 0.0)].reset_index(drop=True)

  kaggle_movie = pd.read_csv('Data/kaggle_movies_dataset/movies_metadata.csv')


##### CMU Revenue Data

Source: CMU Movie Corpus Dataset

In [79]:
revenue_df = augmented_cmu[['imdb_id', 'revenue']]
revenue_df.columns = ['imdb_id', 'revenue_z']
revenue_df = revenue_df[revenue_df['revenue_z'].notnull()]

##### Merging Revenue Data

In [80]:
# @Function: get prioritized revenue -> Box Office Mojo > Kaggle > CMU
def get_revenue(x):
    # Box Office Mojo -> most recent data
    if x.get('revenue_x') != 0.0:
        return x.get('revenue_x')
    # Kaggle Movie Dataset -> updated 2019
    elif x.get('revenue_y') != 0.0:
        return x.get('revenue_y')
    # CMU revenue data -> from 2015
    elif x.get('revenue_z') != 0.0:
        return x.get('revenue_z')
    # Did not find revenue
    return 0.0
    
# @Function: get prioritized revenue -> Box Office Mojo > Kaggle > CMU
def get_budget(x):
    # Box Office Mojo -> most recent data
    if x.get('budget_x') != 0.0:
        return x.get('budget_x')
    # Kaggle Movie Dataset -> updated 2019
    elif x.get('budget_y') != 0.0:
        return x.get('budget_y')
    # Did not find revenue
    return 0.0

In [81]:
xrevenue = pd.merge(left=boxofficemojo, right=kaggle_movie, on='imdb_id', how='outer')
xrevenue = pd.merge(left=xrevenue, right=revenue_df, on='imdb_id', how='outer')
xrevenue = xrevenue.replace(np.nan, 0.0)

xrevenue['revenue'] = xrevenue.apply(get_revenue, axis=1)
xrevenue['budget'] = xrevenue.apply(get_budget, axis=1)

xrevenue = xrevenue[['imdb_id', 'budget', 'revenue']]

In [91]:
revenue_df = cmu_community.copy()[['imdb_id', 'imdb_name', 'hard_assignment', 'runtime','languages','countries','plot_summary','word_count','char_count',
                                  'avg_word_length','sentence_count','lexical_diversity','sentiment_polarity','topic','mood',
                                  'target_audience','temporal_setting','location_setting',
                                  ]]
revenue_df['community_labels'] = revenue_df['hard_assignment'].apply(lambda x: community_dict.get(x))

# MERGE FOR IMDB DATASET
revenue_df = pd.merge(left=revenue_df, right=imdb_df, how='inner', on='imdb_id')

# MERGE FOR REVENUE
revenue_df = pd.merge(left=revenue_df, right=xrevenue, how='inner', on='imdb_id')

# Basic Visualizations

Some basic dataviz to illustrate distributions, label counts, etc. Just to describe the dataset

(7796, 29)

# Plot Feature Analysis

Here, we can hopefully include some of the work done by the russians

In [84]:
# Insert code here

# Map Plotting

Define and describe plot

In [85]:
# Insert code here

# Hypothesis Testing

Define and complete hypothesis testing

In [86]:
# Insert code here