# Results notebook

## Data preprocessing

In [None]:
import pandas as pd

In [None]:
def extract_year(date):
    """Return the year part of a ``YYYY-MM-DD``-style date as a string.

    Parameters
    ----------
    date : str or scalar
        A date such as ``'2001-05-12'``, ``'2001-05'`` or ``'2001'``.

    Returns
    -------
    str or None
        The text before the first ``-``, or ``None`` when ``date`` is missing
        (``None``, ``NaN``, ``NaT``, ``pd.NA``) or otherwise falsy (e.g. ``''``).
    """
    # NaN is truthy, so a plain truthiness test would turn a missing date into
    # the string 'nan'; test for missing values explicitly first.
    if pd.isna(date) or not date:
        return None
    return str(date).split('-')[0]

#### CMU dataset

Link to download the data: https://www.cs.cmu.edu/~ark/personas/

In [None]:
# Load the CMU movie metadata (tab-separated, read without a header row)
# and give the columns readable names.
df_cmu_movie_metadata = pd.read_csv(
    'data/cmu/movie.metadata.tsv',
    sep='\t',
    header=None,
)
df_cmu_movie_metadata.columns = [
    'wikipedia_movie_id',
    'freebase_movie_id',
    'name',
    'release_date',
    'revenue',
    'runtime',
    'languages',
    'countries',
    'genres',
]
df_cmu_movie_metadata.sample(n=5)

In [None]:
# Summarise each column's dtype and non-null count.
df_cmu_movie_metadata.info()

In [None]:
# Record the (rows, columns) shape before any rows are dropped in the cells below.
print("CMU Movie Metadata shape before filtering: ", df_cmu_movie_metadata.shape)

In [None]:
# Shape of the subset of rows that carry a revenue value.
# TODO: this column needs to be refreshed with up-to-date information.
print(
    "Number of movies with revenue information: ",
    df_cmu_movie_metadata.loc[df_cmu_movie_metadata['revenue'].notna()].shape,
)

In [None]:
# Report how many rows have a release date, then drop the rows that lack one.
print(
    "Number of movies with release date information: ",
    df_cmu_movie_metadata.loc[df_cmu_movie_metadata['release_date'].notna()].shape,
)
df_cmu_movie_metadata.dropna(axis=0, subset=['release_date'], inplace=True)

In [None]:
# Derive a release_year column by passing every release_date through extract_year.
df_cmu_movie_metadata['release_year'] = df_cmu_movie_metadata['release_date'].map(extract_year)
df_cmu_movie_metadata.head(n=5)

In [None]:
# Load the CMU plot summaries (tab-separated, read without a header row)
# and name the two columns.
df_cmu_plot_summaries = pd.read_csv(
    'data/cmu/plot_summaries.txt',
    sep='\t',
    header=None,
)
df_cmu_plot_summaries.columns = ['wiki_id', 'summary']
df_cmu_plot_summaries.sample(n=5)

In [None]:
# Load the CMU character metadata (tab-separated, read without a header row)
# and give the columns readable names.
df_cmu_character_metadata = pd.read_csv(
    'data/cmu/character.metadata.tsv',
    sep='\t',
    header=None,
)
df_cmu_character_metadata.columns = [
    'wikipedia_movie_id',
    'freebase_movie_id',
    'release_date',
    'character_name',
    'actor_date_of_birth',
    'actor_gender',
    'actor_height_in_meters',
    'actor_ethnicity_freebase_id',
    'actor_name',
    'actor_age_at_movie_release',
    'freebase_character_actor_map_id',
    'freebase_character_id',
    'freebase_actor_id',
]

In [None]:
# Record the (rows, columns) shape before any rows are dropped in the cells below.
print("CMU Character Metadata shape before filtering: ", df_cmu_character_metadata.shape)

In [None]:
# The frame being counted is the character table (one row per character/actor
# entry), so label the count as character records rather than movies.
# Then drop the rows that have no release date.
print(
    "Number of character records with release date information: ",
    df_cmu_character_metadata[df_cmu_character_metadata['release_date'].notnull()].shape,
)
df_cmu_character_metadata.dropna(subset=['release_date'], inplace=True)

In [None]:
# Derive a release_year column by passing every release_date through extract_year.
df_cmu_character_metadata['release_year'] = df_cmu_character_metadata['release_date'].map(extract_year)
df_cmu_character_metadata.head(n=5)

#### Tropes dataset

Link to download the data: https://drive.google.com/file/d/1Duyz5ATlLHzwMidj15bWVnWHpdE4aRXn/view?usp=sharing

In [None]:
# Load the trope catalogue; the first CSV column is used as the row index.
df_tropes = pd.read_csv(
    'data/tropes/tropes.csv',
    index_col=0,
)
df_tropes.columns = [
    'trope_id',
    'trope',
    'description',
]
df_tropes.sample(n=5)

In [None]:
# Load the film/trope table that already carries IMDb ids (tconst);
# the first CSV column is used as the row index.
df_imdb_movie_tropes = pd.read_csv(
    'data/tropes/film_imdb_match.csv',
    index_col=0,
)
df_imdb_movie_tropes.columns = [
    'title',
    'trope',
    'example',
    'clean_title',
    'tconst',
    'trope_id',
    'title_id',
]
# Remove this table's own 'trope' column.
df_imdb_movie_tropes = df_imdb_movie_tropes.drop(columns='trope')
df_imdb_movie_tropes.head(n=5)

In [None]:
# Attach the trope name and description to every film/trope row, keep the
# columns of interest in a fixed order, and expose tconst as imdb_id.
# The steps are chained and rename() returns a new frame: an in-place rename
# on a column-subset selection can raise SettingWithCopyWarning.
_trope_columns = ['tconst', 'title_id', 'clean_title', 'trope_id', 'trope', 'description', 'example']
df_imdb_movie_tropes = (
    df_imdb_movie_tropes
    .merge(df_tropes, how='inner', on='trope_id')
    [_trope_columns]
    .rename(columns={'tconst': 'imdb_id'})
)
df_imdb_movie_tropes.head()

#### IMDB dataset

Link to download the data: https://developer.imdb.com/non-commercial-datasets/ (download and extract the `title.basics.tsv.gz` and `name.basics.tsv.gz` files into `data/imdb/`)

In [None]:
# Load the IMDb title.basics table (tab-separated; the first row is used as the header).
df_imdb = pd.read_csv('data/imdb/title.basics.tsv', sep='\t')
df_imdb.sample(5)

#### TMDB dataset

Link to download the data: https://www.kaggle.com/datasets/asaniczka/tmdb-movies-dataset-2023-930k-movies?resource=download

In [None]:
# Load the TMDB movie dataset and list its columns.
df_tmdb = pd.read_csv('data/tmdb/TMDB_movie_dataset_v11.csv')
df_tmdb.columns

In [None]:
# Preview the first rows of the TMDB data.
df_tmdb.head()

In [None]:
# Derive a release_year column by passing every release_date through extract_year.
df_tmdb['release_year'] = df_tmdb['release_date'].map(extract_year)
df_tmdb.head(n=5)

In [None]:
# Check the TMDB dataset shape (rows, columns) before filtering
df_tmdb.shape

In [None]:
# Clean tmdb dataset before merging it with the cmu dataset

# Keep only released movies. .copy() makes the filtered result an independent
# frame, so later in-place operations on df_tmdb do not raise
# SettingWithCopyWarning.
df_tmdb = df_tmdb[df_tmdb['status'] == 'Released'].copy()
print("Number of released movies in tmdb dataset: ", df_tmdb.shape)

In [None]:
# Drop movies with missing release date.
# Reassign instead of using inplace=True: at this point df_tmdb comes from a
# boolean filter, and an in-place dropna on such a selection can raise
# SettingWithCopyWarning.
df_tmdb = df_tmdb.dropna(subset=['release_date'])
print("Number of movies with release date information: ", df_tmdb.shape)

##### Merge IMDB and Tropes datasets

In [None]:
# Preview the tropes table that carries the matched IMDb ids (imdb_id column)
df_imdb_movie_tropes.head()

In [None]:
# Attach the IMDb title.basics fields to each trope row. The inner join keeps
# only rows whose imdb_id appears in df_imdb as tconst.
df_movie_tropes = df_imdb_movie_tropes.merge(
    df_imdb,
    how='inner',
    left_on='imdb_id',
    right_on='tconst',
)

In [None]:
# Compare the frame sizes before and after the join, framed by 70-dash rules.
print("-" * 70)
print(f"imdb shape: {df_imdb.shape}")
print(f"movie tropes imdb shape: {df_imdb_movie_tropes.shape}")
print(f"movie tropes merged with imdb dataset shape: {df_movie_tropes.shape}")
print("-" * 70)

df_movie_tropes.head(n=5)

In [None]:
# Save the tropes/IMDb merge; index=False keeps the row index out of the CSV.
df_movie_tropes.to_csv('data/movie_tropes.csv', index=False)

#### Merge CMU and TMDB datasets

In [None]:
# List the columns available on the CMU side of the merge.
df_cmu_movie_metadata.columns

In [None]:
# List the columns available on the TMDB side of the merge.
df_tmdb.columns

Merging the CMU and TMDB datasets by movie name and release year

In [None]:
# Join TMDB onto the CMU movies by exact title and release year. TMDB fills in
# missing information such as revenue, which has a lot of missing values in
# the CMU metadata.
df_cmu_movie_metadata_selected = df_cmu_movie_metadata[
    ['wikipedia_movie_id', 'freebase_movie_id', 'name', 'release_year']
]
df_cmu_tmdb = df_tmdb.merge(
    df_cmu_movie_metadata_selected,
    how='inner',
    left_on=['title', 'release_year'],
    right_on=['name', 'release_year'],
)

# Compare the frame sizes before and after the join, framed by 70-dash rules.
print("-" * 70)
print(f"CMU Movie Summary Corpus shape: {df_cmu_movie_metadata_selected.shape}")
print(f"TMDB shape: {df_tmdb.shape}")
print(f"CMU TMDB merged dataframe shape: {df_cmu_tmdb.shape}")
print("-" * 70)

df_cmu_tmdb.head(n=5)

Before saving the data, check that the column names and non-null counts are correct.

In [None]:
# Check the column names of the merged CMU/TMDB frame.
df_cmu_tmdb.columns

In [None]:
# Check dtypes and non-null counts of the merged CMU/TMDB frame.
df_cmu_tmdb.info()

In [None]:
# Remove movies with a missing IMDb id, because the tropes analysis needs it.
# The drop happens in place, so df_cmu_tmdb itself shrinks; .info() is re-run
# to show the counts after the drop.
df_cmu_tmdb.dropna(subset=['imdb_id'], inplace=True)
df_cmu_tmdb.info()

In [None]:
# Save the merged CMU/TMDB table; index=False keeps the row index out of the CSV.
df_cmu_tmdb.to_csv('data/cmu_tmdb.csv', index=False)

##### Merge CMU character, CMU movie and IMDB name.basics datasets

In [None]:
# IMDb dataset files start with a header line. header=0 consumes that line and
# names= replaces it with our snake_case column names; with header=None the
# header line would be loaded as a bogus first data row.
# NOTE(review): IMDb marks missing values as '\N'; they are left as literal
# strings here — consider na_values='\\N' if numeric birth/death years are needed.
df_imdb_name_basics = pd.read_csv(
    'data/imdb/name.basics.tsv',
    sep='\t',
    header=0,
    names=['nconst', 'primary_name', 'birth_year', 'death_year', 'primary_profession', 'known_for_titles'],
)
df_imdb_name_basics.sample(5)

In [None]:
# Show the columns of both CMU frames to see which keys they share.
print(df_cmu_character_metadata.columns)
print(df_cmu_movie_metadata.columns)

# Merge character and movie metadata on movie id
# (both frames also carry release_date, which is not a join key, so pandas
# suffixes it as release_date_x / release_date_y in the result).
df_cmu_movie_character = pd.merge(df_cmu_character_metadata, df_cmu_movie_metadata, on=['wikipedia_movie_id', 'freebase_movie_id', 'release_year'], how='inner')

# Merge the result with name_basics on actor id
# NOTE(review): freebase_actor_id and nconst look like identifiers from two
# different sources (Freebase vs IMDb) — confirm that this left join actually
# matches rows; if it does not, every name.basics column will be NaN.
df_cmu_imdb_name_basics = pd.merge(df_cmu_movie_character, df_imdb_name_basics, left_on='freebase_actor_id', right_on='nconst', how='left')

df_cmu_imdb_name_basics.sample(5)

In [None]:
# Save the character/movie/name.basics merge; index=False keeps the row index out of the CSV.
# NOTE(review): the file name says 'tmdb', but this frame is built from the CMU
# tables and IMDb name.basics — confirm the intended name.
df_cmu_imdb_name_basics.to_csv('data/cmu_tmdb_actor.csv', index=False)

## Exploratory Data Analysis

## Research questions

### 1. What metrics (e.g., low ratings, limited number of ratings, revenue vs budget) best indicate movie failure?

### 2. How do actor demographics and lack of diversity impact audience disengagement and contribute to box office underperformance?

### 3. What role do director-actor collaborations play in a movie’s failure, and are there specific patterns in these partnerships that correlate with unsuccessful films?

### 4. Is thematic consistency in director filmographies a predictor of failure/success?

### 5. How do overused or poorly executed character tropes contribute to a movie’s box office failure?

### 6. How does genre choice influence a movie’s failure, particularly in different cultural contexts?

### 7. How does poor release timing (e.g., season, holiday periods) affect a movie's likelihood of failing?

### 8. How has the thematic content of movie plots evolved, and what themes have historically failed to resonate with audiences?

### 9. How does portraying controversial social issues or outdated themes affect a movie’s acceptance and potential failure across demographics?
