This is for research question 3.

What role do key personnel (actors and directors) play in shaping a movie's ending?

Do certain actors or directors have a preference for particular types of endings, and do their choices influence the overall predictability of a movie’s outcome?

In [1]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import ttest_ind


# Location of the input data, relative to the notebook's working directory.
DATA_FOLDER = '../data/'
MOVIE_DATASET = f'{DATA_FOLDER}movies_dataset_final.tsv'

# Read the tab-separated movie table into a DataFrame.
movies = pd.read_csv(MOVIE_DATASET, delimiter='\t')


In [2]:
# Preview the first five rows of the loaded table.
movies.head(n=5)

Unnamed: 0,Movie_ID,Other_Column,Title,Movie release date,Movie runtime,Movie languages,Movie countries,Movie genres,Summary,Score,director,vote_average,revenue,collection,Budget,Production
0,975900,/m/03vyhn,Ghosts of Mars,2001,98.0,English,United States of America,"Thriller, Science Fiction, Horror, Adventure, ...","cre on him, he handcuffs Ballard to her cot an...",0.0,John Carpenter,5.112,14010832.0,,28000000.0,"[{'id': 51312, 'logo_path': None, 'name': 'Ani..."
1,9363483,/m/0285_cd,White Of The Eye,1987,110.0,English,United Kingdom,"Thriller, Erotic thriller, Psychological thriller",sive vest. Mike opens fire on him with a machi...,-0.12,Donald Cammell,5.8,0.0,,0.0,"[{'id': 26932, 'logo_path': None, 'name': ""Mrs..."
2,261236,/m/01mrr1,A Woman in Flames,1983,106.0,German,Germany,Drama,"k out on him, and he gets angry, throws her ag...",0.06,Robert van Ackeren,5.4,0.0,,0.0,"[{'id': 1766, 'logo_path': None, 'name': 'Diet..."
3,6631279,/m/0gffwj,Little city,1997,93.0,English,United States of America,"Romantic comedy, Ensemble, Comedy-drama, Drama...",a soon tires of Anne and breaks off their affa...,0.144444,Hervé Palud,5.978,0.0,,0.0,"[{'id': 311, 'logo_path': '/TNGvZ8zOklntjPP3Ec..."
4,77856,/m/0kcn7,Mary Poppins,1964,139.0,English,United States of America,"Children's/Family, Musical, Fantasy, Comedy, D...","window. In the park with other kite-flyers, Mr...",0.33125,Robert Stevenson,7.6,103100000.0,527439.0,4650000.0,"[{'id': 3166, 'logo_path': '/vyyv4Gy9nPqAZKElP..."


In [3]:
# Keep only movies that have a vote average, a revenue and a director,
# and whose revenue is strictly greater than 1000.
movies_filtered = (
    movies
    .dropna(subset=['vote_average', 'revenue', 'director'])
    .loc[lambda frame: frame['revenue'] > 1000]
)

In [4]:
# How many movies each director has in the filtered table.
director_counts = movies_filtered['director'].value_counts()

# Keep only the directors that have more than 4 movies, and restrict the
# movie table to those directors.
directors = director_counts[director_counts.gt(4)].index
movies_filtered = movies_filtered.loc[movies_filtered['director'].isin(directors)]

# Recount on the restricted table and display the result.
director_counts_filtered = movies_filtered['director'].value_counts()
director_counts_filtered

director
Clint Eastwood        29
Steven Spielberg      26
Joel Schumacher       26
Alfred Hitchcock      24
Ridley Scott          23
                      ..
Walter Salles          5
Roger Spottiswoode     5
Ben Affleck            5
Jake Kasdan            5
Mark Dindal            5
Name: count, Length: 357, dtype: int64

In [5]:
# Mean ending score over each director's movies, ordered from highest to lowest.
director_avg_score = (
    movies_filtered
    .groupby('director')['Score']
    .mean()
    .sort_values(ascending=False)
)
director_avg_score

director
Bobby Farrelly         0.275245
Adam Shankman          0.265229
Brian Robbins          0.261244
Mark Steven Johnson    0.257407
Frank Coraci           0.254440
                         ...   
Simon West            -0.129147
Fernando Meirelles    -0.158322
Mark Dindal           -0.159286
Jun Fukuda            -0.198542
Kathryn Bigelow       -0.241281
Name: Score, Length: 357, dtype: float64

In [6]:


# Histogram of the per-director mean ending score (nbins=10, log-scaled counts).
fig = px.histogram(
    data_frame=director_avg_score,
    title='Average ending score per director',
    nbins=10,
    log_y=True,
)

# Axis labels, a small gap between the bars, and no legend.
fig.update_layout(
    showlegend=False,
    bargap=0.1,
    yaxis_title='Number of directors',
    xaxis_title='Average ending score',
)

# Render in the notebook, then export a standalone HTML copy of the figure.
fig.show()
fig.write_html("../../assets/img/rq3/director_avg_score.html")

In [7]:
# Spread of the ending score within each director's movies, ordered from
# largest to smallest.
# NOTE(review): despite the "variance" naming, .std() yields the standard
# deviation, not the variance — confirm which statistic is intended.
director_score_variance = (
    movies_filtered
    .groupby('director')['Score']
    .std()
    .sort_values(ascending=False)
)
director_score_variance

director
Mark Dindal             0.422340
Michael Winterbottom    0.396467
David Lean              0.388405
Jon Amiel               0.340166
David O. Russell        0.332615
                          ...   
Roger Spottiswoode      0.051187
Giuseppe Tornatore      0.048734
Breck Eisner            0.048668
Joe Wright              0.044512
Ken Loach               0.030679
Name: Score, Length: 357, dtype: float64

In [8]:


# Histogram of the per-director ending-score spread (nbins=10, log-scaled counts).
# NOTE(review): director_score_variance is computed with .std(), so the plotted
# quantity is a standard deviation even though the labels say "variance".
fig = px.histogram(
    data_frame=director_score_variance,
    title='Ending score variance per director',
    nbins=10,
    log_y=True,
)

# Axis labels, a small gap between the bars, and no legend.
fig.update_layout(
    showlegend=False,
    bargap=0.1,
    yaxis_title='Number of directors',
    xaxis_title='Ending score variance',
)

# Render in the notebook, then export a standalone HTML copy of the figure.
fig.show()
fig.write_html("../../assets/img/rq3/director_score_variance.html")

In [9]:
# Mean revenue over each director's movies, ordered from highest to lowest.
director_avg_revenue = (
    movies_filtered
    .groupby('director')['revenue']
    .mean()
    .sort_values(ascending=False)
)
director_avg_revenue

director
James Cameron         1.323094e+09
Peter Jackson         5.944917e+08
Bill Condon           5.878624e+08
Jon Favreau           5.865651e+08
George Lucas          5.569251e+08
                          ...     
Mark Sandrich         2.635833e+06
William A. Wellman    2.253400e+06
Jun Fukuda            2.248213e+06
Akira Kurosawa        1.362650e+06
Luis Buñuel           1.600054e+05
Name: revenue, Length: 357, dtype: float64

In [10]:


# Gather the per-director statistics in one frame. The three Series are each
# sorted by their own values, and the DataFrame constructor aligns them on the
# director name, so the frame's row order is not the order of any single Series.
director_revenue_summary = pd.DataFrame({
    'Average Ending Score': director_avg_score,
    'Ending Score Variance': director_score_variance,
    'Average Revenue': director_avg_revenue
})

# Derive the colour value and the hover label from the aligned frame itself.
# Plotly Express pairs array-like arguments with rows by position, so passing
# the separately sorted Series / Index would attach the wrong revenue colour
# and the wrong director name to each point.
director_revenue_summary['Log10 Average Revenue'] = np.log10(
    director_revenue_summary['Average Revenue']
)
director_revenue_summary['Director'] = director_revenue_summary.index

# NOTE(review): 'Ending Score Variance' holds values computed with .std(),
# i.e. a standard deviation — confirm the intended statistic / label.
fig = px.scatter(
    director_revenue_summary,
    x='Average Ending Score',
    y='Ending Score Variance',
    color='Log10 Average Revenue',
    hover_name='Director',
    color_continuous_scale='Viridis',
    title='Average ending score vs ending score variance per director<br>Color is average revenue'
)

# The colour axis is in log10 units, so ticks 6..9 are relabelled as 1M..1B.
fig.update_layout(
    xaxis_title='Average Ending Score',
    yaxis_title='Ending Score Variance',
    coloraxis_colorbar=dict(title='Average Revenue', tickvals=[6, 7, 8, 9], ticktext=['1M', '10M', '100M', '1B'])
)

fig.show()

# Export a standalone HTML copy of the figure.
fig.write_html("../../assets/img/rq3/director_avg_score_vs_score_variance.html")

In [11]:
# Mean of the 'vote_average' column over each director's movies, ordered from
# highest to lowest.
director_avg_vote_average = (
    movies_filtered
    .groupby('director')['vote_average']
    .mean()
    .sort_values(ascending=False)
)
director_avg_vote_average

director
Sergio Leone          8.247000
Akira Kurosawa        8.116200
Stanley Kubrick       7.997889
Hayao Miyazaki        7.925727
Charlie Chaplin       7.871429
                        ...   
Brian Levant          5.667556
Andrzej Bartkowiak    5.505200
Brian Robbins         5.496125
Aaron Norris          5.161600
Uwe Boll              3.807000
Name: vote_average, Length: 357, dtype: float64

In [12]:


# Gather the per-director statistics in one frame. The three Series are each
# sorted by their own values, and the DataFrame constructor aligns them on the
# director name, so the frame's row order is not the order of any single Series.
director_vote_summary = pd.DataFrame({
    'Average Ending Score': director_avg_score,
    'Ending Score Variance': director_score_variance,
    'Average Vote Average': director_avg_vote_average
})

# Take the hover label from the aligned frame itself. Plotly Express pairs
# array-like arguments with rows by position, so passing the separately sorted
# director_avg_score.index would attach the wrong director name to each point.
director_vote_summary['Director'] = director_vote_summary.index

# NOTE(review): 'Ending Score Variance' holds values computed with .std(),
# i.e. a standard deviation — confirm the intended statistic / label.
fig = px.scatter(
    director_vote_summary,
    x='Average Ending Score',
    y='Ending Score Variance',
    color='Average Vote Average',
    hover_name='Director',
    color_continuous_scale='Viridis',
    title='Average ending score vs ending score variance per director<br>Color is average vote average'
)

fig.update_layout(
    xaxis_title='Average Ending Score',
    yaxis_title='Ending Score Variance',
    coloraxis_colorbar=dict(title='Average Vote Average')
)

fig.show()

# Export a standalone HTML copy of the figure.
fig.write_html("../../assets/img/rq3/director_avg_score_vs_score_variance_vote_average.html")

In [13]:
# Two-sample t-test: does the mean ending score differ between directors with
# high and low average revenue?

# Split directors at an average revenue of 1e8. Directors exactly on the
# threshold are assigned to the high group so the two groups together cover
# every director instead of silently dropping boundary cases.
high_revenue_directors = director_avg_revenue[director_avg_revenue >= 0.1e9].index
low_revenue_directors = director_avg_revenue[director_avg_revenue < 0.1e9].index

print('Number of high revenue directors:', len(high_revenue_directors))
print('Number of low revenue directors:', len(low_revenue_directors))

# Per-director mean scores of each group; missing values are removed before testing.
high_revenue_director_avg_score = director_avg_score[high_revenue_directors].dropna()
low_revenue_director_avg_score = director_avg_score[low_revenue_directors].dropna()

t_stat, p_value = ttest_ind(high_revenue_director_avg_score, low_revenue_director_avg_score)
print('t stat:', t_stat)
print('p value:', p_value)

# Report the outcome at the 0.05 significance level.
if p_value < 0.05:
    print('The average score is significantly different between directors with high and low revenue')
else:
    print('The average score is not significantly different between directors with high and low revenue')


Number of high revenue directors: 113
Number of low revenue directors: 244
t stat: 2.7974272535329923
p value: 0.005431923389924986
The average score is significantly different between directors with high and low revenue


In [14]:
# Two-sample t-test: does the mean ending score differ between directors with
# high and low average vote average?

# Split directors at an average vote average of 6.5. Directors exactly on the
# threshold are assigned to the high group so the two groups together cover
# every director instead of silently dropping boundary cases.
high_vote_average_directors = director_avg_vote_average[director_avg_vote_average >= 6.5].index
low_vote_average_directors = director_avg_vote_average[director_avg_vote_average < 6.5].index

print('Number of high vote average directors:', len(high_vote_average_directors))
print('Number of low vote average directors:', len(low_vote_average_directors))

# Per-director mean scores of each group; missing values are removed before testing.
high_vote_average_director_avg_score = director_avg_score[high_vote_average_directors].dropna()
low_vote_average_director_avg_score = director_avg_score[low_vote_average_directors].dropna()

t_stat, p_value = ttest_ind(high_vote_average_director_avg_score, low_vote_average_director_avg_score)
print('t stat:', t_stat)
print('p value:', p_value)

# Report the outcome at the 0.05 significance level.
if p_value < 0.05:
    print('The average score is significantly different between directors with high and low vote average')
else:
    print('The average score is not significantly different between directors with high and low vote average')

Number of high vote average directors: 199
Number of low vote average directors: 158
t stat: -2.521378570186223
p value: 0.01212676091338043
The average score is significantly different between directors with high and low vote average


In [15]:
# Two-sample t-test on the per-director score spread, comparing the high- and
# low-revenue director groups defined earlier in the notebook.
# NOTE(review): director_score_variance is computed with .std(), so the values
# compared here are standard deviations.
high_revenue_director_score_variance = director_score_variance.loc[high_revenue_directors].dropna()
low_revenue_director_score_variance = director_score_variance.loc[low_revenue_directors].dropna()

print('Number of high revenue directors:', len(high_revenue_directors))
print('Number of low revenue directors:', len(low_revenue_directors))

revenue_variance_test = ttest_ind(
    high_revenue_director_score_variance,
    low_revenue_director_score_variance,
)
t_stat, p_value = revenue_variance_test
print('t stat:', t_stat)
print('p value:', p_value)

# Report the outcome at the 0.05 significance level.
print(
    'The score variance is significantly different between directors with high and low revenue'
    if p_value < 0.05
    else 'The score variance is not significantly different between directors with high and low revenue'
)

Number of high revenue directors: 113
Number of low revenue directors: 244
t stat: 0.2546688080989438
p value: 0.7991263695144881
The score variance is not significantly different between directors with high and low revenue


In [16]:
# Two-sample t-test on the per-director score spread, comparing the high- and
# low-vote-average director groups defined earlier in the notebook.
# NOTE(review): director_score_variance is computed with .std(), so the values
# compared here are standard deviations.
high_vote_average_director_score_variance = director_score_variance.loc[high_vote_average_directors].dropna()
low_vote_average_director_score_variance = director_score_variance.loc[low_vote_average_directors].dropna()

print('Number of high vote average directors:', len(high_vote_average_directors))
print('Number of low vote average directors:', len(low_vote_average_directors))

vote_average_variance_test = ttest_ind(
    high_vote_average_director_score_variance,
    low_vote_average_director_score_variance,
)
t_stat, p_value = vote_average_variance_test
print('t stat:', t_stat)
print('p value:', p_value)

# Report the outcome at the 0.05 significance level.
print(
    'The score variance is significantly different between directors with high and low vote average'
    if p_value < 0.05
    else 'The score variance is not significantly different between directors with high and low vote average'
)

Number of high vote average directors: 199
Number of low vote average directors: 158
t stat: -0.85848578613168
p value: 0.3912035367557568
The score variance is not significantly different between directors with high and low vote average
