In [1]:
import numpy as np
import pandas as pd

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

from scipy import stats
import matplotlib.pyplot as plt

In [4]:
xls = pd.ExcelFile("rotten_tomatoes_movies.xlsx")

In [7]:
dfs = pd.read_excel(xls, 'shyamalan')[["movie_title", "directors", "original_release_date", "tomatometer_rating", "audience_rating"]]
dft = pd.read_excel(xls, 'tarantino')[["movie_title", "directors", "original_release_date", "tomatometer_rating", "audience_rating"]]

dfs

Unnamed: 0,movie_title,directors,original_release_date,tomatometer_rating,audience_rating
0,The Happening,M. Night Shyamalan,2008-06-11,17,24
1,Wide Awake,M. Night Shyamalan,1998-03-20,44,67
2,After Earth,M. Night Shyamalan,2013-05-31,11,36
3,Glass,M. Night Shyamalan,2019-01-18,37,67
4,Lady in the Water,M. Night Shyamalan,2006-07-21,25,49
5,The Last Airbender,M. Night Shyamalan,2010-07-01,5,30
6,Signs,M. Night Shyamalan,2002-08-02,74,67
7,The Sixth Sense,M. Night Shyamalan,1999-08-06,86,90
8,Split,M. Night Shyamalan,2017-01-20,77,79
9,The Visit,M. Night Shyamalan,2015-09-11,68,51


In [8]:
fig = make_subplots(rows=1, cols=2)
fig.add_trace(go.Histogram(x=dfs["tomatometer_rating"], name = "Shyamalan"), row=1, col=1)
fig.add_trace(go.Histogram(x=dft["tomatometer_rating"], name = "Tarantino"), row=1, col=2)

fig.update_xaxes(title="Tomatometer", row=1, col=1)
fig.update_xaxes(title="Tomatometer", row=1, col=2)

fig.update_layout(title_text="Histogram Comparison of Tomatometer of Shyamalan and Tarantino Films")

fig.show()

In [9]:
fig = make_subplots(rows=1, cols=1)
fig.add_trace(go.Box(x = dfs["tomatometer_rating"], name = "Shyamalan"), row=1, col=1)
fig.add_trace(go.Box(x = dft["tomatometer_rating"], name = "Tarantino"), row=1, col=1)
fig.update_xaxes(title = "Tomatometer")
fig.update_layout(title = "Boxplot Comparison of Ratings of Movies")
fig.show()

In [10]:
print("Shyamalan:")
print("Minimum: " + str(min(dfs["tomatometer_rating"])))
print("Q1: " + str(np.quantile(dfs["tomatometer_rating"], 0.25)))
print("Median: " + str(np.median(dfs["tomatometer_rating"])))
print("Q3: " + str(np.quantile(dfs["tomatometer_rating"], 0.75)))
print("Maximum: " + str(max(dfs["tomatometer_rating"])))
print("IQR: " + str(np.quantile(dfs["tomatometer_rating"], 0.75) - np.quantile(dfs["tomatometer_rating"], 0.25)))
print()
print("Mean: " + str(round(np.mean(dfs["tomatometer_rating"]),2)))
print("Standard Deviation: " + str(round(np.std(dfs["tomatometer_rating"]),2)))
print("Variance: " + str(round(np.std(dfs["tomatometer_rating"])**2,2)))
print()

print("Tarantino:")
print("Minimum: " + str(min(dft["tomatometer_rating"])))
print("Q1: " + str(np.quantile(dft["tomatometer_rating"], 0.25)))
print("Median: " + str(np.median(dft["tomatometer_rating"])))
print("Q3: " + str(np.quantile(dft["tomatometer_rating"], 0.75)))
print("Maximum: " + str(max(dft["tomatometer_rating"])))
print("IQR: " + str(np.quantile(dft["tomatometer_rating"], 0.75) - np.quantile(dft["tomatometer_rating"], 0.25)))
print()
print("Mean: " + str(round(np.mean(dft["tomatometer_rating"]),2)))
print("Standard Deviation: " + str(round(np.std(dft["tomatometer_rating"]),2)))
print("Variance: " + str(round(np.std(dft["tomatometer_rating"])**2,2)))

Shyamalan:
Minimum: 5
Q1: 23.0
Median: 43.5
Q3: 71.0
Maximum: 86
IQR: 48.0

Mean: 46.42
Standard Deviation: 26.9
Variance: 723.74

Tarantino:
Minimum: 13
Q1: 75.0
Median: 85.0
Q3: 87.0
Maximum: 92
IQR: 12.0

Mean: 77.23
Standard Deviation: 20.09
Variance: 403.72


In [31]:
fig = make_subplots(rows=1, cols=3, subplot_titles=("Shyamalan Scatterplot", "Tarantino Scatterplot", "Tarantino w/o Outlier"))
x = dfs["tomatometer_rating"]
y = dfs["audience_rating"]
fig.add_trace(go.Scatter(x = x, y = y, mode = "markers", name = "Shyamalan Scatter"), row=1, col=1)
ress = stats.linregress(x, y)
fig.add_trace(go.Scatter(x = x, y = x * ress.slope + ress.intercept, mode = "lines", name = "Shyamalan LSRL"), row=1, col=1)
print(ress.slope, ress.intercept)

x = dft["tomatometer_rating"]
y = dft["audience_rating"]
fig.add_trace(go.Scatter(x = x, y = y, mode = "markers", name = "Tarantino Scatter"), row=1, col=2)
rest = stats.linregress(x, y)
fig.add_trace(go.Scatter(x = x, y = x * rest.slope + rest.intercept, mode = "lines", name = "Tarantino LSRL"), row=1, col=2)
print(rest.slope, rest.intercept)

x = list(dft["tomatometer_rating"])
y = list(dft["audience_rating"])
x.sort()
y.sort()
x = np.asarray(x[1:])
y = np.asarray(y[1:])
fig.add_trace(go.Scatter(x = x, y = y, mode = "markers", name = "Tarantino w/o Outliers Scatter"), row=1, col=3)
rest = stats.linregress(x, y)
fig.add_trace(go.Scatter(x = x, y = x * rest.slope + rest.intercept, mode = "lines", name = "Tarantino w/o Outliers LSRL"), row=1, col=3)
print(rest.slope, rest.intercept)

fig.update_xaxes(title = "Tomatometer Rating")
fig.update_yaxes(title = "Audience Rating")
fig.update_layout(width = 1000, title = "Tomatometer v Audience Rating Scatterplots")

fig.show()

0.631305232251317 28.530248803001374
0.28885501553614346 59.69150495397784
0.951856210548839 4.475874612175048


In [34]:
fig = make_subplots(rows=1, cols=3, subplot_titles=("Shyamalan Residuals", "Tarantino Residuals", "Tarantino w/o Outlier"))
x = dfs["tomatometer_rating"]
y = dfs["audience_rating"]
ress = stats.linregress(x, y)
fig.add_trace(go.Scatter(x = x, y = y - (x * ress.slope + ress.intercept), mode = "markers", name = "Shyamalan Residuals"), row=1, col=1)
fig.add_trace(go.Scatter(x = x, y = np.zeros(len(x)), mode = "lines", name = "Shyamalan Zero Line"), row=1, col=1)

print(ress.slope, ress.intercept)

x = dft["tomatometer_rating"]
y = dft["audience_rating"]
rest = stats.linregress(x, y)
fig.add_trace(go.Scatter(x = x, y = y - (x * rest.slope + rest.intercept), mode = "markers", name = "Tarantino Residuals"), row=1, col=2)
fig.add_trace(go.Scatter(x = x, y = np.zeros(len(x)), mode = "lines", name = "Tarantino Zero Line"), row=1, col=2)

print(rest.slope, rest.intercept)

x = list(dft["tomatometer_rating"])
y = list(dft["audience_rating"])
x.sort()
y.sort()
x = np.asarray(x[1:])
y = np.asarray(y[1:])
rest = stats.linregress(x, y)
fig.add_trace(go.Scatter(x = x, y = y - (x * rest.slope + rest.intercept), mode = "markers", name = "Tarantino w/o Outliers Residuals"), row=1, col=3)
fig.add_trace(go.Scatter(x = x, y = np.zeros(len(x)), mode = "lines", name = "Tarantino w/o Outliers Zero Line"), row=1, col=3)

print(rest.slope, rest.intercept)

fig.update_xaxes(title = "Tomatometer Rating")
fig.update_yaxes(title = "Audience Rating")
fig.update_layout(width = 1500, title = "Tomatometer v Audience Rating LSRL Residuals")

fig.show()

0.631305232251317 28.530248803001374
0.28885501553614346 59.69150495397784
0.951856210548839 4.475874612175048


In [36]:
x = dfs["tomatometer_rating"]
y = dfs["audience_rating"]
ress = stats.linregress(x, y)
print(ress.slope, ress.intercept)
print(sum(np.square(y - np.mean(y))))
print(sum(np.square(y - (x * ress.slope + ress.intercept))))
print()

x = dft["tomatometer_rating"]
y = dft["audience_rating"]
rest = stats.linregress(x, y)
print(rest.slope, rest.intercept)
print(sum(np.square(y - np.mean(y))))
print(sum(np.square(y - (x * rest.slope + rest.intercept))))
print()

x = list(dft["tomatometer_rating"])
y = list(dft["audience_rating"])
x.sort()
y.sort()
x = np.asarray(x[1:])
y = np.asarray(y[1:])
rest = stats.linregress(x, y)
print(rest.slope, rest.intercept)
print(sum(np.square(y - np.mean(y))))
print(sum(np.square(y - (x * rest.slope + rest.intercept))))

0.631305232251317 28.530248803001374
4623.666666666667
1162.325295771404

0.28885501553614346 59.69150495397784
1006.0
568.0957964472059

0.951856210548839 4.475874612175048
822.9166666666667
117.19460789558144
