In [1]:
import pandas as pd
import helpers
import json
import string
import numpy as np
from collections import Counter
from scipy import stats
import statsmodels.api as sm
import tropes

We have two datasets: one from TVTropes and one from CMU. The TVTropes dataset identifies movies only by their titles, so the title is the only key we can use to merge it with the CMU dataset.

In both datasets, we drop every movie whose title is duplicated (after some preprocessing of the title text). In other words, all occurrences of a duplicated title are removed, not just the extra copies, since such a title cannot be matched unambiguously.

In [2]:
# Load the TVTropes and CMU movie tables, merge them, and add the indicator column
# (presumably `is_bob`, the box-office-bomb flag used further down -- confirm in tropes.py).
df_tvtropes = tropes.get_tvtropes_movies("../data/tvtropes_20200302.json")
df_cmu = helpers.get_movies()
df_merged = tropes.add_bob_indicator(
    tropes.merge_cmu_with_tvtropes(df_cmu, df_tvtropes)
)

In [3]:
# Flatten the per-movie trope collections into one list of trope occurrences. The
# "boxofficebomb" marker itself is left out so it never shows up as a feature.
all_tropes = [
    trope
    for movie_tropes in df_merged["trope"].values
    for trope in movie_tropes
    if trope != "boxofficebomb"
]
# Same flattening, restricted to movies whose tropes include "boxofficebomb".
all_tropes_bombs = [
    trope
    for movie_tropes in df_merged["trope"].values
    if "boxofficebomb" in movie_tropes
    for trope in movie_tropes
    if trope != "boxofficebomb"
]

In [4]:
# Raw occurrence counts per trope: across all merged movies, and across the
# "boxofficebomb" movies only.
counts, counts_bombs = Counter(all_tropes), Counter(all_tropes_bombs)

In [5]:
def normalize_counts(counts, normalizing_const=None):
    """Express raw counts as percentages of a normalizing constant.

    Parameters
    ----------
    counts : collections.Counter or dict
        Mapping from key to raw count. It is NOT modified.
    normalizing_const : number, optional
        Denominator applied to every count. When omitted, the sum of all
        values in ``counts`` is used.

    Returns
    -------
    A copy of ``counts`` (same mapping type, same key order) whose values are
    ``count / normalizing_const * 100``.
    """
    if normalizing_const is None:
        normalizing_const = sum(counts.values(), 0.0)
    # Work on a copy: mutating the caller's mapping silently turns its raw counts
    # into percentages, so any later use (or a second call) sees altered data.
    normalized = counts.copy()
    for key, raw in counts.items():
        normalized[key] = raw / normalizing_const * 100
    return normalized

In [6]:
# Rescale the raw counts to "occurrences per 100 movies" (all movies, then bombs only).
# Fresh Counters are built from the raw trope lists so that re-running this cell never
# normalizes values that were already normalized by a previous run.
counts = normalize_counts(Counter(all_tropes), df_merged.shape[0])
counts_bombs = normalize_counts(Counter(all_tropes_bombs), df_merged["is_bob"].sum())

In [7]:
# The 100 most frequent tropes overall, as (trope, value) pairs and as an
# insertion-ordered dict keyed by trope.
res_1 = counts.most_common(100)
res1 = dict(res_1)

In [8]:
# The 100 most frequent tropes among the "boxofficebomb" movies.
# NOTE(review): the name `res2` is rebound to a regression result in a later cell.
res_2 = counts_bombs.most_common(100)
res2 = dict(res_2)

In [9]:
# Build one indicator column per top trope: entry (row, col) is 1.0 when the movie in
# that row of df_merged has the trope, and 0.0 otherwise.
tropes_feat = list(res1)
trope_ind_mat = np.zeros((len(df_merged), len(tropes_feat)))
movie_trope_lists = df_merged["trope"].values

for col, feat in enumerate(tropes_feat):
    # Boolean membership flags are stored as floats by the assignment into the array.
    trope_ind_mat[:, col] = [feat in movie_tropes for movie_tropes in movie_trope_lists]
trope_ind_mat = pd.DataFrame(trope_ind_mat, columns=tropes_feat)

In [10]:
# Logistic regression of the `is_bob` indicator on the trope indicator columns plus
# an intercept. The response is passed positionally (.values), so rows are matched
# by order rather than by index.
trope_design = sm.add_constant(trope_ind_mat)
model = sm.Logit(df_merged["is_bob"].values, trope_design)
res = model.fit()

Optimization terminated successfully.
         Current function value: 0.369822
         Iterations 7


In [11]:
# Coefficients whose p-value clears 0.05/100 (presumably a Bonferroni correction for
# the 100 trope features -- confirm), largest first.
is_significant = res.pvalues < 0.05/100
res.params[is_significant].sort_values(ascending=False)

filmsof20102014             1.620407
filmsof19951999             1.604104
filmsof19901994             1.395992
filmsof20002004             1.338109
filmsof20052009             1.209386
americanfilms               1.055539
starderailingrole           1.038481
filmsofthe1980s             0.983603
creatorkiller               0.925786
filmsdiscussedbymoviebob    0.766262
troubledproduction          0.640401
playingagainsttype          0.498583
thedragon                   0.475164
horrorfilms                -0.924672
const                      -3.653670
dtype: float64

In [17]:
# Plot-summary sentiment table; later cells use its wiki_id, score and label columns.
sentiments_csv = "../data/MovieSummaries/plot_sentiments.csv"
df_sentiments = pd.read_csv(sentiments_csv)

In [26]:
# Inner-join the sentiment rows onto the merged movie table by wiki_id.
# `label` is kept as the raw "POSITIVE"/"NEGATIVE" strings here; it has to be encoded
# numerically (e.g. POSITIVE -> 1, NEGATIVE -> -1) before it can serve as a regressor.
df2 = df_sentiments.merge(df_merged, on="wiki_id")

In [32]:
# Number of movies per sentiment label (non-null wiki_id values in each group).
movies_by_label = df2.groupby("label")
movies_by_label["wiki_id"].count()

label
NEGATIVE    4465
POSITIVE    1646
Name: wiki_id, dtype: int64

In [23]:
# After the merge, `label` holds the strings "POSITIVE"/"NEGATIVE", which OLS cannot
# use as a regressor. Encode it here, on a derived frame, so the cell runs on a fresh
# kernel and df2 itself is left unchanged.
sentiment_features = df2[["score", "label"]].assign(
    label=df2["label"].map({"POSITIVE": 1, "NEGATIVE": -1})
)
# NOTE(review): `res2` was bound to a dict of bomb-trope frequencies in an earlier
# cell; this assignment rebinds it to the OLS results.
model2 = sm.OLS(df2["is_bob"].values, sm.add_constant(sentiment_features))
res2 = model2.fit()
res2.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,0.191
Date:,"Fri, 18 Nov 2022",Prob (F-statistic):,0.826
Time:,18:26:48,Log-Likelihood:,-3061.7
No. Observations:,6111,AIC:,6129.0
Df Residuals:,6108,BIC:,6150.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.1749,0.043,4.077,0.000,0.091,0.259
score,0.0256,0.046,0.554,0.580,-0.065,0.116
label,-0.0009,0.006,-0.156,0.876,-0.012,0.011

0,1,2,3
Omnibus:,1298.123,Durbin-Watson:,2.008
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2330.162
Skew:,1.507,Prob(JB):,0.0
Kurtosis:,3.27,Cond. No.,18.3
