In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import datetime as dt
from concurrent.futures import ThreadPoolExecutor, wait
%matplotlib inline
# Default size (in inches) for every matplotlib figure drawn in this notebook.
plt.rcParams["figure.figsize"] = (12, 6)

# Candidate words to test; later cells read the "word" column of this table.
words_df = pd.read_csv("../data/words.csv")
# Movie table; later cells read its "overview" and "rating" columns.
movies = pd.read_csv("../data/cleaned_movie_data.csv")

In [8]:
def run_word_hypoth_test(word, n=1000, data=None, rng=None):
    """Permutation test for a rating difference between overviews with/without ``word``.

    The test statistic is the absolute difference between the mean ``rating``
    of rows whose ``overview`` contains ``word`` and the mean ``rating`` of the
    remaining rows. The null distribution is built by permuting the
    "contains word" labels ``n`` times.

    Parameters
    ----------
    word : str
        Text to look for. It is matched as a literal, case-sensitive substring
        (not as a regular expression).
    n : int, default 1000
        Number of label permutations.
    data : pandas.DataFrame, optional
        Frame with ``overview`` and ``rating`` columns. Defaults to the
        notebook-level ``movies`` frame. The frame is never modified.
    rng : None, int, or numpy.random.Generator, optional
        Seed or generator used for the permutations; pass a seed for
        reproducible p-values. With ``None`` every call gets its own fresh
        generator, so concurrent calls do not share random state.

    Returns
    -------
    tuple
        ``(word, p_value)``. ``p_value`` is the fraction of permutations whose
        statistic is at least as large as the observed one. It is ``nan`` when
        the observed statistic is undefined (the word occurs in no overview or
        in every overview), so such words can never pass a ``<=`` threshold.

    Raises
    ------
    ValueError
        If ``n`` is not positive.
    """
    if n <= 0:
        raise ValueError("n must be a positive number of permutations")

    frame = movies if data is None else data
    rng = np.random.default_rng(rng)

    plots = frame.dropna(subset=["overview"])
    # Kept as a Series so that .mean() skips missing ratings.
    ratings = plots["rating"]
    # Plain boolean array: permuting it can never write back into the frame.
    has_word = (
        plots["overview"]
        .str.contains(word, regex=False, na=False)
        .to_numpy(dtype=bool)
    )

    def abs_mean_gap(mask):
        return abs(ratings[mask].mean() - ratings[~mask].mean())

    actual = abs_mean_gap(has_word)
    if np.isnan(actual):
        # One group is empty (or has no ratings). Comparing against NaN is
        # always False, which would otherwise report a p-value of exactly 0.
        return word, float("nan")

    stats = np.fromiter(
        (abs_mean_gap(rng.permutation(has_word)) for _ in range(n)),
        dtype=float,
        count=n,
    )

    p_value = np.count_nonzero(stats >= actual) / n
    return word, p_value

In [9]:
# Run one permutation test per candidate word on a thread pool. The `with`
# block shuts the executor down (joining its worker threads) once every
# submitted test has finished, instead of leaving the pool alive in the kernel.
with ThreadPoolExecutor() as words_pool:
    words_futures = [
        words_pool.submit(run_word_hypoth_test, word)
        for word in words_df["word"]
    ]
    # Default return_when=ALL_COMPLETED: the "done" set holds every future.
    words_results = wait(words_futures)[0]

# Each future resolves to a (word, p_value) tuple; .result() re-raises any
# exception from the worker. Iterating the submission list (rather than the
# unordered "done" set) keeps the dict in the same order as words_df.
p_values = dict(future.result() for future in words_futures)

In [11]:
# Keep only the words whose permutation p-value is at most 0.05.
# Note: the threshold is applied per word; no multiple-comparison correction
# is performed in this cell.
filtered_p_values = {
    word: p_val for word, p_val in p_values.items() if p_val <= .05
}

# Two-column table (word, p_value) of the surviving words.
signif_words = pd.DataFrame(
    {
        "word": list(filtered_p_values),
        "p_value": list(filtered_p_values.values()),
    }
)
print(signif_words.shape)
signif_words.head()

(110, 2)


Unnamed: 0,p_value,word
0,0.001,this
1,0.006,after
2,0.003,known
3,0.0,side
4,0.001,killer


In [12]:
# Persist the significant words and their p-values, without the index column.
# NOTE(review): the file name is spelled "signficant" (sic); anything that
# reads this file has to use the same spelling.
signif_words.to_csv("../data/signficant_words.csv", index=False)

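Add each significant word to the movies table as a binary (0/1) feature indicating whether the movie's overview contains that word.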

In [14]:
# One 0/1 indicator column per significant word: 1 when the overview contains
# the word, 0 otherwise.
# - regex=False matches the word literally rather than as a regex pattern.
# - na=False maps a missing overview to 0; without it the NaN result makes
#   .astype(int) raise.
# NOTE(review): the column is named after the word itself, so a word equal to
# an existing column name would overwrite that column — confirm none collide.
for word in signif_words["word"]:
    has_word = movies["overview"].str.contains(word, regex=False, na=False).astype(int)
    movies[word] = has_word

In [16]:
# Sanity check: print (rows, columns) and preview the first rows of the
# movies table.
print(movies.shape)
movies.head()

(2459, 149)


Unnamed: 0,adult,backdrop_path,id,original_language,original_title,overview,popularity,poster_path,release_date,title,...,music,which,soon,know,them,wing,comedy,form,test,mall
0,0,/bOGkgRGdhrBYJSLpXaxhXVstddV.jpg,299536,en,Avengers: Infinity War,as the avengers and their allies have continue...,153.811,/7WsyChQLEftFiDOVTGkv3hFpyyt.jpg,2018-04-25,Avengers: Infinity War,...,0,0,0,0,1,0,0,0,0,0
1,0,/5zfVNTrkhMu673zma6qhFzG01ig.jpg,300668,en,Annihilation,a biologist signs up for a dangerous secret e...,29.516,/d3qcpfNwbAMCNqWDHzPQsUYiUgS.jpg,2018-02-22,Annihilation,...,0,0,0,0,0,0,0,0,0,0
2,0,/zjG95oDnBcFKMPgBEmmuNVOMC90.jpg,299782,en,The Other Side of the Wind,surrounded by fans and skeptics grizzled dire...,6.82,/kFky1paYEfHxfCYByEc9g7gn6Zk.jpg,2018-11-02,The Other Side of the Wind,...,0,0,0,0,0,0,0,0,0,0
3,0,/q9hnJ9SzwcF30seRtXEzLd5l1gw.jpg,351044,en,Welcome to Marwen,when a devastating attack shatters mark hoganc...,61.973,/o45VIAUYDcVCGuzd43l8Sr5Dfti.jpg,2018-12-21,Welcome to Marwen,...,0,0,0,1,0,0,0,0,1,0
4,0,/AmO8I38bkHwKhgxPNrd6djBQyPU.jpg,361292,en,Suspiria,a darkness swirls at the center of a world ren...,41.461,/dzWTnkert9EoiPWldWJ15dnfAFl.jpg,2018-10-11,Suspiria,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Write the movies table, with all of its current columns, to CSV without the
# index column.
movies.to_csv("../data/movies_with_words.csv", index=False)