In [5]:
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math
from collections import Counter
import pandas as pd
import warnings
import json
import os
import plotly.figure_factory as ff
from dash import Dash, dcc, html, callback, Input, Output

import plotly.express as px
import chart_studio.plotly as py

# Make the modules under src/preprocessing/ importable. The path is relative,
# so it is resolved against the kernel's current working directory.
sys.path.append("src/preprocessing/")
# No category or module is given, so this silences every warning for the rest
# of the session (the original intent was to quieten seaborn).
warnings.filterwarnings("ignore") # For seaborn

# Star import: load_movies_df and load_imdb_ratings used below are not defined
# in this notebook and presumably come from here — TODO confirm and import explicitly.
from load_dataset import *

# Directory that is created up front so later steps can write into it.
SAVE_PATH = "./data/refined"
# makedirs (unlike mkdir) also creates the missing parent "./data", and
# exist_ok=True makes the cell safe to re-run without a separate exists check.
os.makedirs(SAVE_PATH, exist_ok=True)

In [6]:
# Years used to build the GDP series below: 1960 through 2011 (the range end is exclusive).
years = range(1960, 2012)
# load_movies_df is not defined in this notebook; presumably provided by the
# load_dataset star import — TODO confirm. The cells below index the result by
# 'movie_release_date' and group it by a derived 'movie_release_year' column.
movies_df = load_movies_df()

In [7]:
# Take the first four characters of every non-missing release date as the year.
# Rows without a date are absent from this Series, so index alignment leaves
# their 'movie_release_year' as NaN.
known_dates = movies_df['movie_release_date'].dropna()
movies_df['movie_release_year'] = known_dates.astype(str).str[:4].astype(int)

# Index labels of rows whose extracted year is earlier than 1800.
wrong_dates = movies_df.index[movies_df['movie_release_year'] < 1800]

# A movie that has the release date 1010. We fix the typo to 2010.
# Note that every row selected above receives the same corrected values.
movies_df.loc[wrong_dates, 'movie_release_year'] = 2010
movies_df.loc[wrong_dates, 'movie_release_date'] = '2010-12-02'

In [8]:
# Sum the numeric columns per release year.
# numeric_only=True is passed explicitly: since pandas 2.0 the default no longer
# drops non-numeric columns, so text columns would be aggregated too (or raise),
# whereas the cells below expect only the numeric columns (box_office,
# wiki_movie_id, movie_runtime) to be present.
groupped = (
    movies_df
    .groupby("movie_release_year")
    .sum(numeric_only=True)
    .reset_index()
)
# Keep release years strictly between 1960 and 2012.
groupped = groupped.loc[(groupped["movie_release_year"] > 1960) & (groupped["movie_release_year"] < 2012)]

In [9]:
# Relative path of the GDP table, resolved against the current working directory.
GDP_PATH = "data/gdp/gdp_data.csv"
# The cell below reads a "Country Name" column and one column per year
# (named by the year as a string) from this frame.
gdp_df = pd.read_csv(GDP_PATH)

In [10]:
# Rows of the GDP table whose country is Iceland.
iceland_rows = gdp_df.loc[gdp_df["Country Name"] == "Iceland"]

# For each year, read the value of the first matching row from the column
# named after that year (column labels are strings).
gdp_list = [iceland_rows[str(yr)].iloc[0] for yr in years]

# Two-column frame: the year and Iceland's GDP for that year.
iceland_gdp = pd.DataFrame.from_dict({"year" : years, "iceland_gdp" : gdp_list})

In [11]:
# Attach Iceland's GDP to the yearly totals, give the two plotted series their
# display names, and drop the columns that are not plotted.
# NOTE: this overwrites `groupped` and removes 'movie_release_year', so the cell
# cannot be re-run without first re-running the cell that builds `groupped`.
groupped = (
    groupped
    .merge(iceland_gdp, left_on="movie_release_year", right_on="year")
    .rename(columns={"box_office" : "Box-office", "iceland_gdp" : "Iceland GDP"})
    .drop(columns=["wiki_movie_id", "movie_runtime", "movie_release_year"])
)

In [16]:
# Long format: one row per (year, series), with the series name in 'variable'
# and its amount in "Revenue (in $)".
melted = groupped.melt(id_vars=["year"]).rename(columns={"value" : "Revenue (in $)"})

# One line per series, coloured by series name.
fig = px.line(melted, x="year", y="Revenue (in $)", color='variable')
fig.update_layout(
    title_text="Comparison between Iceland GDP and movies' box office each year",
    title_x=0.5,
)
# NOTE: assumes the plot/ directory already exists; auto_open=True also opens
# the written file in a browser.
fig.write_html('plot/box_office_vs_gdp.html', auto_open=True)

In [17]:
# load_imdb_ratings is not defined in this notebook; presumably provided by the
# load_dataset star import — TODO confirm. The displayed result has the columns
# tconst, averageRating and numVotes.
imdb_ratings_df = load_imdb_ratings()

In [18]:
# Show the ratings table as the cell output (pandas truncates the middle rows).
imdb_ratings_df

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2004
1,tt0000002,5.8,269
2,tt0000003,6.5,1902
3,tt0000004,5.5,178
4,tt0000005,6.2,2685
...,...,...,...
1365863,tt9916730,7.6,11
1365864,tt9916766,7.0,22
1365865,tt9916778,7.2,36
1365866,tt9916840,8.8,6


In [19]:
# Histogram of the average rating per title, using up to 100 bins.
fig = px.histogram(
    imdb_ratings_df,
    x="averageRating",
    nbins=100,
    labels={"averageRating" : "Rating"},
)
fig.update_layout(title_x=0.5, title_text="Distribution of ratings on IMDb")
# NOTE: assumes the plot/ directory already exists; auto_open=True also opens
# the written file in a browser.
fig.write_html('plot/ratings_histogram.html', auto_open=True)

In [20]:
# Histogram of the number of votes per title, using up to 100 bins. The y axis
# is logarithmic (log_y=True); the axis label is remapped through `labels`
# rather than by renaming the column.
fig = px.histogram(imdb_ratings_df, x="numVotes",
                   labels={"numVotes" : "Number of votes"},
                   log_y=True, nbins=100)
fig.update_layout(title_text="Number of movies over number of votes", title_x=0.5)
# NOTE: assumes the plot/ directory already exists; auto_open=True also opens
# the written file in a browser.
fig.write_html('plot/movie_votes.html', auto_open=True)

## Persona terms

In [28]:
import json
import gensim.downloader
from sklearn.manifold import TSNE

In [22]:
# Load the per-persona term lists. The cells below index the result by persona
# id as a string ("1" .. "50") and iterate over the words of each entry.
# The encoding is given explicitly so the file is read as UTF-8 regardless of
# the platform's default text encoding.
with open("data/cmu/personas/persona_verbs.json", encoding="utf-8") as f:
    frequent_terms = json.load(f)

In [25]:
# Print the names of the models listed by gensim's downloader.
available_models = gensim.downloader.info()['models']
print(list(available_models))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [26]:
# Load the 'glove-wiki-gigaword-50' model through gensim's downloader
# (presumably fetched over the network on first use — confirm). The result is
# indexed by word below to obtain that word's vector.
glove_vectors = gensim.downloader.load('glove-wiki-gigaword-50')

In [27]:
cluster_points = dict()

# Flatten the term lists of personas "1" through "50" into (persona id, word)
# pairs, keeping persona order and the word order within each persona.
persona_word_pairs = [
    (str(persona), word)
    for persona in range(1, 51)
    for word in frequent_terms[str(persona)]
]

# Three parallel lists, one entry per pair. Looking up a word that the
# embedding model does not contain raises KeyError.
word_personas = [persona_id for persona_id, _ in persona_word_pairs]
word_labels = [word for _, word in persona_word_pairs]
word_vectors = [glove_vectors[word] for word in word_labels]

In [29]:
# Project the word vectors to two dimensions. random_state is fixed so the
# layout is reproducible between runs.
tsne_model = TSNE(
    n_components=2,
    learning_rate='auto',
    random_state=10,
    init='random',
    perplexity=3,
    metric='cosine',
)
# `tsne` holds the transformed coordinates (one row per word), not the estimator.
tsne = tsne_model.fit_transform(np.array(word_vectors))

In [30]:
# One row per word: its two projected coordinates, its persona id (string)
# and the word itself.
df = pd.DataFrame({
    'tsne_1': tsne[:, 0],
    'tsne_2': tsne[:, 1],
    'persona': word_personas,
    'label': word_labels,
})

In [33]:
# Persona ids (as strings, matching the 'persona' column) to show in the plot.
nice_personas = {"14", "5", "27", "40", "43"}

# Scatter of the 2-D projected word vectors for the selected personas,
# coloured by persona, with the word shown on hover.
fig = px.scatter(df[df["persona"].isin(nice_personas)], x="tsne_1", y="tsne_2", color="persona",
                 hover_data=['label'],
                 labels={"tsne_1" : "Dimension 1 of TSNE", "tsne_2" : "Dimension 2 of TSNE", "persona" : "Persona ID"})

# The vectors come from the 'glove-wiki-gigaword-50' model, so the title names
# GloVe rather than Word2Vec.
fig.update_layout(title_text="GloVe representations (reduced by TSNE) <br> of the most frequent words for some personas", title_x=0.5)
fig.update_traces(marker=dict(size=10))
# NOTE: assumes the plot/ directory already exists; auto_open=True also opens
# the written file in a browser.
fig.write_html('plot/persona_scatter.html', auto_open=True)