In [None]:
import pandas as pd
from src.utils.initial_plots_data import plot_movies_over_time_months, plot_movies_over_time_years, plot_pie_genres_proportion, plot_donut_top_countries, show_top_10_words_per_emotion
from src.data.load_final_data import load_final_movies_and_reviews
from src.data.data_wrangling import load_and_clean_movies_df
from src.data.load_data_reviews_kaggle import load_movie_reviews_kaggle
from src.data.load_data_non_commercial_imdb import load_imdb_id_wikipedia_id, load_imdb_average_reviews
from src.data.normalize_emotions import normalize_total_plot_emotions, normalize_review_emotions
from src.scripts.scrap_date import scrap_years_months_movies, get_final_dates, load_scrapped_dates, save_final_dates
from src.scripts.scrap_reviews import scrape_reviews
from src.models.predict_emotions import predict_emotions_to_tsv, merge_df_with_emotions_tsv, predict_emotions_to_tsv, merge_df_with_emotions_tsv
from src.data.merge_genres import *
from src.utils.plot_correlations import *
from src.utils.initial_plots_data import *
from src.data.load_data import load_character_metadata

# If you have any problem with NLTK restart your kernel and run this:
# import nltk
# nltk.download('stopwords')
# nltk.download('wordnet')

from scipy.stats import spearmanr, pearsonr

from src.utils.plot_3D_actor_emotions import plot_3D_actor_plot_emotion, plot_3D_actor_review_emotion
from src.utils.plot_bars_actors_emotions import plot_actors_emotion_selector
from src.models.kmeans_emotional_type import get_best_k_clustered_movie_emotional_type
from src.utils.plot_movie_emotional_type import plot_clustered_movie_emotional_type, plot_emotions_centroids
from src.data.actor_name_statistics import get_actors_name_and_statistics
from src.utils.plot_genres import plot_emotion_distribution
from src.utils.plot_ratings import emotion_distribution_by_movie_rating
from src.scripts.emotion_evolution import *
from src.models.predict_emotions import read_tsv_predicted_emotions
from src.scripts.emotion_transitions import *
from src.utils.plot_countries_plots import plot_world_map_emotion_by_genre, plot_world_map_average_rating
from src.utils.plot_ratings_by_dominant_emotion import plot_ratings_by_most_dominant_emotion
from src.utils.plot_top_words import generate_word_clouds_by_emotion
from src.scripts.emotion_transitions import *
from src.scripts.emotion_evolution import *
from src.utils.plots_month_trends import*
from src.utils.plots_genres import*

# Make sure that any change to a dependency is reflected in the notebook without
# restarting the kernel (setup taken from the ML course).
%load_ext autoreload
%autoreload 2

### Scripts to create and process our datasets 
(We do not recommend running these functions; some of them took multiple days to run):

Load movie metadata:

`df_movies_metadata, df_movies_language, df_movies_countries, df_movies_genres = load_and_clean_movies_df()`

Load movie reviews from kaggle:

`df_reviews_kaggle = load_movie_reviews_kaggle(df_movies_metadata)`

Load mapping IMDB id to wikipedia ID:

`df_mapping_imdb_id_wikipedia_id = load_imdb_id_wikipedia_id(df_movies_metadata)`

Load movie average reviews from the non-commercial IMDB:

`df_imdb_average_reviews = load_imdb_average_reviews(df_mapping_imdb_id_wikipedia_id)`

### Predict emotions

Compute plot emotions:

`predict_emotions_to_tsv(df_movies, column='plot', file_name='plot_emotions.tsv')`

Merging df_movies with plot_emotions.tsv:

`df_movies_with_emotions = merge_df_with_emotions_tsv(df_movies, file_name='plot_emotions.tsv', prefix='plot')`

Compute review emotions:

`predict_emotions_to_tsv(df_reviews, column='review_detail', file_name='review_emotions.tsv', is_review=True)`

Merging df_reviews with review_emotions.tsv:

`df_reviews_with_emotions = merge_df_with_emotions_tsv(df_reviews, file_name='review_emotions.tsv', prefix='review', is_review=True)`

### Scrape more years and months:
We scrape our years and months data using:

`scrap_years_months_movies(df_movies_metadata)`

Then we load the resulting csv and combine both sources (the original dataset and the scraped data):

`save_final_dates(get_final_dates(load_scrapped_dates()))`

### Scrape more reviews:

`scrape_reviews(df_mapping_imdb_id_wikipedia_id)`

### Scripts to normalize emotion scores
We normalize the plot emotion scores using: 

`df_movies_with_emotions_normalized = normalize_total_plot_emotions(df_movies_with_emotions, with_neutral=False)`

We normalize the review emotion scores using: 

`df_reviews_with_emotions_normalized = normalize_review_emotions(df_reviews_with_emotions, with_neutral=False)`

# Scripts to create and load our final data
Note: You need at least the files 'final_movies.csv', 'final_reviews.csv', and 'movie.metadata.tsv' (more information in the README).

In [None]:
# File names of the pre-computed emotion predictions, shared by the steps below.
PLOT_EMOTIONS_TSV = 'plot_emotions.tsv'
REVIEW_EMOTIONS_TSV = 'review_emotions.tsv'

# Final movie and review tables, together with the language, country and genre frames.
(
    df_movies,
    df_language,
    df_countries,
    df_genres,
    df_reviews,
) = load_final_movies_and_reviews()

# Plot emotions: merge the predictions into the movies, then normalize (with_neutral=False).
df_movies_with_emotions = merge_df_with_emotions_tsv(df_movies, file_name=PLOT_EMOTIONS_TSV, prefix='plot')
df_movies_with_emotions_normalized = normalize_total_plot_emotions(df_movies_with_emotions, with_neutral=False)

# Review emotions: same two steps, using the review-specific merge and normalization.
df_reviews_with_emotions = merge_df_with_emotions_tsv(
    df_reviews,
    file_name=REVIEW_EMOTIONS_TSV,
    prefix='review',
    is_review=True,
)
df_reviews_with_emotions_normalized = normalize_review_emotions(df_reviews_with_emotions, with_neutral=False)

# Keep only the actor identification columns and discard rows missing any of them.
character_columns = ["wikipedia_ID", "actor_name", "freebase_ID_actor"]
df_characters = load_character_metadata()[character_columns].dropna(axis=0)

# Genres merged into main genres.
df_main_genres = get_genres_merged(df_genres)

# Raw predicted plot emotions, split together with the genres for the transition plots.
plot_emotions_df = read_tsv_predicted_emotions(PLOT_EMOTIONS_TSV)
emotions_split_df = split_movies_emotions_and_genres(plot_emotions_df, df_genres)

In [None]:
# Headline sizes of the dataset.
# NOTE(review): shape[1] counts every column of the frame; if these frames also
# carry a key column such as wikipedia_ID, the reported counts are one too high — confirm.
print(f"Our dataset has {len(df_movies_with_emotions_normalized)} different movies")
print(f"Without combining any genres, we have {df_genres.shape[1]} different genres")
print(f"We also have {df_countries.shape[1]} different countries")
print(f"Finally, we have {df_language.shape[1]} different languages")

In [None]:
# Emotion distribution across the main genres, for plot emotions ...
plot_emotion_distribution(
    df_movies_with_emotions_normalized,
    df_main_genres,
    is_review=False,
    filename="emotion_plot_genres",
)
# ... and for review emotions.
plot_emotion_distribution(
    df_reviews_with_emotions_normalized,
    df_main_genres,
    is_review=True,
    filename="emotion_reviews_genres",
)

In [None]:
# Emotion distribution by movie rating, based on the plot emotions only.
emotion_distribution_by_movie_rating(
    df_movies_with_emotions_normalized,
    is_review=False,
    filename="emotion_plot_rating_bins",
)
# Same view for review emotions; this call additionally receives the reviews frame.
emotion_distribution_by_movie_rating(
    df_movies_with_emotions_normalized,
    df_reviews_with_emotions_normalized,
    is_review=True,
    filename="emotion_reviews_rating_bins",
)

In [None]:
# Search for the best number of clusters, first on plot emotions, then on review
# emotions. The frames are re-normalized here with the normalizers' default arguments.
# NOTE(review): the trailing 2 and 10 are presumably the k range to explore — confirm.
# The trailing semicolons suppress the cell output.
plot_emotions_for_k_search = normalize_total_plot_emotions(df_movies_with_emotions)
get_best_k_clustered_movie_emotional_type(plot_emotions_for_k_search, False, 2, 10);

review_emotions_for_k_search = normalize_review_emotions(df_reviews_with_emotions)
get_best_k_clustered_movie_emotional_type(review_emotions_for_k_search, True, 2, 10);

In [None]:
# Cluster the plot emotions (k=2) and plot them per genre. The colour assignment is
# swapped between the two calls.
plot_cluster_colors = {0: "steelblue", 1: "palevioletred"}
plot_emotions_mean_genres, clusters_plots = plot_clustered_movie_emotional_type(
    normalize_total_plot_emotions(df_movies_with_emotions),
    df_genres,
    False,
    k=2,
    clusters_color=plot_cluster_colors,
)

# Same for the review emotions; k is left at the function's default here and the
# subplot column of each cluster is set explicitly.
review_cluster_colors = {0: "palevioletred", 1: "steelblue"}
reviews_emotions_mean_genres, clusters_reviews = plot_clustered_movie_emotional_type(
    normalize_review_emotions(df_reviews_with_emotions),
    df_genres,
    True,
    clusters_color=review_cluster_colors,
    clusters_col_subplot={1: 1, 0: 2},
)

In [None]:
# Each pair is (plot-emotion cluster id, review-emotion cluster id) to compare.
cluster_pairs = [(0, 1), (1, 0)]
for plot_cluster_id, review_cluster_id in cluster_pairs:
    plot_rows = plot_emotions_mean_genres[plot_emotions_mean_genres["cluster"] == plot_cluster_id]
    review_rows = reviews_emotions_mean_genres[reviews_emotions_mean_genres["cluster"] == review_cluster_id]

    # First matching row of each cluster, without the cluster label column.
    plot_vector = plot_rows.drop(columns="cluster").values[0]
    review_vector = review_rows.drop(columns="cluster").values[0]

    spearman_result = spearmanr(plot_vector, review_vector)
    pearson_result = pearsonr(plot_vector, review_vector)

    pair_label = (
        f"the cluster {plot_cluster_id} from the plot emotions clustering "
        f"and the cluster {review_cluster_id} from the reviews emotions clustering"
    )
    print(f"The spearmanr correlation for {pair_label} is {spearman_result.statistic} with pvalue {spearman_result.pvalue}")
    print(f"The pearsonr correlation for {pair_label} is {pearson_result.statistic} with pvalue {pearson_result.pvalue}")

In [None]:
# 3D actor views for the plot emotions and for the review emotions. Both calls
# receive the merged emotion frames, not the normalized ones.
plot_3D_actor_plot_emotion(df_characters, df_movies_with_emotions)
plot_3D_actor_review_emotion(df_characters, df_reviews_with_emotions)

In [None]:
# The two returned frames (plots, reviews) feed the "top actors" cells that follow.
(
    df_emotion_max_actor_plots,
    df_emotion_max_actor_reviews,
) = plot_actors_emotion_selector(
    df_characters,
    df_movies_with_emotions,
    df_reviews_with_emotions,
)

In [None]:
print("Top actors plot emotions:")
# Statistics are computed against the plot emotions normalized with with_neutral=False;
# the call is the last expression so its result is displayed by the cell.
movies_emotions_for_plot_stats = normalize_total_plot_emotions(df_movies_with_emotions, with_neutral=False)
get_actors_name_and_statistics(df_emotion_max_actor_plots, df_characters, movies_emotions_for_plot_stats)

In [None]:
print("Top actors plot reviews:")
# NOTE(review): this cell reports the review-based actors but still passes the
# normalized *plot* emotions as the third argument, exactly like the plot cell —
# confirm this is intended and not a copy-paste of the previous cell.
movies_emotions_for_review_stats = normalize_total_plot_emotions(df_movies_with_emotions, with_neutral=False)
get_actors_name_and_statistics(df_emotion_max_actor_reviews, df_characters, movies_emotions_for_review_stats)

In [None]:
# World maps of emotions for plots and for reviews, both joined with the countries frame.
plot_world_map_emotion_by_genre(df_movies_with_emotions_normalized, df_countries, is_reviews=False)
plot_world_map_emotion_by_genre(df_reviews_with_emotions_normalized, df_countries, is_reviews=True)
# World map of the average rating.
plot_world_map_average_rating(df_movies_with_emotions_normalized, df_countries)
# Ratings by most dominant emotion; note this call gets the merged, not the normalized, review frame.
plot_ratings_by_most_dominant_emotion(df_reviews_with_emotions)

In [None]:
# Word clouds per emotion, generated from the normalized plot-emotion frame.
generate_word_clouds_by_emotion(df_movies_with_emotions_normalized)

In [15]:
# Names of the merged main genres: every column except the wikipedia_ID key.
genres_list = df_main_genres.drop(columns="wikipedia_ID").columns

In [None]:
# Emotion-transition heat map without a genre argument first, then one per main genre.
plot_heat_map_transitions_plotly(emotions_split_df)
for genre_name in genres_list:
    plot_heat_map_transitions_plotly(emotions_split_df, genre=genre_name)

In [None]:
# Sankey diagrams without a genre argument first, then one per main genre.
plot_separated_sankey_plotly(emotions_split_df)
for genre_name in genres_list:
    plot_separated_sankey_plotly(emotions_split_df, genre=genre_name)

In [None]:
# Build the emotions-by-genre-and-time frame once and reuse it for every figure below.
df_emotions_by_genre_time = construct_emotions_by_genre_and_time_df(plot_emotions_df, df_genres)

# Overall figure first, then one figure per main genre.
plot_bar_and_scatter_emotion_evolution(df_emotions_by_genre_time, "All Genres", all_genres=True)
for genre_name in genres_list:
    plot_bar_and_scatter_emotion_evolution(df_emotions_by_genre_time, genre_name, all_genres=False)

In [None]:
# Variation plot for the plot emotions; the returned value is reused by
# corr_p_value_plot_periods in a later cell.
periodic_emotions_diff_plot = plot_variation("plot", df_movies_with_emotions_normalized)

In [None]:
# Variation plot for the review emotions; this call also receives df_movies.
# The returned value is reused by corr_p_value_plot_periods in a later cell.
periodic_emotions_diff_review = plot_variation("review", df_reviews_with_emotions_normalized, df_movies)

In [None]:
# Compare the two results returned by plot_variation (plot emotions vs. review emotions).
corr_p_value_plot_periods(periodic_emotions_diff_plot, periodic_emotions_diff_review)

In [None]:
# Emotion-by-genre heat map built from the merged main genres and the normalized plot emotions.
generate_emotion_genre_heatmap(df_main_genres, df_movies_with_emotions_normalized)

In [None]:
# Genre proportions for the original (unmerged) genres.
# NOTE(review): the positional False presumably marks the genres as not merged — confirm against the helper.
plot_genres_proportions_sorted(df_genres, False)

In [None]:
# Genre proportions for the merged main genres.
# NOTE(review): the positional True presumably marks the genres as merged — confirm against the helper.
plot_genres_proportions_sorted(df_main_genres, True)