In [1]:
import pandas as pd
from src.utils.initial_plots_data import plot_movies_over_time_months, plot_movies_over_time_years, plot_pie_genres_proportion, plot_donut_top_countries, show_top_10_words_per_emotion
from src.data.load_final_data import load_final_movies_and_reviews
from src.data.data_wrangling import load_and_clean_movies_df
from src.data.load_data_reviews_kaggle import load_movie_reviews_kaggle
from src.data.load_data_non_commercial_imdb import load_imdb_id_wikipedia_id, load_imdb_average_reviews
from src.data.normalize_emotions import normalize_total_plot_emotions, normalize_review_emotions
from src.scripts.scrap_date import scrap_years_months_movies, get_final_dates, load_scrapped_dates, save_final_dates
from src.scripts.scrap_reviews import scrape_reviews
from src.models.predict_emotions import predict_emotions_to_tsv, merge_df_with_emotions_tsv, predict_emotions_to_tsv, merge_df_with_emotions_tsv
from src.data.merge_genres import *
from src.utils.plot_correlations import *
from src.utils.initial_plots_data import *
from src.utils.plots_month_trends import*
from src.utils.plots_genres import*

# If you have any problem with NLTK restart your kernel and run this:
# import nltk
# nltk.download('stopwords')
# nltk.download('wordnet')


# Make sure that any change in a dependency is reflected in the notebook (from the ML course)
%load_ext autoreload
%autoreload 2

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/guillaumevitalis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/guillaumevitalis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at j-hartmann/emotion-english-distilroberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


### Scripts to create and process our datasets 
(We do not recommend running these functions; some of them took several days to run):

Load movie metadata:

`df_movies_metadata, df_movies_language, df_movies_countries, df_movies_genres = load_and_clean_movies_df()`

Load movie reviews from kaggle:

`df_reviews_kaggle = load_movie_reviews_kaggle(df_movies_metadata)`

Load the mapping from IMDb ID to Wikipedia ID:

`df_mapping_imdb_id_wikipedia_id = load_imdb_id_wikipedia_id(df_movies_metadata)`

Load movie average reviews from the non-commercial IMDB:

`df_imdb_average_reviews = load_imdb_average_reviews(df_mapping_imdb_id_wikipedia_id)`

### Predict emotions

Compute plot emotions:

`predict_emotions_to_tsv(df_movies, column='plot', file_name='plot_emotions.tsv')`

Merging df_movies with plot_emotions.tsv:

`df_movies_with_emotions = merge_df_with_emotions_tsv(df_movies, file_name='plot_emotions.tsv', prefix='plot')`

Compute review emotions:

`predict_emotions_to_tsv(df_reviews, column='review_detail', file_name='review_emotions.tsv', is_review=True)`

Merging df_reviews with review_emotions.tsv:

`df_reviews_with_emotions = merge_df_with_emotions_tsv(df_reviews, file_name='review_emotions.tsv', prefix='review', is_review=True)`

### Scrape more years and months:
We scrape the missing release years and months using:

`scrap_years_months_movies(df_movies_metadata)`

Then we load the resulting CSV and combine both sources (the original dataset and the scraped data):

`save_final_dates(get_final_dates(load_scrapped_dates()))`

### Scrape more reviews:

`scrape_reviews(df_mapping_imdb_id_wikipedia_id)`

### Scripts to normalize emotion scores
We normalize the plot emotion scores using: 

`df_movies_with_emotions_normalized = normalize_total_plot_emotions(df_movies_with_emotions, with_neutral=False)`

We normalize the review emotion scores using: 

`df_reviews_with_emotions_normalized = normalize_review_emotions(df_reviews_with_emotions, with_neutral=False)`

# Scripts to create and load our final data
Note: You need at least the files 'final_movies.csv', 'final_reviews.csv', and 'movie.metadata.tsv' (more information in the README).

In [2]:
# Load everything the analysis needs in one call and unpack the five
# returned objects into their own names.
(
    df_movies,
    df_language,
    df_countries,
    df_genres,
    df_reviews,
) = load_final_movies_and_reviews()

In [3]:
# Merge df_movies with the plot emotion scores stored in plot_emotions.tsv.
df_movies_with_emotions = merge_df_with_emotions_tsv(df_movies, file_name='plot_emotions.tsv', prefix='plot')

In [4]:
# Normalize the plot emotion scores (called with with_neutral=False).
df_movies_with_emotions_normalized = normalize_total_plot_emotions(df_movies_with_emotions, with_neutral=False)

In [5]:
# Merge df_reviews with the review emotion scores stored in review_emotions.tsv.
df_reviews_with_emotions = merge_df_with_emotions_tsv(
    df_reviews,
    file_name='review_emotions.tsv',
    prefix='review',
    is_review=True,
)

In [6]:
# Normalize the review emotion scores (called with with_neutral=False).
df_reviews_with_emotions_normalized = normalize_review_emotions(
    df_reviews_with_emotions,
    with_neutral=False,
)

In [7]:
# Collapse the normalized review emotions to one row per movie and attach the
# movie columns:
#   1. drop object-dtype columns so that only columns mean() can average remain,
#   2. average the remaining columns per wikipedia_ID,
#   3. inner-join the per-movie means with df_movies on wikipedia_ID.
# NOTE(review): this cell rebinds the name it reads from, so it is not
# idempotent -- re-run the normalization cell above before re-running it.
df_reviews_with_emotions_normalized = (
    df_reviews_with_emotions_normalized
    .select_dtypes(exclude=['object'])
    .groupby(by='wikipedia_ID')
    .mean()
    .merge(df_movies, on='wikipedia_ID', how='inner')
)

In [8]:
# Run plot_variation on the normalized plot emotions; the return value is kept
# because the correlation cell further down takes it as its first argument.
# NOTE(review): the "plot" argument presumably selects the emotion-column prefix -- confirm in plot_variation.
periodic_emotions_diff_plot = plot_variation("plot", df_movies_with_emotions_normalized)

In [9]:
# Same plot_variation call as for the plots, this time on the per-movie review
# emotions; the return value is the second input of the correlation cell below.
# NOTE(review): the "review" argument presumably selects the emotion-column prefix -- confirm in plot_variation.
periodic_emotions_diff_review = plot_variation("review", df_reviews_with_emotions_normalized)

In [10]:
# Compare the two plot_variation results (plot emotions vs. review emotions).
# NOTE(review): judging by its name this reports correlations and p-values -- confirm in src.utils.
corr_p_value_plot_periods(periodic_emotions_diff_plot, periodic_emotions_diff_review)

In [11]:
# Build the merged-genre table from df_genres; it is reused by the last
# genre-proportion cell of the notebook.
df_main_genres = get_genres_merged(df_genres)
# Emotion-by-genre heatmap computed from the merged genres and the normalized plot emotions.
generate_emotion_genre_heatmap(df_main_genres, df_movies_with_emotions_normalized)

In [12]:
# Genre proportions for the original (unmerged) genre table.
# NOTE(review): the positional False presumably means "genres are not merged" -- confirm against the helper's signature.
plot_genres_proportions_sorted(df_genres, False)

In [13]:
# Genre proportions for the merged genre table produced by get_genres_merged.
# NOTE(review): the positional True presumably means "genres are merged" -- confirm against the helper's signature.
plot_genres_proportions_sorted(df_main_genres, True)