# Importing Libraries

In [69]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import defaultdict
import ast
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
import re
from fuzzywuzzy import process
import dask.dataframe as dd
import os

# CSVs Loading

In [2]:
# Load the DataFrame from a CSV file
reviews = pd.read_csv('/Users/danielebelmiro/Data Analytics Bootcamp/Rotten/reviews_emotions.csv')

In [3]:
reviews.head()

Unnamed: 0,id,title,reviewId,creationDate,criticName,isTopCritic,reviewState,publicatioName,reviewText,scoreSentiment,cleanedReviewText,predicted_moods
0,beavers,Beavers,1145982,2003-05-23,Ivan M. Lincoln,False,fresh,Deseret News (Salt Lake City),Timed to be just long enough for most youngste...,POSITIVE,Timed to be just long enough for most youngste...,"[('excitement', 35.543839830098335), ('approva..."
1,blood_mask,Blood Mask,1636744,2007-06-02,The Foywonder,False,rotten,Dread Central,It doesn't matter if a movie costs 300 million...,NEGATIVE,It doesnt matter if a movie costs 300 million ...,"[('disapproval', 56.05641547237329), ('annoyan..."
2,city_hunter_shinjuku_private_eyes,City Hunter: Shinjuku Private Eyes,2590987,2019-05-28,Reuben Baron,False,fresh,CBR,The choreography is so precise and lifelike at...,POSITIVE,The choreography is so precise and lifelike at...,"[('admiration', 80.08288365216127), ('approval..."
3,city_hunter_shinjuku_private_eyes,City Hunter: Shinjuku Private Eyes,2558908,2019-02-14,Matt Schley,False,rotten,Japan Times,The film's out-of-touch attempts at humor may ...,NEGATIVE,The films outoftouch attempts at humor may fin...,"[('amusement', 22.690977576957067), ('realizat..."
4,dangerous_men_2015,Dangerous Men,2504681,2018-08-29,Pat Padua,False,fresh,DCist,Its clumsy determination is endearing and some...,POSITIVE,Its clumsy determination is endearing and some...,"[('amusement', 49.197768434566136), ('admirati..."


In [4]:
movies = pd.read_csv('/Users/danielebelmiro/Data Analytics Bootcamp/Rotten/clean_movies.csv')

In [5]:
movies.head()

Unnamed: 0,id,title,audienceScore,tomatoMeter,runtimeMinutes,genre,originalLanguage,director,writer,release_year
0,love_lies,"Love, Lies",43.0,65.76,120.0,Drama,Korean,"Park Heung-Sik,Heung-Sik Park","Ha Young-Joon,Jeon Yun-su,Song Hye-jin",Unknown
1,adrift_2018,Adrift,65.0,69.0,120.0,Adventure,English,Baltasar Kormákur,"Aaron Kandell,Jordan Kandell,David Branson Smith",2018
2,adrift_2018,Adrift,65.0,69.0,120.0,Drama,English,Baltasar Kormákur,"Aaron Kandell,Jordan Kandell,David Branson Smith",2018
3,adrift_2018,Adrift,65.0,69.0,120.0,Romance,English,Baltasar Kormákur,"Aaron Kandell,Jordan Kandell,David Branson Smith",2018
4,1035316-born_to_kill,Born to Kill,74.0,83.0,92.0,Crime,English,Robert Wise,"Eve Greene,Richard Macaulay",1947


In [6]:
movies['id'].nunique()

67604

# Data Wrangling

### Movies

#### Group by id and transform the genres into a list:

In [7]:
movies_grouped = movies.groupby('id')['genre'].apply(list).reset_index()

# Merge the result back into the original DataFrame, keeping all columns
movies = movies.drop(columns=['genre']).drop_duplicates(subset=['id']).merge(movies_grouped, on='id', how='left')

In [8]:
movies.head()

Unnamed: 0,id,title,audienceScore,tomatoMeter,runtimeMinutes,originalLanguage,director,writer,release_year,genre
0,love_lies,"Love, Lies",43.0,65.76,120.0,Korean,"Park Heung-Sik,Heung-Sik Park","Ha Young-Joon,Jeon Yun-su,Song Hye-jin",Unknown,[Drama]
1,adrift_2018,Adrift,65.0,69.0,120.0,English,Baltasar Kormákur,"Aaron Kandell,Jordan Kandell,David Branson Smith",2018,"[Adventure, Drama, Romance]"
2,1035316-born_to_kill,Born to Kill,74.0,83.0,92.0,English,Robert Wise,"Eve Greene,Richard Macaulay",1947,"[Crime, Drama]"
3,garden_murder_case,The Garden Murder Case,55.67,65.76,61.0,English,Edwin L. Marin,Unknown,2016,[Thriller]
4,margarita_happy_hour,Margarita Happy Hour,55.67,76.0,98.0,English,Ilya Chaiken,Ilya Chaiken,2002,[Drama]


In [9]:
movies.loc[movies['title'] == 'How It Ends']

Unnamed: 0,id,title,audienceScore,tomatoMeter,runtimeMinutes,originalLanguage,director,writer,release_year,genre
13046,how_it_ends_2021,How It Ends,42.0,68.0,82.0,English,"Zoe Lister Jones,Daryl Wein","Daryl Wein,Zoe Lister Jones",2021,"[Comedy, Drama]"
47022,how_it_ends,How It Ends,17.0,17.0,113.0,English,David M. Rosenthal,Brooks McLaren,2018,"[Sci-fi, Thriller, Action, Adventure]"


### Reviews

#### Calculating the average emotions per movie:

### Explode and Split the Data
First, explode and split the predicted_moods column into emotion and probability:

In [10]:
# Convert tuple strings into actual tuples
reviews['predicted_moods'] = reviews['predicted_moods'].apply(ast.literal_eval)

# Explode the column
reviews_exploded = reviews.explode('predicted_moods')

# Split the tuples into separate columns
reviews_exploded[['emotion', 'probability']] = pd.DataFrame(reviews_exploded['predicted_moods'].tolist(), index=reviews_exploded.index)
reviews_exploded = reviews_exploded.drop(columns=['predicted_moods'])

# Convert 'probability' to numeric
reviews_exploded['probability'] = pd.to_numeric(reviews_exploded['probability'])

# Verify the result
reviews_exploded

Unnamed: 0,id,title,reviewId,creationDate,criticName,isTopCritic,reviewState,publicatioName,reviewText,scoreSentiment,cleanedReviewText,emotion,probability
0,beavers,Beavers,1145982,2003-05-23,Ivan M. Lincoln,False,fresh,Deseret News (Salt Lake City),Timed to be just long enough for most youngste...,POSITIVE,Timed to be just long enough for most youngste...,excitement,35.543840
0,beavers,Beavers,1145982,2003-05-23,Ivan M. Lincoln,False,fresh,Deseret News (Salt Lake City),Timed to be just long enough for most youngste...,POSITIVE,Timed to be just long enough for most youngste...,approval,33.460199
0,beavers,Beavers,1145982,2003-05-23,Ivan M. Lincoln,False,fresh,Deseret News (Salt Lake City),Timed to be just long enough for most youngste...,POSITIVE,Timed to be just long enough for most youngste...,admiration,14.496543
1,blood_mask,Blood Mask,1636744,2007-06-02,The Foywonder,False,rotten,Dread Central,It doesn't matter if a movie costs 300 million...,NEGATIVE,It doesnt matter if a movie costs 300 million ...,disapproval,56.056415
1,blood_mask,Blood Mask,1636744,2007-06-02,The Foywonder,False,rotten,Dread Central,It doesn't matter if a movie costs 300 million...,NEGATIVE,It doesnt matter if a movie costs 300 million ...,annoyance,16.554233
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1363577,thor_love_and_thunder,Thor: Love and Thunder,102706148,2022-07-05,Jake Cole,True,rotten,Slant Magazine,Across Taika Waititi&#8217;s film&#44; a war a...,NEGATIVE,Across Taika Waititis film a war against the g...,annoyance,29.755378
1363577,thor_love_and_thunder,Thor: Love and Thunder,102706148,2022-07-05,Jake Cole,True,rotten,Slant Magazine,Across Taika Waititi&#8217;s film&#44; a war a...,NEGATIVE,Across Taika Waititis film a war against the g...,disapproval,16.977576
1363578,thor_love_and_thunder,Thor: Love and Thunder,102706147,2022-07-05,Roger Moore,False,fresh,Movie Nation,It&#8217;s the jokes that make it&#44; with th...,POSITIVE,Its the jokes that make it with the selfmockin...,amusement,66.380384
1363578,thor_love_and_thunder,Thor: Love and Thunder,102706147,2022-07-05,Roger Moore,False,fresh,Movie Nation,It&#8217;s the jokes that make it&#44; with th...,POSITIVE,Its the jokes that make it with the selfmockin...,joy,22.703058


### Normalize Probabilities to Sum to 100%

In [11]:
# Normalize probabilities to sum to 100%
sum_probabilities = reviews_exploded.groupby('reviewId')['probability'].transform('sum')
reviews_exploded['probability_normalized'] = (reviews_exploded['probability'] / sum_probabilities) * 100

# Round the normalized probabilities to 4 decimal places
reviews_exploded['probability_normalized'] = reviews_exploded['probability_normalized'].round(4)

# Verify the result
reviews_exploded

Unnamed: 0,id,title,reviewId,creationDate,criticName,isTopCritic,reviewState,publicatioName,reviewText,scoreSentiment,cleanedReviewText,emotion,probability,probability_normalized
0,beavers,Beavers,1145982,2003-05-23,Ivan M. Lincoln,False,fresh,Deseret News (Salt Lake City),Timed to be just long enough for most youngste...,POSITIVE,Timed to be just long enough for most youngste...,excitement,35.543840,42.5672
0,beavers,Beavers,1145982,2003-05-23,Ivan M. Lincoln,False,fresh,Deseret News (Salt Lake City),Timed to be just long enough for most youngste...,POSITIVE,Timed to be just long enough for most youngste...,approval,33.460199,40.0718
0,beavers,Beavers,1145982,2003-05-23,Ivan M. Lincoln,False,fresh,Deseret News (Salt Lake City),Timed to be just long enough for most youngste...,POSITIVE,Timed to be just long enough for most youngste...,admiration,14.496543,17.3610
1,blood_mask,Blood Mask,1636744,2007-06-02,The Foywonder,False,rotten,Dread Central,It doesn't matter if a movie costs 300 million...,NEGATIVE,It doesnt matter if a movie costs 300 million ...,disapproval,56.056415,70.6446
1,blood_mask,Blood Mask,1636744,2007-06-02,The Foywonder,False,rotten,Dread Central,It doesn't matter if a movie costs 300 million...,NEGATIVE,It doesnt matter if a movie costs 300 million ...,annoyance,16.554233,20.8623
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1363577,thor_love_and_thunder,Thor: Love and Thunder,102706148,2022-07-05,Jake Cole,True,rotten,Slant Magazine,Across Taika Waititi&#8217;s film&#44; a war a...,NEGATIVE,Across Taika Waititis film a war against the g...,annoyance,29.755378,37.8302
1363577,thor_love_and_thunder,Thor: Love and Thunder,102706148,2022-07-05,Jake Cole,True,rotten,Slant Magazine,Across Taika Waititi&#8217;s film&#44; a war a...,NEGATIVE,Across Taika Waititis film a war against the g...,disapproval,16.977576,21.5848
1363578,thor_love_and_thunder,Thor: Love and Thunder,102706147,2022-07-05,Roger Moore,False,fresh,Movie Nation,It&#8217;s the jokes that make it&#44; with th...,POSITIVE,Its the jokes that make it with the selfmockin...,amusement,66.380384,71.3199
1363578,thor_love_and_thunder,Thor: Love and Thunder,102706147,2022-07-05,Roger Moore,False,fresh,Movie Nation,It&#8217;s the jokes that make it&#44; with th...,POSITIVE,Its the jokes that make it with the selfmockin...,joy,22.703058,24.3925


In [12]:
# Verify the sum of normalized probabilities
sum_normalized = reviews_exploded.groupby('reviewId')['probability_normalized'].sum()

# Round the sums to 2 decimal places for display
sum_normalized = sum_normalized.round(2)

# Check unique values
print(sum_normalized.unique())

[100.]


## group by id and emotion and calculate the mean probability

In [13]:
# Agrupar por 'id' e 'emotion', calculando a média das probabilidades
reviews_grouped = reviews_exploded.groupby(['id', 'emotion'])['probability_normalized'].mean().reset_index()

# Verify the result
reviews_grouped.head(10)

Unnamed: 0,id,emotion,probability_normalized
0,$5_a_day,admiration,43.152
1,$5_a_day,amusement,33.83
2,$5_a_day,annoyance,36.675
3,$5_a_day,approval,34.83245
4,$5_a_day,disappointment,32.6701
5,$5_a_day,disapproval,50.40925
6,$5_a_day,joy,21.3234
7,$5_a_day,optimism,11.2248
8,$5_a_day,realization,7.4892
9,009_re_cyborg,admiration,49.29476


In [14]:
pd.set_option('display.max_colwidth', None)

reviews_exploded.loc[reviews_exploded['id'] == '$5_a_day']

Unnamed: 0,id,title,reviewId,creationDate,criticName,isTopCritic,reviewState,publicatioName,reviewText,scoreSentiment,cleanedReviewText,emotion,probability,probability_normalized
1213414,$5_a_day,Five Dollars a Day,2097498,2012-08-01,Kevin Carr,False,rotten,7M Pictures,"$5 a Day isn't perfect, but it does examine some of the issues that we have when connecting with our parents as adults.",NEGATIVE,5 a Day isnt perfect but it does examine some of the issues that we have when connecting with our parents as adults,disapproval,60.371897,70.1637
1213414,$5_a_day,Five Dollars a Day,2097498,2012-08-01,Kevin Carr,False,rotten,7M Pictures,"$5 a Day isn't perfect, but it does examine some of the issues that we have when connecting with our parents as adults.",NEGATIVE,5 a Day isnt perfect but it does examine some of the issues that we have when connecting with our parents as adults,approval,19.228371,22.3471
1213414,$5_a_day,Five Dollars a Day,2097498,2012-08-01,Kevin Carr,False,rotten,7M Pictures,"$5 a Day isn't perfect, but it does examine some of the issues that we have when connecting with our parents as adults.",NEGATIVE,5 a Day isnt perfect but it does examine some of the issues that we have when connecting with our parents as adults,realization,6.44405,7.4892
1213415,$5_a_day,Five Dollars a Day,1929215,2010-09-09,Brian Orndorf,False,rotten,DVDTalk.com,"Dreadfully formulaic and absent a thoughtful emotional core, the picture is best valued as a forgettable trifle starring Hollywood's most enduring weirdo.",NEGATIVE,Dreadfully formulaic and absent a thoughtful emotional core the picture is best valued as a forgettable trifle starring Hollywoods most enduring weirdo,annoyance,24.614549,36.675
1213415,$5_a_day,Five Dollars a Day,1929215,2010-09-09,Brian Orndorf,False,rotten,DVDTalk.com,"Dreadfully formulaic and absent a thoughtful emotional core, the picture is best valued as a forgettable trifle starring Hollywood's most enduring weirdo.",NEGATIVE,Dreadfully formulaic and absent a thoughtful emotional core the picture is best valued as a forgettable trifle starring Hollywoods most enduring weirdo,disappointment,21.926653,32.6701
1213415,$5_a_day,Five Dollars a Day,1929215,2010-09-09,Brian Orndorf,False,rotten,DVDTalk.com,"Dreadfully formulaic and absent a thoughtful emotional core, the picture is best valued as a forgettable trifle starring Hollywood's most enduring weirdo.",NEGATIVE,Dreadfully formulaic and absent a thoughtful emotional core the picture is best valued as a forgettable trifle starring Hollywoods most enduring weirdo,disapproval,20.574082,30.6548
1213416,$5_a_day,Five Dollars a Day,1924503,2010-08-19,Jules Brenner,False,fresh,Cinema Signals,The success of the piece rests on Nivola's calm adaptability as fatherly hi-jinx tries his patience and makes its mark on his heart.,POSITIVE,The success of the piece rests on Nivolas calm adaptability as fatherly hijinx tries his patience and makes its mark on his heart,approval,37.7846,47.3178
1213416,$5_a_day,Five Dollars a Day,1924503,2010-08-19,Jules Brenner,False,fresh,Cinema Signals,The success of the piece rests on Nivola's calm adaptability as fatherly hi-jinx tries his patience and makes its mark on his heart.,POSITIVE,The success of the piece rests on Nivolas calm adaptability as fatherly hijinx tries his patience and makes its mark on his heart,admiration,33.104835,41.4574
1213416,$5_a_day,Five Dollars a Day,1924503,2010-08-19,Jules Brenner,False,fresh,Cinema Signals,The success of the piece rests on Nivola's calm adaptability as fatherly hi-jinx tries his patience and makes its mark on his heart.,POSITIVE,The success of the piece rests on Nivolas calm adaptability as fatherly hijinx tries his patience and makes its mark on his heart,optimism,8.963305,11.2248
1213417,$5_a_day,Five Dollars a Day,1780984,2008-11-16,David Nusair,False,fresh,Reel Film Reviews,...very amusing and agreeable...,POSITIVE,very amusing and agreeable,admiration,40.27613,44.8466


In [15]:
# Selecionar as 3 emoções com as maiores médias para cada filme
top_3_emotions = reviews_grouped.groupby('id').apply(
    lambda x: x.nlargest(3, 'probability_normalized')
).reset_index(drop=True)

In [16]:
top_3_emotions

Unnamed: 0,id,emotion,probability_normalized
0,$5_a_day,disapproval,50.40925
1,$5_a_day,admiration,43.15200
2,$5_a_day,annoyance,36.67500
3,009_re_cyborg,disapproval,56.85230
4,009_re_cyborg,annoyance,50.76375
...,...,...,...
202807,zycie_jako_smiertelna_choroba_przenoszona_droga_plciowa_2000,approval,58.79150
202808,zycie_jako_smiertelna_choroba_przenoszona_droga_plciowa_2000,fear,33.92220
202809,zz_top_that_little_ol_band_from_texas,amusement,57.87560
202810,zz_top_that_little_ol_band_from_texas,joy,56.15785


In [17]:
# Normalizar as probabilidades para somar 100% para cada filme
top_3_emotions['probability_normalized'] = top_3_emotions.groupby('id')['probability_normalized'].transform(
    lambda x: (x / x.sum()) * 100
)

# Exibir o resultado
print(top_3_emotions.head())

              id      emotion  probability_normalized
0       $5_a_day  disapproval               38.706005
1       $5_a_day   admiration               33.133632
2       $5_a_day    annoyance               28.160362
3  009_re_cyborg  disapproval               36.023280
4  009_re_cyborg    annoyance               32.165396


In [18]:
top_3_emotions

Unnamed: 0,id,emotion,probability_normalized
0,$5_a_day,disapproval,38.706005
1,$5_a_day,admiration,33.133632
2,$5_a_day,annoyance,28.160362
3,009_re_cyborg,disapproval,36.023280
4,009_re_cyborg,annoyance,32.165396
...,...,...,...
202807,zycie_jako_smiertelna_choroba_przenoszona_droga_plciowa_2000,approval,38.438228
202808,zycie_jako_smiertelna_choroba_przenoszona_droga_plciowa_2000,fear,22.178533
202809,zz_top_that_little_ol_band_from_texas,amusement,35.673589
202810,zz_top_that_little_ol_band_from_texas,joy,34.614795


In [19]:
# Group by 'id' and create a list of tuples (emotion, probability_normalized)
new_df = top_3_emotions.groupby('id').apply(
    lambda x: list(zip(x['emotion'], x['probability_normalized'])))
new_df = new_df.reset_index()
new_df.columns = ['id', 'emotions']

new_df

Unnamed: 0,id,emotions
0,$5_a_day,"[(disapproval, 38.70600543243529), (admiration, 33.1336321492672), (annoyance, 28.16036241829752)]"
1,009_re_cyborg,"[(disapproval, 36.023279538211014), (annoyance, 32.165396240044096), (confusion, 31.81132422174489)]"
2,00_mhz,"[(annoyance, 41.840924718059824), (admiration, 36.49896509094363), (joy, 21.660110190996555)]"
3,1,"[(admiration, 61.524699999999996), (approval, 35.5203), (love, 2.955)]"
4,1-day,"[(admiration, 41.199610661687295), (amusement, 32.26985631655974), (disappointment, 26.530533021752966)]"
...,...,...
67599,zus_and_zo_2003,"[(desire, 40.01421221408237), (amusement, 30.742439443586793), (joy, 29.243348342330837)]"
67600,zvenigora,"[(confusion, 71.48010000000001), (disapproval, 17.093200000000003), (annoyance, 11.426700000000002)]"
67601,zwei_mutter_2013,"[(approval, 47.60089193324174), (admiration, 30.408156407361503), (realization, 21.990951659396757)]"
67602,zycie_jako_smiertelna_choroba_przenoszona_droga_plciowa_2000,"[(sadness, 39.38323877121109), (approval, 38.43822776765831), (fear, 22.178533461130584)]"


In [20]:
# Merge movies and new_df

merged_df = movies.merge(new_df, on='id', how='left')

merged_df

Unnamed: 0,id,title,audienceScore,tomatoMeter,runtimeMinutes,originalLanguage,director,writer,release_year,genre,emotions
0,love_lies,"Love, Lies",43.00,65.76,120.0,Korean,"Park Heung-Sik,Heung-Sik Park","Ha Young-Joon,Jeon Yun-su,Song Hye-jin",Unknown,[Drama],"[(admiration, 78.20800919346894), (joy, 10.896268781721222), (approval, 10.89572202480984)]"
1,adrift_2018,Adrift,65.00,69.00,120.0,English,Baltasar Kormákur,"Aaron Kandell,Jordan Kandell,David Branson Smith",2018,"[Adventure, Drama, Romance]","[(admiration, 42.079146438693414), (disappointment, 29.810255887131987), (approval, 28.1105976741746)]"
2,1035316-born_to_kill,Born to Kill,74.00,83.00,92.0,English,Robert Wise,"Eve Greene,Richard Macaulay",1947,"[Crime, Drama]","[(admiration, 41.51550238655111), (disgust, 38.18461517417403), (disappointment, 20.299882439274857)]"
3,garden_murder_case,The Garden Murder Case,55.67,65.76,61.0,English,Edwin L. Marin,Unknown,2016,[Thriller],"[(amusement, 47.11795941712488), (admiration, 30.561675917621006), (approval, 22.320364665254107)]"
4,margarita_happy_hour,Margarita Happy Hour,55.67,76.00,98.0,English,Ilya Chaiken,Ilya Chaiken,2002,[Drama],"[(desire, 45.54859553528897), (admiration, 29.028624793560592), (joy, 25.42277967115044)]"
...,...,...,...,...,...,...,...,...,...,...,...
67599,operation_goldenshell,Operation Goldenshell (Operación Concha),55.67,65.76,88.0,Unknown,Antonio Cuadri,Unknown,Unknown,[Unknown],"[(amusement, 66.48219999999999), (realization, 20.131499999999996), (approval, 13.386299999999999)]"
67600,stag_night_of_the_dead,Stag Night of the Dead,43.00,65.76,81.0,English,Neil Jones,Neil Jones,2016,"[Horror, Action, Comedy]","[(approval, 60.283699999999996), (realization, 27.470399999999994), (admiration, 12.245899999999999)]"
67601,fun_size,Fun Size,47.00,25.00,86.0,English,Josh Schwartz,Max Werner,2012,"[Holiday, Comedy]","[(confusion, 48.71943200404655), (disappointment, 26.36692449980985), (amusement, 24.913643496143578)]"
67602,dassehra,Dassehra,55.67,65.76,131.0,Hindi,Manish Vatsalya,Saurabh Choudhary,2019,"[Action, Thriller]","[(approval, 71.40657548299487), (admiration, 20.783216447748366), (annoyance, 7.810208069256784)]"


In [21]:
merged_df.isna().sum()

id                  0
title               0
audienceScore       0
tomatoMeter         0
runtimeMinutes      0
originalLanguage    0
director            0
writer              0
release_year        0
genre               0
emotions            0
dtype: int64

In [22]:
merged_df.to_csv("movies_emotions.csv", index=False)

In [23]:
movies = pd.read_csv('/Users/danielebelmiro/Data Analytics Bootcamp/Rotten/movies_emotions.csv')

In [24]:
movies.head()

Unnamed: 0,id,title,audienceScore,tomatoMeter,runtimeMinutes,originalLanguage,director,writer,release_year,genre,emotions
0,love_lies,"Love, Lies",43.0,65.76,120.0,Korean,"Park Heung-Sik,Heung-Sik Park","Ha Young-Joon,Jeon Yun-su,Song Hye-jin",Unknown,['Drama'],"[('admiration', 78.20800919346894), ('joy', 10.896268781721222), ('approval', 10.89572202480984)]"
1,adrift_2018,Adrift,65.0,69.0,120.0,English,Baltasar Kormákur,"Aaron Kandell,Jordan Kandell,David Branson Smith",2018,"['Adventure', 'Drama', 'Romance']","[('admiration', 42.079146438693414), ('disappointment', 29.810255887131987), ('approval', 28.1105976741746)]"
2,1035316-born_to_kill,Born to Kill,74.0,83.0,92.0,English,Robert Wise,"Eve Greene,Richard Macaulay",1947,"['Crime', 'Drama']","[('admiration', 41.51550238655111), ('disgust', 38.18461517417403), ('disappointment', 20.299882439274857)]"
3,garden_murder_case,The Garden Murder Case,55.67,65.76,61.0,English,Edwin L. Marin,Unknown,2016,['Thriller'],"[('amusement', 47.11795941712488), ('admiration', 30.561675917621006), ('approval', 22.320364665254107)]"
4,margarita_happy_hour,Margarita Happy Hour,55.67,76.0,98.0,English,Ilya Chaiken,Ilya Chaiken,2002,['Drama'],"[('desire', 45.54859553528897), ('admiration', 29.028624793560592), ('joy', 25.42277967115044)]"


normalização dos nomes dos filmes:

In [25]:
# Function to normalize names
def normalize_name(name):
    name = re.sub(r'[^a-zA-Z0-9\s]', '', name)  # Remove special characters
    name = name.lower().strip()  # Convert to lowercase and remove extra spaces
    return name

In [26]:
# Creating normalized title colunm in the movies df

movies['title_normalized'] = movies['title'].apply(normalize_name)

#### delete duplicated normalized titles:

In [27]:
# Step 1: Group by 'id' and create the 'count' column
# Group by 'id' and count occurrences in the reviews table
reviews_count = reviews['id'].value_counts().reset_index()
reviews_count.columns = ['id', 'count']

# Add the 'count' column to the movies DataFrame
movies = movies.merge(reviews_count, on='id', how='left')

# Fill NaN values with 0 (movies with no reviews)
movies['count'] = movies['count'].fillna(0)

# Step 2: Check for duplicate titles
# Create a DataFrame with one row per movie (using the first occurrence of each 'id')
movies_unique = movies.drop_duplicates(subset=['id'], keep='first')

# Check for duplicate normalized titles
duplicates = movies_unique[movies_unique['title_normalized'].duplicated(keep=False)]
print("Duplicate titles (considering one entry per movie):")
print(duplicates[['id', 'title_normalized']].sort_values(by='title_normalized'))

# Step 3: Remove movies with fewer occurrences
# Sort movies by normalized title and review count
movies_sorted = movies.sort_values(by=['title_normalized', 'count'], ascending=[True, False])

# Keep only the first occurrence of each normalized title (the one with the most reviews)
movies_no_duplicates = movies_sorted.drop_duplicates(subset=['title_normalized'], keep='first')

# Check if there are still duplicates
if movies_no_duplicates['title_normalized'].duplicated().any():
    print("There are still duplicate titles after removal.")
else:
    print("All duplicate titles have been removed.")

# Display the final DataFrame
print("\nFinal DataFrame without duplicates:")
print(movies_no_duplicates)

Duplicate titles (considering one entry per movie):
                       id title_normalized
15730              1_2013                1
2681   1-one-human-minute                1
66029              1_2017                1
964                     1                1
50302                 ten               10
...                   ...              ...
58403            zoo_2018              zoo
15094           zoom_2016             zoom
53027           zoom_2006             zoom
40729                zulu             zulu
52700           zulu_2013             zulu

[9566 rows x 2 columns]
All duplicate titles have been removed.

Final DataFrame without duplicates:
                                                                 id  \
49201                                          my_missing_valentine   
53378                                                        00_mhz   
16516                                                 009_re_cyborg   
22292                                         

In [28]:
movies = movies_no_duplicates

In [29]:
movies.head()

Unnamed: 0,id,title,audienceScore,tomatoMeter,runtimeMinutes,originalLanguage,director,writer,release_year,genre,emotions,title_normalized,count
49201,my_missing_valentine,消失的情人節,55.67,65.76,119.0,Chinese,Yu-Hsun Chen,"Yu-Hsun Chen,Yu-Hsun Chen",Unknown,"['Comedy', 'Drama', 'Fantasy', 'Foreign', 'Romance']","[('admiration', 52.16256606223262), ('excitement', 24.98463494276986), ('joy', 22.852798994997528)]",,3
53378,00_mhz,0.0 MHz,33.0,65.76,101.0,Korean,Yoo Sun-Dong,Jang Jak,2020,['Horror'],"[('annoyance', 41.840924718059824), ('admiration', 36.49896509094363), ('joy', 21.660110190996555)]",00 mhz,3
16516,009_re_cyborg,009 Re: Cyborg,43.0,23.0,103.0,Japanese,Kenji Kamiyama,Kenji Kamiyama,2015,"['Action', 'Sci-fi', 'Animation']","[('disapproval', 36.023279538211014), ('annoyance', 32.165396240044096), ('confusion', 31.81132422174489)]",009 re cyborg,13
22292,45,0.45,38.0,65.76,101.0,English,Gary Lennon,Gary Lennon,2020,"['Crime', 'Drama', 'Thriller']","[('annoyance', 80.9424), ('anger', 14.1899), ('approval', 4.8677)]",045,1
66029,1_2017,1%,82.0,47.0,88.0,Unknown,Stephen McCallum,Unknown,Unknown,['Unknown'],"[('admiration', 40.5276835242493), ('disappointment', 30.096030871256218), ('approval', 29.376285604494484)]",1,17


In [30]:
movies.shape

(61999, 13)

In [31]:
movies[movies['id'] == '$5_a_day']

Unnamed: 0,id,title,audienceScore,tomatoMeter,runtimeMinutes,originalLanguage,director,writer,release_year,genre,emotions,title_normalized,count
60400,$5_a_day,Five Dollars a Day,49.0,65.76,98.0,English,Nigel Cole,"Neal H. Dobrofsky,Tippi Dobrofsky",2010,['Comedy'],"[('disapproval', 38.70600543243529), ('admiration', 33.1336321492672), ('annoyance', 28.16036241829752)]",five dollars a day,4


### Transformar os Dados para o Formato Wide


In [32]:
# Transformar o DataFrame para o formato wide
reviews_wide = top_3_emotions.pivot(index='id', columns='emotion', values='probability_normalized')

# Preencher valores ausentes com 0
reviews_wide = reviews_wide.fillna(0)

reviews_wide = reviews_wide.round(3)  # Reduzir para 3 casas decimais

# Exibir o DataFrame transformado
reviews_wide.head()

emotion,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,disappointment,...,joy,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
$5_a_day,33.134,0.0,0.0,28.16,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
009_re_cyborg,0.0,0.0,0.0,32.165,0.0,0.0,31.811,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00_mhz,36.499,0.0,0.0,41.841,0.0,0.0,0.0,0.0,0.0,0.0,...,21.66,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,61.525,0.0,0.0,0.0,35.52,0.0,0.0,0.0,0.0,0.0,...,0.0,2.955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1-day,41.2,32.27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.531,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
reviews_wide.shape

(67604, 27)

#### Removing IDs that are not present in the movies DataFrame from the reviews_wide DataFrame

In [34]:
# Filtrar reviews_wide para manter apenas os IDs presentes em movies
reviews_wide = reviews_wide.loc[reviews_wide.index.isin(movies['id'])]

print(reviews_wide.shape)

reviews_wide.head()

(61999, 27)


emotion,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,disappointment,...,joy,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
$5_a_day,33.134,0.0,0.0,28.16,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
009_re_cyborg,0.0,0.0,0.0,32.165,0.0,0.0,31.811,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00_mhz,36.499,0.0,0.0,41.841,0.0,0.0,0.0,0.0,0.0,0.0,...,21.66,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1-day,41.2,32.27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.531,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10-violent-women,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.545,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Cálculo de similaridade

uso de PCA (Principal Component Analysis) para reduzir a dimensionalidade dos dados antes de calcular a similaridade:

In [35]:
# Function to calculate similarity matrix based on emotion data
def calculate_similarity_matrix(reviews_wide):
    # Apply PCA to reduce dimensionality
    pca = PCA(n_components=0.97)  # Keep 97% of the variance
    reviews_reduced = pca.fit_transform(reviews_wide.values)
    
    # Calculate cosine similarity
    sim_matrix = cosine_similarity(reviews_reduced)
    
    # Convert to DataFrame
    sim_df = pd.DataFrame(sim_matrix, index=reviews_wide.index, columns=reviews_wide.index)
    
    return sim_df

In [36]:
sim_df = calculate_similarity_matrix(reviews_wide)

In [37]:
sim_df.head()

id,$5_a_day,009_re_cyborg,00_mhz,1-day,10-violent-women,1000013_12_angry_men,10000292-rat,10000594-guardian,10000604-porgy_and_bess,10000633-corrections,...,zu_warriors,zubaan,zulfiqar,zulu,zulu_dawn,zus_and_zo_2003,zvenigora,zwei_mutter_2013,zycie_jako_smiertelna_choroba_przenoszona_droga_plciowa_2000,zz_top_that_little_ol_band_from_texas
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
$5_a_day,1.0,0.642029,0.548673,0.015329,0.082268,-0.032376,-0.253434,0.4827,0.130828,-0.129166,...,-0.156126,-0.026932,0.141549,0.018491,-0.223483,-0.172544,0.151799,-0.256245,-0.356043,-0.40209
009_re_cyborg,0.642029,1.0,0.289133,-0.30766,0.199621,-0.203014,-0.042702,0.536749,-0.238149,0.108952,...,0.032571,0.245216,0.316069,0.253954,-0.387281,0.021206,0.766338,-0.396007,-0.110629,-0.138062
00_mhz,0.548673,0.289133,1.0,0.06784,-0.193498,0.002636,-0.084409,0.322849,0.182596,-0.01843,...,-0.161924,-0.011084,0.291749,0.046869,-0.178833,0.039386,0.006185,-0.213521,-0.358634,-0.099164
1-day,0.015329,-0.30766,0.06784,1.0,0.255635,0.471322,0.489206,-0.296265,0.575874,-0.561338,...,-0.018815,-0.081025,0.148875,-0.003417,-0.276043,0.083998,-0.222214,-0.316739,-0.533836,-0.049871
10-violent-women,0.082268,0.199621,-0.193498,0.255635,1.0,-0.22183,0.714519,0.207736,0.323433,-0.149357,...,0.558869,-0.086745,0.68798,-0.148026,-0.393194,-0.012994,0.079826,-0.399594,-0.134575,-0.161169


## Saving and loading the sim_df in parquet

In [32]:
'''

# Number of parts to divide the DataFrame
num_parts = 10

# Size of each part
part_size = len(sim_df) // num_parts

# Save each part individually with Dask
for i in range(num_parts):
    start = i * part_size
    end = (i + 1) * part_size if i < num_parts - 1 else len(sim_df)
    part = sim_df.iloc[start:end]
    
    # Convert the part to Dask
    dask_part = dd.from_pandas(part, npartitions=1)
    
    # Save the part in Parquet format
    dask_part.to_parquet(f"similarity_matrix_part_{i}.parquet", engine='pyarrow', compression='snappy')
    print(f"Part {i} saved with Dask and Snappy compression.")

    
'''

Part 0 saved with Dask and Snappy compression.
Part 1 saved with Dask and Snappy compression.
Part 2 saved with Dask and Snappy compression.
Part 3 saved with Dask and Snappy compression.
Part 4 saved with Dask and Snappy compression.
Part 5 saved with Dask and Snappy compression.
Part 6 saved with Dask and Snappy compression.
Part 7 saved with Dask and Snappy compression.
Part 8 saved with Dask and Snappy compression.
Part 9 saved with Dask and Snappy compression.


In [62]:
'''

# Base directory where the folders are stored
base_dir = "/Users/danielebelmiro/Data Analytics Bootcamp/Rotten/"

# List all folders matching the pattern "similarity_matrix_part_*.parquet"
folders = [os.path.join(base_dir, f"similarity_matrix_part_{i}.parquet") for i in range(10)]  # Adjust the range if there are more folders

# List all "part.0.parquet" files inside the folders
file_paths = [os.path.join(folder, "part.0.parquet") for folder in folders]

# Check if the files exist
for file in file_paths:
    if not os.path.exists(file):
        print(f"File not found: {file}")
    else:
        print(f"File found: {file}")

# Load the files with Dask
sim_df = dd.read_parquet(file_paths, engine='pyarrow')

'''

File found: /Users/danielebelmiro/Data Analytics Bootcamp/Rotten/similarity_matrix_part_0.parquet/part.0.parquet
File found: /Users/danielebelmiro/Data Analytics Bootcamp/Rotten/similarity_matrix_part_1.parquet/part.0.parquet
File found: /Users/danielebelmiro/Data Analytics Bootcamp/Rotten/similarity_matrix_part_2.parquet/part.0.parquet
File found: /Users/danielebelmiro/Data Analytics Bootcamp/Rotten/similarity_matrix_part_3.parquet/part.0.parquet
File found: /Users/danielebelmiro/Data Analytics Bootcamp/Rotten/similarity_matrix_part_4.parquet/part.0.parquet
File found: /Users/danielebelmiro/Data Analytics Bootcamp/Rotten/similarity_matrix_part_5.parquet/part.0.parquet
File found: /Users/danielebelmiro/Data Analytics Bootcamp/Rotten/similarity_matrix_part_6.parquet/part.0.parquet
File found: /Users/danielebelmiro/Data Analytics Bootcamp/Rotten/similarity_matrix_part_7.parquet/part.0.parquet
File found: /Users/danielebelmiro/Data Analytics Bootcamp/Rotten/similarity_matrix_part_8.parque

## Validating the similarity calculation

In [38]:
# Colunas de interesse
colunas = ['approval', 'amusement', 'surprise']

# Condição: Verifica se os valores nas colunas são maiores que zero
condicao = (reviews_wide[colunas] > 20)

# Filtra as linhas onde pelo menos 3 colunas atendem à condição
linhas_filtradas = reviews_wide[condicao.sum(axis=1) >= 3]

# Exibir o resultado
print(linhas_filtradas)

emotion                  admiration  amusement  anger  annoyance  approval  \
id                                                                           
10008698-king_corn              0.0     34.124    0.0        0.0    31.708   
1006250-dragnet                 0.0     33.403    0.0        0.0    28.819   
1038237-platinum_blonde         0.0     43.619    0.0        0.0    26.868   
1118698-empire                  0.0     30.693    0.0        0.0    28.538   
1120843-hunted                  0.0     36.116    0.0        0.0    31.508   
...                             ...        ...    ...        ...       ...   
used_people                     0.0     36.491    0.0        0.0    30.280   
web_junkie                      0.0     40.971    0.0        0.0    29.956   
wheres_marlowe                  0.0     33.271    0.0        0.0    20.512   
workforce                       0.0     32.062    0.0        0.0    26.473   
wrong_turn_2                    0.0     31.592    0.0        0.0

In [39]:
# veja a matriz e compare a similaridade entre filmes que você sabe que são semelhantes ou 
# diferentes com base nas porcentagens das emoções
# se dois filmes têm porcentagens muito parecidas para as mesmas emoções, a similaridade deve ser alta

# Filmes com porcentagens de emoções parecidas
filme_a = reviews_wide.loc['wrong_turn_2']
filme_b = reviews_wide.loc['10008698-king_corn']

# Verificar similaridade na matriz
similaridade = sim_df.loc['wrong_turn_2', '10008698-king_corn']
print(f"Similaridade: {similaridade}")

Similaridade: 0.9951038671755085


In [40]:
# Filmes com porcentagens de emoções diferentes
filme_a = reviews_wide.loc['katherine_ryan_glitter_room']
filme_b = reviews_wide.loc['009_re_cyborg']

# Verificar similaridade na matriz
similaridade = sim_df.loc['katherine_ryan_glitter_room', '009_re_cyborg']
print(f"Similaridade: {similaridade}")

Similaridade: -0.11369178286342177


In [42]:
# Aplicar PCA para reduzir a dimensionalidade
pca = PCA(n_components=0.97)  # Manter 97% da variância
reviews_reduced = pca.fit_transform(reviews_wide.values)

# Escolha dois filmes para comparar
filme1_id = 'katherine_ryan_glitter_room'  # Substitua pelo ID do primeiro filme
filme2_id = '009_re_cyborg'  # Substitua pelo ID do segundo filme

# Encontre os índices dos filmes na matriz reviews_wide
filme1_idx = reviews_wide.index.get_loc(filme1_id)
filme2_idx = reviews_wide.index.get_loc(filme2_id)

# Selecionar os vetores dos filmes após o PCA
filme1_vector = reviews_reduced[filme1_idx].reshape(1, -1)  # Transforma em formato (1, n_features)
filme2_vector = reviews_reduced[filme2_idx].reshape(1, -1)  # Transforma em formato (1, n_features)

# Calcular a similaridade do cosseno manualmente
similaridade_manual = cosine_similarity(filme1_vector, filme2_vector)[0][0]

# Obter o valor correspondente na matriz sim_df
similaridade_matriz = sim_df.loc[filme1_id, filme2_id]

# Exibir os resultados
print(f"Similaridade manual entre {filme1_id} e {filme2_id}: {similaridade_manual}")
print(f"Similaridade na matriz sim_df: {similaridade_matriz}")

# Verificar se os valores são iguais (ou muito próximos, devido a arredondamentos)
if np.isclose(similaridade_manual, similaridade_matriz, atol=1e-6):
    print("Os valores são iguais (dentro de uma tolerância pequena).")
else:
    print("Os valores são diferentes.")

Similaridade manual entre katherine_ryan_glitter_room e 009_re_cyborg: -0.11369178286342185
Similaridade na matriz sim_df: -0.11369178286342177
Os valores são iguais (dentro de uma tolerância pequena).


## Main recommendation function

In [70]:
def recommend_similar_movies(sim_df, movies, reviews, top_n=5):
    # Criar uma cópia do DataFrame para evitar modificar o original
    movies = movies.copy()

    # Converter a coluna 'genre' de string para lista (se necessário)
    if isinstance(movies['genre'].iloc[0], str):
        movies['genre'] = movies['genre'].apply(ast.literal_eval)

    while True:
        # Ask the user for their favorite movie
        favorite_movie = input("Enter the name of your favorite movie (or type 'exit' to quit): ").strip()

        # Allow the user to exit
        if favorite_movie.lower() == 'exit':
            print("Exiting the recommendation system. Goodbye!")
            return None

        # Normalize the movie name
        favorite_movie_normalized = normalize_name(favorite_movie)

        # Find the movie in the dataset
        matching_movies = movies[movies['title_normalized'] == favorite_movie_normalized]
        if matching_movies.empty:
            print(f"The movie '{favorite_movie}' was not found. Please check the name and try again.")
            continue  
        else:
            favorite_movie_id = matching_movies.iloc[0]['id']
            favorite_movie_title = matching_movies.iloc[0]['title']
            print(f"Found movie: {favorite_movie_title} (ID: {favorite_movie_id})")
            break  

    # Check if the movie is in the similarity matrix
    if favorite_movie_id not in sim_df.columns:
        print(f"Movie ID '{favorite_movie_id}' not found in the similarity matrix.")
        return None

    # Get similarity scores and sort by highest similarity
    movie_similarities = sim_df[favorite_movie_id].sort_values(ascending=False)

    # Remove the movie itself from recommendations
    movie_similarities = movie_similarities.drop(favorite_movie_id, errors='ignore')

    # Get genres of the favorite movie
    favorite_movie_genres = set(matching_movies['genre'].explode().values)
    print(favorite_movie_genres)

    # Filter recommendations by shared genre
    recommended_ids = movies[movies['genre'].apply(lambda genres: any(genre in favorite_movie_genres for genre in genres))]['id'].unique()
    movie_similarities = movie_similarities[movie_similarities.index.isin(recommended_ids)]

    # Get top N recommendations
    top_recommendations = movie_similarities.head(top_n).reset_index()
    top_recommendations.columns = ['id', 'similarity']

    # Merge with movie details
    recommended_movies = top_recommendations.merge(movies, on='id', how='left')

    # Select relevant columns
    result = recommended_movies[['id', 'title', 'director', 'originalLanguage', 'runtimeMinutes', 
                                 'genre', 'release_year', 'tomatoMeter', 'audienceScore', 
                                 'similarity', 'emotions']]

    # Exibir o perfil emocional do filme favorito
    
    favorite_movie_emotions = matching_movies.iloc[0]['emotions']
    print(f"Emotional profile of '{favorite_movie_title}':")  
    if isinstance(favorite_movie_emotions, list):
        print(f"   ❤️ Emotions: {', '.join([f'{mood} ({percentage:.1f}%)' for mood, percentage in favorite_movie_emotions])}")
    else:
        print(f"   ❤️ Emotions: {favorite_movie_emotions}")  
    print("-" * 50)
    
    
    # Print formatted recommendations
    print(f"\nTop {top_n} movie recommendations based on '{favorite_movie_title}':\n")


    # Exibir as recomendações
    for _, row in result.iterrows():
        print(f"🎬 Movie: {row['title']}")
        print(f"   🎬 Director: {row['director']}")
        print(f"   🌍 Language: {row['originalLanguage']}")
        print(f"   ⏳ Duration: {row['runtimeMinutes']} min")
        print(f"   🎭 Genre: {', '.join(row['genre'])}")
        print(f"   📅 Year: {row['release_year']}")
        print(f"   🍅 Tomatometer: {row['tomatoMeter']}%")
        print(f"   🎟️ Audience Score: {row['audienceScore']}%")
        print(f"   🔗 Similarity Score: {row['similarity']:.5f}")
        if isinstance(row['emotions'], list):
            print(f"   ❤️ Emotions: {', '.join([f'{mood} ({percentage:.1f}%)' for mood, percentage in row['emotions']])}")
        else:
            print(f"   ❤️ Emotions: {row['emotions']}")  # Exibe o valor real para depuração
        print("-" * 50)

    return result

# Generate recommendations

In [71]:
recommendations = recommend_similar_movies(sim_df, movies, reviews, top_n=5)

Enter the name of your favorite movie (or type 'exit' to quit): lost in translation
Found movie: Lost in Translation (ID: lost_in_translation)
{'Drama', 'Comedy'}
Emotional profile of 'Lost in Translation':
   ❤️ Emotions: fear (35.9%), admiration (32.3%), remorse (31.9%)
--------------------------------------------------

Top 5 movie recommendations based on 'Lost in Translation':

🎬 Movie: The Tree of Life
   🎬 Director: Terrence Malick
   🌍 Language: English
   ⏳ Duration: 138.0 min
   🎭 Genre: Drama
   📅 Year: 2011
   🍅 Tomatometer: 84.0%
   🎟️ Audience Score: 60.0%
   🔗 Similarity Score: 0.99979
   ❤️ Emotions: fear (35.1%), admiration (32.9%), remorse (32.0%)
--------------------------------------------------
🎬 Movie: The Company Men
   🎬 Director: John Wells
   🌍 Language: English
   ⏳ Duration: 115.0 min
   🎭 Genre: Drama
   📅 Year: 2011
   🍅 Tomatometer: 67.0%
   🎟️ Audience Score: 55.0%
   🔗 Similarity Score: 0.99975
   ❤️ Emotions: fear (35.1%), admiration (33.0%), remorse (

In [72]:
recommendations

Unnamed: 0,id,title,director,originalLanguage,runtimeMinutes,genre,release_year,tomatoMeter,audienceScore,similarity,emotions
0,the_tree_of_life_2011,The Tree of Life,Terrence Malick,English,138.0,[Drama],2011,84.0,60.0,0.999792,"[(fear, 35.12940069162826), (admiration, 32.89273385594293), (remorse, 31.977865452428812)]"
1,the-company-men,The Company Men,John Wells,English,115.0,[Drama],2011,67.0,55.0,0.99975,"[(fear, 35.09066671385157), (admiration, 32.967885455208354), (remorse, 31.941447830940064)]"
2,just_cause,Just Cause,Arne Glimcher,English,102.0,[Drama],1999,26.0,46.0,0.999418,"[(remorse, 34.66364460858055), (fear, 33.58456095375457), (admiration, 31.75179443766488)]"
3,the_goldfinch,The Goldfinch,John Crowley,English,149.0,[Drama],2019,25.0,72.0,0.999332,"[(fear, 37.57821138738658), (admiration, 31.44746016636856), (remorse, 30.974328446244865)]"
4,this_is_not_a_war_story,This Is Not a War Story,Talia Lugacy,English,96.0,[Drama],Unknown,100.0,55.67,0.999046,"[(remorse, 34.05011972384762), (fear, 33.30487621518456), (admiration, 32.645004060967814)]"
