In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import defaultdict
import ast
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
import re
from fuzzywuzzy import process


# Load the DataFrame from a CSV file
reviews = pd.read_csv('/Users/danielebelmiro/Data Analytics Bootcamp/Rotten/reviews_emotions.csv')

In [2]:
reviews

Unnamed: 0,id,title,reviewId,creationDate,criticName,isTopCritic,reviewState,publicatioName,reviewText,scoreSentiment,cleanedReviewText,predicted_moods
0,beavers,Beavers,1145982,2003-05-23,Ivan M. Lincoln,False,fresh,Deseret News (Salt Lake City),Timed to be just long enough for most youngste...,POSITIVE,Timed to be just long enough for most youngste...,"[('excitement', 35.543839830098335), ('approva..."
1,blood_mask,Blood Mask,1636744,2007-06-02,The Foywonder,False,rotten,Dread Central,It doesn't matter if a movie costs 300 million...,NEGATIVE,It doesnt matter if a movie costs 300 million ...,"[('disapproval', 56.05641547237329), ('annoyan..."
2,city_hunter_shinjuku_private_eyes,City Hunter: Shinjuku Private Eyes,2590987,2019-05-28,Reuben Baron,False,fresh,CBR,The choreography is so precise and lifelike at...,POSITIVE,The choreography is so precise and lifelike at...,"[('admiration', 80.08288365216127), ('approval..."
3,city_hunter_shinjuku_private_eyes,City Hunter: Shinjuku Private Eyes,2558908,2019-02-14,Matt Schley,False,rotten,Japan Times,The film's out-of-touch attempts at humor may ...,NEGATIVE,The films outoftouch attempts at humor may fin...,"[('amusement', 22.690977576957067), ('realizat..."
4,dangerous_men_2015,Dangerous Men,2504681,2018-08-29,Pat Padua,False,fresh,DCist,Its clumsy determination is endearing and some...,POSITIVE,Its clumsy determination is endearing and some...,"[('amusement', 49.197768434566136), ('admirati..."
...,...,...,...,...,...,...,...,...,...,...,...,...
1363574,thor_love_and_thunder,Thor: Love and Thunder,102706151,2022-07-05,Christie Cronan,False,fresh,Raising Whasians,Solid but not totally sold&#44; Thor&#58; Ragn...,POSITIVE,Solid but not totally sold Thor Ragnarok still...,"[('admiration', 46.784670468845704), ('disappo..."
1363575,thor_love_and_thunder,Thor: Love and Thunder,102706150,2022-07-05,Ian Sandwell,False,fresh,Digital Spy,Thor&#58; Love and Thunder is the most enterta...,POSITIVE,Thor Love and Thunder is the most entertaining...,"[('admiration', 36.97271463016962), ('amusemen..."
1363576,thor_love_and_thunder,Thor: Love and Thunder,102706149,2022-07-05,Lauren LaMagna,False,fresh,Next Best Picture,&quot;Thor&#58; Love and Thunder&quot; is a st...,POSITIVE,Thor Love and Thunder is a stepup from Thor Ra...,"[('approval', 51.977043591835596), ('love', 18..."
1363577,thor_love_and_thunder,Thor: Love and Thunder,102706148,2022-07-05,Jake Cole,True,rotten,Slant Magazine,Across Taika Waititi&#8217;s film&#44; a war a...,NEGATIVE,Across Taika Waititis film a war against the g...,"[('disappointment', 31.922172596719218), ('ann..."


In [3]:
movies = pd.read_csv('/Users/danielebelmiro/Data Analytics Bootcamp/Rotten/clean_movies.csv')

In [4]:
movies

Unnamed: 0,id,title,audienceScore,tomatoMeter,runtimeMinutes,genre,originalLanguage,director,writer,release_year
0,love_lies,"Love, Lies",43.00,65.76,120.0,Drama,Korean,"Park Heung-Sik,Heung-Sik Park","Ha Young-Joon,Jeon Yun-su,Song Hye-jin",Unknown
1,adrift_2018,Adrift,65.00,69.00,93.0,Adventure,English,Baltasar Kormákur,"Aaron Kandell,Jordan Kandell,David Branson Smith",2018
2,adrift_2018,Adrift,65.00,69.00,93.0,Drama,English,Baltasar Kormákur,"Aaron Kandell,Jordan Kandell,David Branson Smith",2018
3,adrift_2018,Adrift,65.00,69.00,93.0,Romance,English,Baltasar Kormákur,"Aaron Kandell,Jordan Kandell,David Branson Smith",2018
4,1035316-born_to_kill,Born to Kill,74.00,83.00,92.0,Crime,English,Robert Wise,"Eve Greene,Richard Macaulay",1947
...,...,...,...,...,...,...,...,...,...,...
105655,fun_size,Fun Size,47.00,25.00,86.0,Holiday,English,Josh Schwartz,Max Werner,2012
105656,fun_size,Fun Size,47.00,25.00,86.0,Comedy,English,Josh Schwartz,Max Werner,2012
105657,dassehra,Dassehra,55.67,65.76,131.0,Action,Hindi,Manish Vatsalya,Saurabh Choudhary,2019
105658,dassehra,Dassehra,55.67,65.76,131.0,Thriller,Hindi,Manish Vatsalya,Saurabh Choudhary,2019


In [5]:
movies['id'].nunique()

67604

#### Calculating the average emotions per movie:

### Explode and Split the Data
First, explode and split the predicted_moods column into emotion and probability:

In [6]:
# Convert tuple strings into actual tuples
reviews['predicted_moods'] = reviews['predicted_moods'].apply(ast.literal_eval)

# Explode the column
reviews_exploded = reviews.explode('predicted_moods')

# Split the tuples into separate columns
reviews_exploded[['emotion', 'probability']] = pd.DataFrame(reviews_exploded['predicted_moods'].tolist(), index=reviews_exploded.index)
reviews_exploded = reviews_exploded.drop(columns=['predicted_moods'])

# Convert 'probability' to numeric
reviews_exploded['probability'] = pd.to_numeric(reviews_exploded['probability'])

# Verify the result
reviews_exploded

Unnamed: 0,id,title,reviewId,creationDate,criticName,isTopCritic,reviewState,publicatioName,reviewText,scoreSentiment,cleanedReviewText,emotion,probability
0,beavers,Beavers,1145982,2003-05-23,Ivan M. Lincoln,False,fresh,Deseret News (Salt Lake City),Timed to be just long enough for most youngste...,POSITIVE,Timed to be just long enough for most youngste...,excitement,35.543840
0,beavers,Beavers,1145982,2003-05-23,Ivan M. Lincoln,False,fresh,Deseret News (Salt Lake City),Timed to be just long enough for most youngste...,POSITIVE,Timed to be just long enough for most youngste...,approval,33.460199
0,beavers,Beavers,1145982,2003-05-23,Ivan M. Lincoln,False,fresh,Deseret News (Salt Lake City),Timed to be just long enough for most youngste...,POSITIVE,Timed to be just long enough for most youngste...,admiration,14.496543
1,blood_mask,Blood Mask,1636744,2007-06-02,The Foywonder,False,rotten,Dread Central,It doesn't matter if a movie costs 300 million...,NEGATIVE,It doesnt matter if a movie costs 300 million ...,disapproval,56.056415
1,blood_mask,Blood Mask,1636744,2007-06-02,The Foywonder,False,rotten,Dread Central,It doesn't matter if a movie costs 300 million...,NEGATIVE,It doesnt matter if a movie costs 300 million ...,annoyance,16.554233
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1363577,thor_love_and_thunder,Thor: Love and Thunder,102706148,2022-07-05,Jake Cole,True,rotten,Slant Magazine,Across Taika Waititi&#8217;s film&#44; a war a...,NEGATIVE,Across Taika Waititis film a war against the g...,annoyance,29.755378
1363577,thor_love_and_thunder,Thor: Love and Thunder,102706148,2022-07-05,Jake Cole,True,rotten,Slant Magazine,Across Taika Waititi&#8217;s film&#44; a war a...,NEGATIVE,Across Taika Waititis film a war against the g...,disapproval,16.977576
1363578,thor_love_and_thunder,Thor: Love and Thunder,102706147,2022-07-05,Roger Moore,False,fresh,Movie Nation,It&#8217;s the jokes that make it&#44; with th...,POSITIVE,Its the jokes that make it with the selfmockin...,amusement,66.380384
1363578,thor_love_and_thunder,Thor: Love and Thunder,102706147,2022-07-05,Roger Moore,False,fresh,Movie Nation,It&#8217;s the jokes that make it&#44; with th...,POSITIVE,Its the jokes that make it with the selfmockin...,joy,22.703058


### Normalize Probabilities to Sum to 100%

In [7]:
# Normalize probabilities to sum to 100%
sum_probabilities = reviews_exploded.groupby('reviewId')['probability'].transform('sum')
reviews_exploded['probability_normalized'] = (reviews_exploded['probability'] / sum_probabilities) * 100

# Round the normalized probabilities to 4 decimal places
reviews_exploded['probability_normalized'] = reviews_exploded['probability_normalized'].round(4)

# Verify the result
reviews_exploded

Unnamed: 0,id,title,reviewId,creationDate,criticName,isTopCritic,reviewState,publicatioName,reviewText,scoreSentiment,cleanedReviewText,emotion,probability,probability_normalized
0,beavers,Beavers,1145982,2003-05-23,Ivan M. Lincoln,False,fresh,Deseret News (Salt Lake City),Timed to be just long enough for most youngste...,POSITIVE,Timed to be just long enough for most youngste...,excitement,35.543840,42.5672
0,beavers,Beavers,1145982,2003-05-23,Ivan M. Lincoln,False,fresh,Deseret News (Salt Lake City),Timed to be just long enough for most youngste...,POSITIVE,Timed to be just long enough for most youngste...,approval,33.460199,40.0718
0,beavers,Beavers,1145982,2003-05-23,Ivan M. Lincoln,False,fresh,Deseret News (Salt Lake City),Timed to be just long enough for most youngste...,POSITIVE,Timed to be just long enough for most youngste...,admiration,14.496543,17.3610
1,blood_mask,Blood Mask,1636744,2007-06-02,The Foywonder,False,rotten,Dread Central,It doesn't matter if a movie costs 300 million...,NEGATIVE,It doesnt matter if a movie costs 300 million ...,disapproval,56.056415,70.6446
1,blood_mask,Blood Mask,1636744,2007-06-02,The Foywonder,False,rotten,Dread Central,It doesn't matter if a movie costs 300 million...,NEGATIVE,It doesnt matter if a movie costs 300 million ...,annoyance,16.554233,20.8623
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1363577,thor_love_and_thunder,Thor: Love and Thunder,102706148,2022-07-05,Jake Cole,True,rotten,Slant Magazine,Across Taika Waititi&#8217;s film&#44; a war a...,NEGATIVE,Across Taika Waititis film a war against the g...,annoyance,29.755378,37.8302
1363577,thor_love_and_thunder,Thor: Love and Thunder,102706148,2022-07-05,Jake Cole,True,rotten,Slant Magazine,Across Taika Waititi&#8217;s film&#44; a war a...,NEGATIVE,Across Taika Waititis film a war against the g...,disapproval,16.977576,21.5848
1363578,thor_love_and_thunder,Thor: Love and Thunder,102706147,2022-07-05,Roger Moore,False,fresh,Movie Nation,It&#8217;s the jokes that make it&#44; with th...,POSITIVE,Its the jokes that make it with the selfmockin...,amusement,66.380384,71.3199
1363578,thor_love_and_thunder,Thor: Love and Thunder,102706147,2022-07-05,Roger Moore,False,fresh,Movie Nation,It&#8217;s the jokes that make it&#44; with th...,POSITIVE,Its the jokes that make it with the selfmockin...,joy,22.703058,24.3925


In [8]:
# Verify the sum of normalized probabilities
sum_normalized = reviews_exploded.groupby('reviewId')['probability_normalized'].sum()

# Round the sums to 2 decimal places for display
sum_normalized = sum_normalized.round(2)

# Check unique values
print(sum_normalized.unique())

[100.]


## group by id and emotion and calculate the mean probability

In [9]:
# Agrupar por 'id' e 'emotion', calculando a média das probabilidades
reviews_grouped = reviews_exploded.groupby(['id', 'emotion'])['probability_normalized'].mean().reset_index()

# Verify the result
reviews_grouped.head(10)

Unnamed: 0,id,emotion,probability_normalized
0,$5_a_day,admiration,43.152
1,$5_a_day,amusement,33.83
2,$5_a_day,annoyance,36.675
3,$5_a_day,approval,34.83245
4,$5_a_day,disappointment,32.6701
5,$5_a_day,disapproval,50.40925
6,$5_a_day,joy,21.3234
7,$5_a_day,optimism,11.2248
8,$5_a_day,realization,7.4892
9,009_re_cyborg,admiration,49.29476


In [10]:
pd.set_option('display.max_colwidth', None)

reviews_exploded.loc[reviews_exploded['id'] == '$5_a_day']

Unnamed: 0,id,title,reviewId,creationDate,criticName,isTopCritic,reviewState,publicatioName,reviewText,scoreSentiment,cleanedReviewText,emotion,probability,probability_normalized
1213414,$5_a_day,Five Dollars a Day,2097498,2012-08-01,Kevin Carr,False,rotten,7M Pictures,"$5 a Day isn't perfect, but it does examine some of the issues that we have when connecting with our parents as adults.",NEGATIVE,5 a Day isnt perfect but it does examine some of the issues that we have when connecting with our parents as adults,disapproval,60.371897,70.1637
1213414,$5_a_day,Five Dollars a Day,2097498,2012-08-01,Kevin Carr,False,rotten,7M Pictures,"$5 a Day isn't perfect, but it does examine some of the issues that we have when connecting with our parents as adults.",NEGATIVE,5 a Day isnt perfect but it does examine some of the issues that we have when connecting with our parents as adults,approval,19.228371,22.3471
1213414,$5_a_day,Five Dollars a Day,2097498,2012-08-01,Kevin Carr,False,rotten,7M Pictures,"$5 a Day isn't perfect, but it does examine some of the issues that we have when connecting with our parents as adults.",NEGATIVE,5 a Day isnt perfect but it does examine some of the issues that we have when connecting with our parents as adults,realization,6.44405,7.4892
1213415,$5_a_day,Five Dollars a Day,1929215,2010-09-09,Brian Orndorf,False,rotten,DVDTalk.com,"Dreadfully formulaic and absent a thoughtful emotional core, the picture is best valued as a forgettable trifle starring Hollywood's most enduring weirdo.",NEGATIVE,Dreadfully formulaic and absent a thoughtful emotional core the picture is best valued as a forgettable trifle starring Hollywoods most enduring weirdo,annoyance,24.614549,36.675
1213415,$5_a_day,Five Dollars a Day,1929215,2010-09-09,Brian Orndorf,False,rotten,DVDTalk.com,"Dreadfully formulaic and absent a thoughtful emotional core, the picture is best valued as a forgettable trifle starring Hollywood's most enduring weirdo.",NEGATIVE,Dreadfully formulaic and absent a thoughtful emotional core the picture is best valued as a forgettable trifle starring Hollywoods most enduring weirdo,disappointment,21.926653,32.6701
1213415,$5_a_day,Five Dollars a Day,1929215,2010-09-09,Brian Orndorf,False,rotten,DVDTalk.com,"Dreadfully formulaic and absent a thoughtful emotional core, the picture is best valued as a forgettable trifle starring Hollywood's most enduring weirdo.",NEGATIVE,Dreadfully formulaic and absent a thoughtful emotional core the picture is best valued as a forgettable trifle starring Hollywoods most enduring weirdo,disapproval,20.574082,30.6548
1213416,$5_a_day,Five Dollars a Day,1924503,2010-08-19,Jules Brenner,False,fresh,Cinema Signals,The success of the piece rests on Nivola's calm adaptability as fatherly hi-jinx tries his patience and makes its mark on his heart.,POSITIVE,The success of the piece rests on Nivolas calm adaptability as fatherly hijinx tries his patience and makes its mark on his heart,approval,37.7846,47.3178
1213416,$5_a_day,Five Dollars a Day,1924503,2010-08-19,Jules Brenner,False,fresh,Cinema Signals,The success of the piece rests on Nivola's calm adaptability as fatherly hi-jinx tries his patience and makes its mark on his heart.,POSITIVE,The success of the piece rests on Nivolas calm adaptability as fatherly hijinx tries his patience and makes its mark on his heart,admiration,33.104835,41.4574
1213416,$5_a_day,Five Dollars a Day,1924503,2010-08-19,Jules Brenner,False,fresh,Cinema Signals,The success of the piece rests on Nivola's calm adaptability as fatherly hi-jinx tries his patience and makes its mark on his heart.,POSITIVE,The success of the piece rests on Nivolas calm adaptability as fatherly hijinx tries his patience and makes its mark on his heart,optimism,8.963305,11.2248
1213417,$5_a_day,Five Dollars a Day,1780984,2008-11-16,David Nusair,False,fresh,Reel Film Reviews,...very amusing and agreeable...,POSITIVE,very amusing and agreeable,admiration,40.27613,44.8466


In [11]:
# Selecionar as 3 emoções com as maiores médias para cada filme
top_3_emotions = reviews_grouped.groupby('id').apply(
    lambda x: x.nlargest(3, 'probability_normalized')
).reset_index(drop=True)

In [12]:
top_3_emotions

Unnamed: 0,id,emotion,probability_normalized
0,$5_a_day,disapproval,50.40925
1,$5_a_day,admiration,43.15200
2,$5_a_day,annoyance,36.67500
3,009_re_cyborg,disapproval,56.85230
4,009_re_cyborg,annoyance,50.76375
...,...,...,...
202807,zycie_jako_smiertelna_choroba_przenoszona_droga_plciowa_2000,approval,58.79150
202808,zycie_jako_smiertelna_choroba_przenoszona_droga_plciowa_2000,fear,33.92220
202809,zz_top_that_little_ol_band_from_texas,amusement,57.87560
202810,zz_top_that_little_ol_band_from_texas,joy,56.15785


In [13]:
# Normalizar as probabilidades para somar 100% para cada filme
top_3_emotions['probability_normalized'] = top_3_emotions.groupby('id')['probability_normalized'].transform(
    lambda x: (x / x.sum()) * 100
)

# Exibir o resultado
print(top_3_emotions.head())

              id      emotion  probability_normalized
0       $5_a_day  disapproval               38.706005
1       $5_a_day   admiration               33.133632
2       $5_a_day    annoyance               28.160362
3  009_re_cyborg  disapproval               36.023280
4  009_re_cyborg    annoyance               32.165396


In [14]:
top_3_emotions

Unnamed: 0,id,emotion,probability_normalized
0,$5_a_day,disapproval,38.706005
1,$5_a_day,admiration,33.133632
2,$5_a_day,annoyance,28.160362
3,009_re_cyborg,disapproval,36.023280
4,009_re_cyborg,annoyance,32.165396
...,...,...,...
202807,zycie_jako_smiertelna_choroba_przenoszona_droga_plciowa_2000,approval,38.438228
202808,zycie_jako_smiertelna_choroba_przenoszona_droga_plciowa_2000,fear,22.178533
202809,zz_top_that_little_ol_band_from_texas,amusement,35.673589
202810,zz_top_that_little_ol_band_from_texas,joy,34.614795


In [15]:
# Agrupar por 'id' e criar a lista de tuplas (emotion, probability_normalized)
new_df = top_3_emotions.groupby('id').apply(
    lambda x: list(zip(x['emotion'], x['probability_normalized'])))
new_df = new_df.reset_index()
new_df.columns = ['id', 'emotions']

new_df

Unnamed: 0,id,emotions
0,$5_a_day,"[(disapproval, 38.70600543243529), (admiration, 33.1336321492672), (annoyance, 28.16036241829752)]"
1,009_re_cyborg,"[(disapproval, 36.023279538211014), (annoyance, 32.165396240044096), (confusion, 31.81132422174489)]"
2,00_mhz,"[(annoyance, 41.840924718059824), (admiration, 36.49896509094363), (joy, 21.660110190996555)]"
3,1,"[(admiration, 61.524699999999996), (approval, 35.5203), (love, 2.955)]"
4,1-day,"[(admiration, 41.199610661687295), (amusement, 32.26985631655974), (disappointment, 26.530533021752966)]"
...,...,...
67599,zus_and_zo_2003,"[(desire, 40.01421221408237), (amusement, 30.742439443586793), (joy, 29.243348342330837)]"
67600,zvenigora,"[(confusion, 71.48010000000001), (disapproval, 17.093200000000003), (annoyance, 11.426700000000002)]"
67601,zwei_mutter_2013,"[(approval, 47.60089193324174), (admiration, 30.408156407361503), (realization, 21.990951659396757)]"
67602,zycie_jako_smiertelna_choroba_przenoszona_droga_plciowa_2000,"[(sadness, 39.38323877121109), (approval, 38.43822776765831), (fear, 22.178533461130584)]"


In [16]:
# merge entre movies e new_df

merged_df = movies.merge(new_df, on='id', how='left')

merged_df

Unnamed: 0,id,title,audienceScore,tomatoMeter,runtimeMinutes,genre,originalLanguage,director,writer,release_year,emotions
0,love_lies,"Love, Lies",43.00,65.76,120.0,Drama,Korean,"Park Heung-Sik,Heung-Sik Park","Ha Young-Joon,Jeon Yun-su,Song Hye-jin",Unknown,"[(admiration, 78.20800919346894), (joy, 10.896268781721222), (approval, 10.89572202480984)]"
1,adrift_2018,Adrift,65.00,69.00,93.0,Adventure,English,Baltasar Kormákur,"Aaron Kandell,Jordan Kandell,David Branson Smith",2018,"[(admiration, 42.079146438693414), (disappointment, 29.810255887131987), (approval, 28.1105976741746)]"
2,adrift_2018,Adrift,65.00,69.00,93.0,Drama,English,Baltasar Kormákur,"Aaron Kandell,Jordan Kandell,David Branson Smith",2018,"[(admiration, 42.079146438693414), (disappointment, 29.810255887131987), (approval, 28.1105976741746)]"
3,adrift_2018,Adrift,65.00,69.00,93.0,Romance,English,Baltasar Kormákur,"Aaron Kandell,Jordan Kandell,David Branson Smith",2018,"[(admiration, 42.079146438693414), (disappointment, 29.810255887131987), (approval, 28.1105976741746)]"
4,1035316-born_to_kill,Born to Kill,74.00,83.00,92.0,Crime,English,Robert Wise,"Eve Greene,Richard Macaulay",1947,"[(admiration, 41.51550238655111), (disgust, 38.18461517417403), (disappointment, 20.299882439274857)]"
...,...,...,...,...,...,...,...,...,...,...,...
105655,fun_size,Fun Size,47.00,25.00,86.0,Holiday,English,Josh Schwartz,Max Werner,2012,"[(confusion, 48.71943200404655), (disappointment, 26.36692449980985), (amusement, 24.913643496143578)]"
105656,fun_size,Fun Size,47.00,25.00,86.0,Comedy,English,Josh Schwartz,Max Werner,2012,"[(confusion, 48.71943200404655), (disappointment, 26.36692449980985), (amusement, 24.913643496143578)]"
105657,dassehra,Dassehra,55.67,65.76,131.0,Action,Hindi,Manish Vatsalya,Saurabh Choudhary,2019,"[(approval, 71.40657548299487), (admiration, 20.783216447748366), (annoyance, 7.810208069256784)]"
105658,dassehra,Dassehra,55.67,65.76,131.0,Thriller,Hindi,Manish Vatsalya,Saurabh Choudhary,2019,"[(approval, 71.40657548299487), (admiration, 20.783216447748366), (annoyance, 7.810208069256784)]"


In [17]:
movies[movies['id'] == 'dinosaur_island_2002']

Unnamed: 0,id,title,audienceScore,tomatoMeter,runtimeMinutes,genre,originalLanguage,director,writer,release_year


In [18]:
merged_df.isna().sum()

id                  0
title               0
audienceScore       0
tomatoMeter         0
runtimeMinutes      0
genre               0
originalLanguage    0
director            0
writer              0
release_year        0
emotions            0
dtype: int64

In [19]:
merged_df.head()

Unnamed: 0,id,title,audienceScore,tomatoMeter,runtimeMinutes,genre,originalLanguage,director,writer,release_year,emotions
0,love_lies,"Love, Lies",43.0,65.76,120.0,Drama,Korean,"Park Heung-Sik,Heung-Sik Park","Ha Young-Joon,Jeon Yun-su,Song Hye-jin",Unknown,"[(admiration, 78.20800919346894), (joy, 10.896268781721222), (approval, 10.89572202480984)]"
1,adrift_2018,Adrift,65.0,69.0,93.0,Adventure,English,Baltasar Kormákur,"Aaron Kandell,Jordan Kandell,David Branson Smith",2018,"[(admiration, 42.079146438693414), (disappointment, 29.810255887131987), (approval, 28.1105976741746)]"
2,adrift_2018,Adrift,65.0,69.0,93.0,Drama,English,Baltasar Kormákur,"Aaron Kandell,Jordan Kandell,David Branson Smith",2018,"[(admiration, 42.079146438693414), (disappointment, 29.810255887131987), (approval, 28.1105976741746)]"
3,adrift_2018,Adrift,65.0,69.0,93.0,Romance,English,Baltasar Kormákur,"Aaron Kandell,Jordan Kandell,David Branson Smith",2018,"[(admiration, 42.079146438693414), (disappointment, 29.810255887131987), (approval, 28.1105976741746)]"
4,1035316-born_to_kill,Born to Kill,74.0,83.0,92.0,Crime,English,Robert Wise,"Eve Greene,Richard Macaulay",1947,"[(admiration, 41.51550238655111), (disgust, 38.18461517417403), (disappointment, 20.299882439274857)]"


In [20]:
# merged_df.to_csv("movies_emotions.csv", index=False)

In [21]:
movies = pd.read_csv('/Users/danielebelmiro/Data Analytics Bootcamp/Rotten/movies_emotions.csv')

In [22]:
movies.head()

Unnamed: 0,id,title,audienceScore,tomatoMeter,runtimeMinutes,genre,originalLanguage,director,writer,release_year,emotions
0,love_lies,"Love, Lies",43.0,65.76,120.0,Drama,Korean,"Park Heung-Sik,Heung-Sik Park","Ha Young-Joon,Jeon Yun-su,Song Hye-jin",Unknown,"[('admiration', 78.20800919346894), ('joy', 10.896268781721222), ('approval', 10.89572202480984)]"
1,adrift_2018,Adrift,65.0,69.0,93.0,Adventure,English,Baltasar Kormákur,"Aaron Kandell,Jordan Kandell,David Branson Smith",2018,"[('admiration', 42.079146438693414), ('disappointment', 29.810255887131987), ('approval', 28.1105976741746)]"
2,adrift_2018,Adrift,65.0,69.0,93.0,Drama,English,Baltasar Kormákur,"Aaron Kandell,Jordan Kandell,David Branson Smith",2018,"[('admiration', 42.079146438693414), ('disappointment', 29.810255887131987), ('approval', 28.1105976741746)]"
3,adrift_2018,Adrift,65.0,69.0,93.0,Romance,English,Baltasar Kormákur,"Aaron Kandell,Jordan Kandell,David Branson Smith",2018,"[('admiration', 42.079146438693414), ('disappointment', 29.810255887131987), ('approval', 28.1105976741746)]"
4,1035316-born_to_kill,Born to Kill,74.0,83.0,92.0,Crime,English,Robert Wise,"Eve Greene,Richard Macaulay",1947,"[('admiration', 41.51550238655111), ('disgust', 38.18461517417403), ('disappointment', 20.299882439274857)]"


### Transformar os Dados para o Formato Wide


In [23]:
# Transformar o DataFrame para o formato wide
reviews_wide = top_3_emotions.pivot(index='id', columns='emotion', values='probability_normalized')

# Preencher valores ausentes com 0
reviews_wide = reviews_wide.fillna(0)

reviews_wide = reviews_wide.round(3)  # Reduzir para 3 casas decimais

# Exibir o DataFrame transformado
reviews_wide.head()

emotion,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,disappointment,...,joy,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
$5_a_day,33.134,0.0,0.0,28.16,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
009_re_cyborg,0.0,0.0,0.0,32.165,0.0,0.0,31.811,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00_mhz,36.499,0.0,0.0,41.841,0.0,0.0,0.0,0.0,0.0,0.0,...,21.66,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,61.525,0.0,0.0,0.0,35.52,0.0,0.0,0.0,0.0,0.0,...,0.0,2.955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1-day,41.2,32.27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.531,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Cálculo de similaridade

normalização dos nomes dos filmes e a busca aproximada usando a biblioteca fuzzywuzzy:

In [24]:
# Function to normalize names
def normalize_name(name):
    name = re.sub(r'[^a-zA-Z0-9\s]', '', name)  # Remove special characters
    name = name.lower().strip()  # Convert to lowercase and remove extra spaces
    return name

In [25]:
# Function to find the closest matching movie
def find_closest_movie(movie_name, sim_df):
    # Normalize the movie name
    movie_name = normalize_name(movie_name)
    
    # Find the closest match
    match, score = process.extractOne(movie_name, sim_df.index)
    
    if score >= 80:  # Adjust the similarity threshold as needed
        return match  # Return the matched title
    else:
        return None

uso de PCA (Principal Component Analysis) para reduzir a dimensionalidade dos dados antes de calcular a similaridade:

In [26]:
# Function to calculate similarity matrix based on emotion data
def calculate_similarity_matrix(reviews_wide):
    # Apply PCA to reduce dimensionality
    pca = PCA(n_components=0.97)  # Keep 97% of the variance
    reviews_reduced = pca.fit_transform(reviews_wide.values)
    
    # Calculate cosine similarity
    sim_matrix = cosine_similarity(reviews_reduced)
    
    # Convert to DataFrame
    sim_df = pd.DataFrame(sim_matrix, index=reviews_wide.index, columns=reviews_wide.index)
    
    return sim_df

In [27]:
# Create normalized title colunm in every df

reviews['title_normalized'] = reviews['title'].apply(normalize_name)
movies['title_normalized'] = movies['title'].apply(normalize_name)

In [28]:
movies.head()

Unnamed: 0,id,title,audienceScore,tomatoMeter,runtimeMinutes,genre,originalLanguage,director,writer,release_year,emotions,title_normalized
0,love_lies,"Love, Lies",43.0,65.76,120.0,Drama,Korean,"Park Heung-Sik,Heung-Sik Park","Ha Young-Joon,Jeon Yun-su,Song Hye-jin",Unknown,"[('admiration', 78.20800919346894), ('joy', 10.896268781721222), ('approval', 10.89572202480984)]",love lies
1,adrift_2018,Adrift,65.0,69.0,93.0,Adventure,English,Baltasar Kormákur,"Aaron Kandell,Jordan Kandell,David Branson Smith",2018,"[('admiration', 42.079146438693414), ('disappointment', 29.810255887131987), ('approval', 28.1105976741746)]",adrift
2,adrift_2018,Adrift,65.0,69.0,93.0,Drama,English,Baltasar Kormákur,"Aaron Kandell,Jordan Kandell,David Branson Smith",2018,"[('admiration', 42.079146438693414), ('disappointment', 29.810255887131987), ('approval', 28.1105976741746)]",adrift
3,adrift_2018,Adrift,65.0,69.0,93.0,Romance,English,Baltasar Kormákur,"Aaron Kandell,Jordan Kandell,David Branson Smith",2018,"[('admiration', 42.079146438693414), ('disappointment', 29.810255887131987), ('approval', 28.1105976741746)]",adrift
4,1035316-born_to_kill,Born to Kill,74.0,83.0,92.0,Crime,English,Robert Wise,"Eve Greene,Richard Macaulay",1947,"[('admiration', 41.51550238655111), ('disgust', 38.18461517417403), ('disappointment', 20.299882439274857)]",born to kill


In [29]:
sim_df = calculate_similarity_matrix(reviews_wide)

In [31]:
sim_df.head()

id,$5_a_day,009_re_cyborg,00_mhz,1,1-day,1-one-human-minute,10,10-violent-women,1000013_12_angry_men,10000292-rat,...,zubaan,zulfiqar,zulu,zulu_2013,zulu_dawn,zus_and_zo_2003,zvenigora,zwei_mutter_2013,zycie_jako_smiertelna_choroba_przenoszona_droga_plciowa_2000,zz_top_that_little_ol_band_from_texas
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
$5_a_day,1.0,0.641877,0.547112,0.056411,0.013602,0.763168,-0.001617,0.079856,-0.029479,-0.253461,...,-0.02674,0.138725,0.019342,0.270372,-0.233868,-0.16856,0.152319,-0.267018,-0.359104,-0.401164
009_re_cyborg,0.641877,1.0,0.2888,-0.476333,-0.30606,0.751309,0.264415,0.199229,-0.197121,-0.040566,...,0.246844,0.315462,0.256081,-0.350755,-0.393802,0.02599,0.766933,-0.402745,-0.110268,-0.134777
00_mhz,0.547112,0.2888,1.0,0.131922,0.067084,0.096136,0.03327,-0.195895,0.006193,-0.083592,...,-0.010192,0.289897,0.048412,0.342555,-0.187471,0.043445,0.007051,-0.222561,-0.360595,-0.097555
1,0.056411,-0.476333,0.131922,1.0,0.167859,-0.435177,0.023273,-0.466805,-0.011736,-0.563864,...,-0.123729,-0.5002,0.012004,0.855843,0.583254,-0.471943,-0.352946,0.601425,-0.158922,-0.320536
1-day,0.013602,-0.30606,0.067084,0.167859,1.0,-0.164369,0.441742,0.254976,0.474941,0.490817,...,-0.077862,0.147753,0.00063,0.450021,-0.283251,0.090264,-0.219338,-0.324408,-0.533047,-0.04578


In [32]:
import dask.dataframe as dd

# Número de partes para dividir o DataFrame
num_parts = 10

# Tamanho de cada parte
part_size = len(sim_df) // num_parts

# Salvar cada parte individualmente com Dask
for i in range(num_parts):
    start = i * part_size
    end = (i + 1) * part_size if i < num_parts - 1 else len(sim_df)
    part = sim_df.iloc[start:end]
    
    # Converter a parte para Dask
    dask_part = dd.from_pandas(part, npartitions=1)
    
    # Salvar a parte em Parquet
    dask_part.to_parquet(f"similarity_matrix_part_{i}.parquet", engine='pyarrow', compression='snappy')
    print(f"Part {i} saved with Dask and Snappy compression.")

Part 0 saved with Dask and Snappy compression.
Part 1 saved with Dask and Snappy compression.
Part 2 saved with Dask and Snappy compression.
Part 3 saved with Dask and Snappy compression.
Part 4 saved with Dask and Snappy compression.
Part 5 saved with Dask and Snappy compression.
Part 6 saved with Dask and Snappy compression.
Part 7 saved with Dask and Snappy compression.
Part 8 saved with Dask and Snappy compression.
Part 9 saved with Dask and Snappy compression.


In [None]:
# ver como carregar esse df (concat?)
# caminho_parquet = "/Users/danielebelmiro/Data Analytics Bootcamp/Rotten/similarity_matrix.parquet"

# Ler o arquivo Parquet
# df = pd.read_parquet(caminho_parquet)

#### delete duplicated normalized titles:

In [33]:
# Step 1: Group by 'id' and create the 'count' column
# Group by 'id' and count occurrences in the reviews table
reviews_count = reviews['id'].value_counts().reset_index()
reviews_count.columns = ['id', 'count']

# Add the 'count' column to the movies DataFrame
movies = movies.merge(reviews_count, on='id', how='left')

# Fill NaN values with 0 (movies with no reviews)
movies['count'] = movies['count'].fillna(0)

# Step 2: Check for duplicate titles
# Create a DataFrame with one row per movie (using the first occurrence of each 'id')
movies_unique = movies.drop_duplicates(subset=['id'], keep='first')

# Check for duplicate normalized titles
duplicates = movies_unique[movies_unique['title_normalized'].duplicated(keep=False)]
print("Duplicate titles (considering one entry per movie):")
print(duplicates[['id', 'title_normalized']].sort_values(by='title_normalized'))

# Step 3: Remove movies with fewer occurrences
# Sort movies by normalized title and review count
movies_sorted = movies.sort_values(by=['title_normalized', 'count'], ascending=[True, False])

# Keep only the first occurrence of each normalized title (the one with the most reviews)
movies_no_duplicates = movies_sorted.drop_duplicates(subset=['title_normalized'], keep='first')

# Check if there are still duplicates
if movies_no_duplicates['title_normalized'].duplicated().any():
    print("There are still duplicate titles after removal.")
else:
    print("All duplicate titles have been removed.")

# Display the final DataFrame
print("\nFinal DataFrame without duplicates:")
print(movies_no_duplicates)

Duplicate titles (considering one entry per movie):
                        id title_normalized
24483               1_2013                1
4157    1-one-human-minute                1
103248              1_2017                1
1495                     1                1
78594                  ten               10
...                    ...              ...
91334             zoo_2018              zoo
23518            zoom_2016             zoom
82844            zoom_2006             zoom
63636                 zulu             zulu
82326            zulu_2013             zulu

[9566 rows x 2 columns]
All duplicate titles have been removed.

Final DataFrame without duplicates:
                                                                  id  \
76847                                           my_missing_valentine   
83402                                                         00_mhz   
25704                                                  009_re_cyborg   
34770                         

In [34]:
movies = movies_no_duplicates

In [35]:
movies.head()

Unnamed: 0,id,title,audienceScore,tomatoMeter,runtimeMinutes,genre,originalLanguage,director,writer,release_year,emotions,title_normalized,count
76847,my_missing_valentine,消失的情人節,55.67,65.76,119.0,Comedy,Chinese,Yu-Hsun Chen,"Yu-Hsun Chen,Yu-Hsun Chen",Unknown,"[('admiration', 52.16256606223262), ('excitement', 24.98463494276986), ('joy', 22.852798994997528)]",,3
83402,00_mhz,0.0 MHz,33.0,65.76,101.0,Horror,Korean,Yoo Sun-Dong,Jang Jak,2020,"[('annoyance', 41.840924718059824), ('admiration', 36.49896509094363), ('joy', 21.660110190996555)]",00 mhz,3
25704,009_re_cyborg,009 Re: Cyborg,43.0,23.0,103.0,Action,Japanese,Kenji Kamiyama,Kenji Kamiyama,2015,"[('disapproval', 36.023279538211014), ('annoyance', 32.165396240044096), ('confusion', 31.81132422174489)]",009 re cyborg,13
34770,45,0.45,38.0,65.76,101.0,Crime,English,Gary Lennon,Gary Lennon,2020,"[('annoyance', 80.9424), ('anger', 14.1899), ('approval', 4.8677)]",045,1
103248,1_2017,1%,82.0,47.0,88.0,Unknown,Unknown,Stephen McCallum,Unknown,Unknown,"[('admiration', 40.5276835242493), ('disappointment', 30.096030871256218), ('approval', 29.376285604494484)]",1,17


In [36]:
movies.shape

(61999, 13)

In [None]:
# validations of the similarity calculation:

In [37]:
print(reviews_reduced)

NameError: name 'reviews_reduced' is not defined

In [40]:
# Colunas de interesse
colunas = ['admiration', 'amusement', 'anger']

# Condição: Verifica se os valores nas colunas são maiores que zero
condicao = (reviews_wide[colunas] == 0)

# Filtra as linhas onde pelo menos 3 colunas atendem à condição
linhas_filtradas = reviews_wide[condicao.sum(axis=1) >= 3]

# Exibir o resultado
print(linhas_filtradas)

emotion                                                       admiration  \
id                                                                         
009_re_cyborg                                                        0.0   
1-one-human-minute                                                   0.0   
10-violent-women                                                     0.0   
10000390-mickey                                                      0.0   
10000594-guardian                                                    0.0   
...                                                                  ...   
zpg                                                                  0.0   
zu_warriors                                                          0.0   
zulfiqar                                                             0.0   
zvenigora                                                            0.0   
zycie_jako_smiertelna_choroba_przenoszona_droga_plciowa_2000         0.0   

emotion    

In [39]:
# veja a matriz e compare a similaridade entre filmes que você sabe que são semelhantes ou 
# diferentes com base nas porcentagens das emoções
# se dois filmes têm porcentagens muito parecidas para as mesmas emoções, a similaridade deve ser alta

# Filmes com porcentagens de emoções parecidas
filme_a = reviews_wide.loc['katherine_ryan_glitter_room']
filme_b = reviews_wide.loc['10008742-my_sister_eileen']

# Verificar similaridade na matriz
similaridade = sim_df.loc['katherine_ryan_glitter_room', '10008742-my_sister_eileen']
print(f"Similaridade: {similaridade}")

Similaridade: 0.9991096338846843


In [41]:
# Filmes com porcentagens de emoções diferentes
filme_a = reviews_wide.loc['katherine_ryan_glitter_room']
filme_b = reviews_wide.loc['009_re_cyborg']

# Verificar similaridade na matriz
similaridade = sim_df.loc['katherine_ryan_glitter_room', '009_re_cyborg']
print(f"Similaridade: {similaridade}")

Similaridade: -0.10828669616217894


In [47]:
# Aplicar PCA para reduzir a dimensionalidade
pca = PCA(n_components=0.97)  # Manter 97% da variância
reviews_reduced = pca.fit_transform(reviews_wide.values)

# Escolha dois filmes para comparar
filme1_id = '$5_a_day'  # Substitua pelo ID do primeiro filme
filme2_id = '009_re_cyborg'  # Substitua pelo ID do segundo filme

# Encontre os índices dos filmes na matriz reviews_wide
filme1_idx = reviews_wide.index.get_loc(filme1_id)
filme2_idx = reviews_wide.index.get_loc(filme2_id)

# Selecionar os vetores dos filmes após o PCA
filme1_vector = reviews_reduced[filme1_idx].reshape(1, -1)  # Transforma em formato (1, n_features)
filme2_vector = reviews_reduced[filme2_idx].reshape(1, -1)  # Transforma em formato (1, n_features)

# Calcular a similaridade do cosseno manualmente
similaridade_manual = cosine_similarity(filme1_vector, filme2_vector)[0][0]

# Obter o valor correspondente na matriz sim_df
similaridade_matriz = sim_df.loc[filme1_id, filme2_id]

# Exibir os resultados
print(f"Similaridade manual entre {filme1_id} e {filme2_id}: {similaridade_manual}")
print(f"Similaridade na matriz sim_df: {similaridade_matriz}")

# Verificar se os valores são iguais (ou muito próximos, devido a arredondamentos)
if np.isclose(similaridade_manual, similaridade_matriz, atol=1e-6):
    print("Os valores são iguais (dentro de uma tolerância pequena).")
else:
    print("Os valores são diferentes.")
    
# compare filmes com emoções muito parecidas e filmes com emoções muito diferentes

Similaridade manual entre $5_a_day e 009_re_cyborg: 0.6418774100472826
Similaridade na matriz sim_df: 0.6418774100472827
Os valores são iguais (dentro de uma tolerância pequena).


## Main recommendation function

In [49]:
# Main recommendation function
def recommend_similar_movies(sim_df, movies, reviews, top_n=5):
    
    while True:
        # Ask for the user's favorite movie
        favorite_movie = input("Enter the name of your favorite movie (or type 'exit' to quit): ").strip()

        # Allow the user to exit
        if favorite_movie.lower() == 'exit':
            print("Exiting the recommendation system. Goodbye!")
            return None

        # Find the closest matching movie in the similarity DataFrame
        favorite_movie_title = find_closest_movie(favorite_movie, sim_df)

        if favorite_movie_title is None:
            print(f"The movie '{favorite_movie}' was not found. Please check the name and try again.")
            continue  # Ask for input again
        else:
            print(f"Found movie: {favorite_movie_title}")
            break  # Exit the loop if a valid movie is found
            
            
    # Encontrar o ID do filme no DataFrame movies
    favorite_movie_id = movies.loc[movies['title_normalized'] == favorite_movie_title, 'id'].values[0]

    # Verificar se o ID foi encontrado
    if favorite_movie_id:
        print(f"ID do filme '{favorite_movie_title}': {favorite_movie_id}")

        # Acessar as similaridades na matriz sim_df
        if favorite_movie_id in sim_df.columns:
            movie_similarities = sim_df[favorite_movie_id]
            print("Similaridades encontradas:")
            print(movie_similarities)
        else:
            print(f"ID '{favorite_movie_id}' não encontrado na matriz de similaridade.")
    else:
        print(f"Filme '{favorite_movie_title}' não encontrado.")

    # Get the similarities for the favorite movie
    movie_similarities = sim_df[favorite_movie_title]

    # Sort movies by similarity (from most to least similar)
    movie_similarities = movie_similarities.sort_values(ascending=False)

    # Remove the favorite movie itself from the recommendations
    if favorite_movie_title in movie_similarities.index:
        movie_similarities = movie_similarities.drop(favorite_movie_title)

    # Filter out movies with negative reviews
    # 1. Calculate the ratio of negative reviews for each movie
    negative_reviews_ratio = reviews[reviews['scoreSentiment'] == 'NEGATIVE'].groupby('id').size() / reviews.groupby('id').size()
    movies_with_high_negative_reviews = negative_reviews_ratio[negative_reviews_ratio > 0.5].index

    # 2. Filter movies based on tomatoMeter < 30, audienceScore < 30, and high negative reviews
    filtered_movies = movies[
        (movies['tomatoMeter'] < 30) & 
        (movies['audienceScore'] < 30) & 
        (movies['id'].isin(movies_with_high_negative_reviews))
    ]

    # Get the IDs of movies to exclude
    movies_to_exclude = filtered_movies['id'].unique()

    # Exclude these movies from the recommendations
    movie_similarities = movie_similarities[~movie_similarities.index.isin(movies_to_exclude)]

    # Get the genres of the favorite movie
    favorite_movie_id = movies[movies['title_normalized'] == favorite_movie_title]['id'].values[0]
    favorite_movie_genres = movies[movies['id'] == favorite_movie_id]['genre'].unique()

    # Filter movies by genre (only recommend movies that share at least one genre with the favorite movie)
    movies_in_same_genre = movies[movies['genre'].isin(favorite_movie_genres)]['title_normalized'].unique()
    movie_similarities = movie_similarities[movie_similarities.index.isin(movies_in_same_genre)]

    # Check if there are enough movies to recommend
    if len(movie_similarities) < top_n:
        print(f"Warning: Only {len(movie_similarities)} similar movies were found.")

    # Select the top_n most similar movies
    recommended_movies = movie_similarities.head(top_n).index.tolist()

    # Fetch additional information from the movies DataFrame
    movie_details = movies[movies['title_normalized'].isin(recommended_movies)]

    # Combine similarity information with movie details
    similarity_df = movie_similarities.head(top_n).reset_index()

    # Check the name of the index column
    if 'index' in similarity_df.columns:
        similarity_df = similarity_df.rename(columns={'index': 'title_normalized'})
    else:
        # If the index has a different name, rename it to 'title_normalized'
        similarity_df = similarity_df.rename(columns={similarity_df.columns[0]: 'title_normalized'})

    # Rename the similarity column to 'similarity'
    similarity_df = similarity_df.rename(columns={similarity_df.columns[1]: 'similarity'})

    # Merge with the correct column
    result = movie_details.merge(
        similarity_df,
        left_on='title_normalized',
        right_on='title_normalized'
    )

    # Add predicted_moods to the result
    # Group reviews by movie ID and combine predicted_moods
    predicted_moods = reviews.groupby('id')['predicted_moods'].first().reset_index()
    result = result.merge(predicted_moods, on='id', how='left')
    
    # Group movies by ID and combine information
    result = result.groupby('id').agg({
        'title': 'first',  
        'director': 'first',  
        'originalLanguage': 'first',  
        'runtimeMinutes': 'first',  
        'genre': lambda x: ', '.join(set(x.dropna())),  # Combina gêneros únicos
        'release_year': 'first',  
        'tomatoMeter': 'first',  
        'audienceScore': 'first',  
        'similarity': 'first',  
        'predicted_moods': 'first'
    }).reset_index()
    
    
    # Select relevant columns
    result = result[['id', 'title', 'director', 'originalLanguage', 'runtimeMinutes', 'genre', 'release_year', 'tomatoMeter', 'audienceScore', 'similarity', 'predicted_moods']]

    # Format the output for the user
    print(f"\nRecommendations based on the movie '{favorite_movie_title}':\n")
    for _, row in result.iterrows():
        print(f"Movie: {row['title']}")
        print(f"  - Director: {row['director']}")
        print(f"  - Language: {row['originalLanguage']}")
        print(f"  - Duration: {row['runtimeMinutes']} minutes")
        print(f"  - Genre: {row['genre']}")
        print(f"  - Year: {row['release_year']}")
        print(f"  - Tomatometer: {row['tomatoMeter']}%")
        print(f"  - Audience Score: {row['audienceScore']}%")
        print(f"  - Similarity: {row['similarity']:.2f}")
        if isinstance(row['predicted_moods'], list):
            print(f"  - Predicted Moods: {', '.join([f'{mood} ({percentage:.1f}%)' for mood, percentage in row['predicted_moods']])}")
        else:
            print(f"  - Predicted Moods: None")
        print("-" * 40)

    return result


In [51]:
# Generate recommendations
recommendations = recommend_similar_movies(sim_df, movies, reviews, top_n=5)

Enter the name of your favorite movie (or type 'exit' to quit): inside out
Found movie: insideout
ID do filme 'insideout': insideout
Similaridades encontradas:
id
$5_a_day                                                        0.232632
009_re_cyborg                                                  -0.382605
00_mhz                                                          0.307377
1                                                               0.897107
1-day                                                           0.399338
                                                                  ...   
zus_and_zo_2003                                                -0.379231
zvenigora                                                      -0.283700
zwei_mutter_2013                                                0.287158
zycie_jako_smiertelna_choroba_przenoszona_droga_plciowa_2000   -0.378392
zz_top_that_little_ol_band_from_texas                          -0.460919
Name: insideout, Length: 67604, dt

In [52]:
recommendations

Unnamed: 0,id,title,director,originalLanguage,runtimeMinutes,genre,release_year,tomatoMeter,audienceScore,similarity,predicted_moods
0,inam,Inam,Santosh Sivan,Tamil,123.0,Drama,Unknown,65.76,55.67,0.996549,"[(admiration, 78.57620137642752), (approval, 11.806161056816656), (gratitude, 1.7309782596087429)]"
1,kairos,Kairos,Paul Barakat,English,94.0,Drama,Unknown,65.76,55.67,0.999969,"[(admiration, 66.4177409573322), (approval, 12.7135733530459), (gratitude, 4.900235112249754)]"
2,metal,Metal,Christopher E Brown,English,89.0,Drama,Unknown,65.76,55.67,0.996778,"[(admiration, 69.98463226478196), (approval, 17.6018024305297), (gratitude, 3.4312106698524083)]"
3,piagol,Piagol,Kang-cheon Lee,Korean,110.0,Drama,Unknown,65.76,55.67,0.997694,"[(admiration, 73.598732443837), (approval, 16.415251742487925), (gratitude, 2.7057060043891)]"
4,wasp_2014,Wasp,Philippe Audi-Dor,English,77.333333,Drama,2016,65.76,24.0,0.996543,"[(approval, 54.91740651904544), (admiration, 20.819422871558913), (joy, 7.638680666501813)]"


In [20]:
sim_df.head()

id,five dollars a day,009 re cyborg,00 mhz,1,1 day,1,10,10 violent women,12 angry men,rat,...,zubaan,zulfiqar,zulu,zulu,zulu dawn,hotel paraiso,zvenigora,two mothers,zycie jako smiertelna choroba przenoszona droga plciowa,zz top that little ol band from texas
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
five dollars a day,1.0,0.64191,0.547113,0.056409,0.013593,0.763169,-0.001634,0.079867,-0.029517,-0.253461,...,-0.026758,0.138734,0.019327,0.270363,-0.233877,-0.168894,0.15233,-0.267019,-0.359125,-0.401159
009 re cyborg,0.64191,1.0,0.288818,-0.476281,-0.306145,0.751342,0.264273,0.199067,-0.197512,-0.040599,...,0.24671,0.315416,0.255904,-0.350793,-0.393885,0.025428,0.766908,-0.40277,-0.110379,-0.134771
00 mhz,0.547113,0.288818,1.0,0.131923,0.067086,0.096132,0.03328,-0.195939,0.006196,-0.083593,...,-0.010192,0.289912,0.048438,0.342555,-0.187472,0.043531,0.007051,-0.222559,-0.360611,-0.097552
1,0.056409,-0.476281,0.131923,1.0,0.16792,-0.435195,0.023473,-0.466734,-0.011479,-0.563865,...,-0.123584,-0.500167,0.012297,0.855898,0.583346,-0.472233,-0.352854,0.601463,-0.158839,-0.320561
1 day,0.013593,-0.306145,0.067086,0.16792,1.0,-0.164375,0.441717,0.254898,0.474965,0.490813,...,-0.077994,0.147707,0.000432,0.450016,-0.283299,0.090054,-0.219475,-0.324417,-0.533145,-0.045773


In [17]:
# Filtrar filmes com predicted_moods igual a None
movies_without_moods = recommendations[recommendations['predicted_moods'].isna()]

# Exibir os filmes filtrados
if not movies_without_moods.empty:
    print("Filmes sem predicted_moods:")
    print(movies_without_moods[['id', 'title', 'predicted_moods']])
else:
    print("Nenhum filme sem predicted_moods encontrado.")

Filmes sem predicted_moods:
          id title predicted_moods
0  771459240  Five            None
2  five_2011  Five            None
4  five_2020  Five            None


In [28]:
movies.loc[movies['title_normalized'] == "five"]

Unnamed: 0,id,title,audienceScore,tomatoMeter,runtimeMinutes,genre,originalLanguage,director,writer,release_year,title_normalized
6754,five,Five,52.0,73.0,76.166667,Sci-fi,English,Arch Oboler,Unknown,2009,five
16666,five_2020,Five,55.67,65.76,76.166667,Drama,English,Jeffrey DeChausse,Jeffrey DeChausse,Unknown,five
53983,five_2011,Five,75.0,65.76,76.166667,Comedy,English,"Jennifer Aniston,Alicia Keys,Demi Moore,Patty Jenkins,Penelope Spheeris",Unknown,2012,five
53984,five_2011,Five,75.0,65.76,76.166667,Drama,English,"Jennifer Aniston,Alicia Keys,Demi Moore,Patty Jenkins,Penelope Spheeris",Unknown,2012,five
66026,771459240,Five,55.67,65.76,76.166667,Documentary,Unknown,Abbas Kiarostami,Abbas Kiarostami,Unknown,five
84031,five_2016,Five,63.0,65.76,76.166667,Drama,French,Igor Gotesman,Igor Gotesman,Unknown,five
