In [39]:
#Sentiment analysis on expert textual reviews to get an “expert likeability-score”
#Correlation analysis (statistical tests) between gross sales, likeability score, and expert ratings

#Clustering of plot keywords amongst plots (pre-filtering using a standards tokenization pipileine 
#(normalization, postword removal, stemming, and, in our case, removing verbs))

#Clustering of review keywords for different expert textual reviews/consensus on “qualitative” movies.



To perform our quality analysis we extend our initial dataset with several "Rotten Tomatoes" dataset, in order to access useful indicators of popularity. The useful indicators we want to study now are : the box-office, the expert critic reviews, the expert critic score, the movie rewards.
By merging multiple datasets and observe their size and content we will discuss the feasability of the next operations, for example statistical analysis.


In [40]:
import os
from copy import deepcopy
from itertools import permutations

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from scipy.stats import ttest_ind, chi2_contingency

DATA_DIR = "../data/raw/"
INIT_DATABASE = DATA_DIR + "movie.metadata.tsv"
TOP_MOVIES_RT = DATA_DIR + "rotten_tomatoes_top_movies.csv"
OSCARS = DATA_DIR + "the_oscar_award.csv"


In [41]:
# Correlation analysis
# create dataset with box-office, number of nominations, critic score, comedy

df_init_db = pd.read_csv(INIT_DATABASE, sep='\t')
df_raw_rt_top = pd.read_csv(TOP_MOVIES_RT)
df_oscars = pd.read_csv(OSCARS)

print(len(df_init_db))


81740


In [None]:
#selection of the useful columns
column_names = [
    'Wikipedia movie ID',
    'Freebase movie ID',
    'Movie name',
    'Movie release date',
    'Box office',
    'Movie runtime',
    'Movie languages',
    'Movie countries',
    'Genres'
]

df_init_db.columns = column_names
useful_col = ['Movie name','Box office','Genres']
unfiltered_df_box_office = df_init_db[useful_col]
df_revenue = unfiltered_df_box_office[unfiltered_df_box_office['Genres'].str.contains('comedy', case=False, na=False)]
print(len(df_revenue))
count_na = df_revenue['Box office'].isna().sum()
print(f"Nbr of Nan in box office: {count_na}")
#df_box_office = df_revenue[df_revenue["Box office"].notna()].copy()
df_box_office = df_revenue.copy()
print(count_na/len(df_box_office))



21696
Nbr of Nan in box office: 18209


In the initial dataset the number of missing box-office values is very high, about 84% of the movies are missing a box-office value.

In [None]:
df_raw_rt_top = df_raw_rt_top.rename(columns={"title": "Movie name"})
df_rt_top = df_raw_rt_top[["Movie name", "critic_score"]]
print(f"Rotten tomatoes dataset size : {len(df_rt_top)}")


Rotten tomatoes dataset size : 1610
388


In [None]:
df_oscars.head()
df_oscars = df_oscars.rename(columns={"film": "Movie name"})
df_nominations = df_oscars["Movie name"].value_counts()
df_nominations.head()


Movie name
A Star Is Born          25
West Side Story         18
Titanic                 16
Moulin Rouge            15
Mutiny on the Bounty    15
Name: count, dtype: int64

In [None]:
#Merge datafames
merged_dfs = pd.merge(df_box_office, df_rt_top, on="Movie name", how='inner')
df_scores = merged_dfs.drop_duplicates()
print(len(df_scores))
df_scores.head()

210


Unnamed: 0,Movie name,Box office,Genres,critic_score
0,Mary Poppins,102272727.0,"{""/m/0hj3myq"": ""Children's/Family"", ""/m/04t36""...",98
5,Amy,,"{""/m/07s9rl0"": ""Drama"", ""/m/0219x_"": ""Indie"", ...",95
9,The Lady Vanishes,,"{""/m/01jfsb"": ""Thriller"", ""/m/09blyk"": ""Psycho...",98
11,My Fair Lady,72000000.0,"{""/m/04xvh5"": ""Costume drama"", ""/m/0520lz"": ""R...",95
13,Don't Look Now,,"{""/m/02hmvc"": ""Short Film"", ""/m/01z4y"": ""Comed...",95


In [None]:
df_rewards = pd.merge(df_scores, df_nominations, on="Movie name", how='inner')
df_rewards = df_rewards.rename(columns={'count':'Nominations'})
print(len(df_rewards))

In [61]:
test_merge = pd.merge(unfiltered_df_box_office,df_rt_top, on="Movie name", how="inner")
test_merge = test_merge.drop_duplicates()
print(len(test_merge))
test_merge.head()

881


Unnamed: 0,Movie name,Box office,Genres,critic_score
0,Henry V,10161099.0,"{""/m/04xvh5"": ""Costume drama"", ""/m/082gq"": ""Wa...",100
2,Mary Poppins,102272727.0,"{""/m/0hj3myq"": ""Children's/Family"", ""/m/04t36""...",98
7,The Battle of Algiers,,"{""/m/0lsxr"": ""Crime Fiction"", ""/m/03q4nz"": ""Wo...",99
9,Sweeney Todd: The Demon Barber of Fleet Street,152523164.0,"{""/m/0lsxr"": ""Crime Fiction"", ""/m/0fx2s"": ""Tra...",86
10,Amy,,"{""/m/07s9rl0"": ""Drama"", ""/m/0219x_"": ""Indie"", ...",95


In [102]:
#standardize movie names to verify the accuracy of the merge 
import re

def standardize_title(title):
    if not isinstance(title, str):
        return None
    title = title.lower()
    title = re.sub(r'[^a-z0-9\s]', '', title)
    title = re.sub(r'\s+', ' ', title).strip()
    return title

df_box_office['Movie name'] = df_box_office['Movie name'].apply(standardize_title)
df_rt_top['Movie name'] = df_rt_top['Movie name'].apply(standardize_title)

std_titles = pd.merge(df_box_office, df_rt_top, on='Movie name', how='inner')
std_df = std_titles.drop_duplicates()
print(len(std_df))
std_df.head()

# we can conclude that the matter of the final dataframe size is not related to the strandardization of the movie titles

221


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rt_top['Movie name'] = df_rt_top['Movie name'].apply(standardize_title)


Unnamed: 0,Movie name,Box office,Genres,critic_score
0,mary poppins,102272727.0,"{""/m/0hj3myq"": ""Children's/Family"", ""/m/04t36""...",98
5,amy,,"{""/m/07s9rl0"": ""Drama"", ""/m/0219x_"": ""Indie"", ...",95
9,the lady vanishes,,"{""/m/01jfsb"": ""Thriller"", ""/m/09blyk"": ""Psycho...",98
11,my fair lady,72000000.0,"{""/m/04xvh5"": ""Costume drama"", ""/m/0520lz"": ""R...",95
13,dont look now,,"{""/m/02hmvc"": ""Short Film"", ""/m/01z4y"": ""Comed...",95


In this 1st part, the first merge between our initial dataset and the top movies of Rotten tomatoes lead to a very small, hence not usable dataset of 221 columns. This leads to standardize the titles to see if the merge is the problem, this did not display some major improvements. The conclusion is that the Rotten tomators Top Movies dataset is not big enough to show some interesting results when we only take comedies.
Let us try with some bigger datasets : 

In [108]:
RAW_RT_REVIEWS = "raw_rotten_tomatoes_movie_reviews.csv"
RAW_RT = "rotten_tomatoes_movies.csv"

CLEAN_RT_REVIEWS = "clean_rotten_tomatoes_critic.csv"


df_raw_rt_rev = pd.read_csv(RAW_RT_REVIEWS)
df_raw_rt = pd.read_csv(RAW_RT)

df_raw_rt_rev = df_raw_rt_rev.rename(columns = {"isTopCritic": "Bool_critic", "originalScore": "Note"})
df_short_rt = df_raw_rt_rev[["id","Bool_critic","Note","scoreSentiment"]]
df_short_rt.to_csv(CLEAN_RT_REVIEWS)

df_id_title = df_raw_rt[["id","title"]]

merged_raws = pd.merge(df_id_title, df_short_rt, on="id", how="inner")
merged_raws = merged_raws.rename(columns = {"title": "Movie name"})
merged_raws["Movie name"] = merged_raws["Movie name"].apply(standardize_title)

print(len(merged_raws))
merged_raws.head()



1469543


Unnamed: 0,id,Movie name,Bool_critic,Note,scoreSentiment
0,love_lies,love lies,False,4/5,POSITIVE
1,love_lies,love lies,False,3.5/5,POSITIVE
2,adrift_2018,adrift,False,6/10,POSITIVE
3,adrift_2018,adrift,False,,POSITIVE
4,adrift_2018,adrift,False,C+,NEGATIVE


In [None]:
merge_big_dfs = pd.merge(df_box_office, merged_raws , on='Movie name', how='inner')
#print(list(set(df_box_office["Movie name"].unique().tolist()).difference(merge_big_dfs["Movie name"].unique().tolist()))[5:10])
merge_big_dfs = merge_big_dfs.drop_duplicates()
merge_big_dfs = merge_big_dfs[merge_big_dfs["Note"].notna()].copy()
df_expert_critics = merge_big_dfs[merge_big_dfs["Bool_critic"]==True]
print(len(df_expert_critics))

list_movies = df_expert_critics["Movie name"].unique()
print(len(list_movies))

df_expert_critics.head()

36797
5340


Unnamed: 0,Movie name,Box office,Genres,id,Bool_critic,Note,scoreSentiment
18,mary poppins,102272727.0,"{""/m/0hj3myq"": ""Children's/Family"", ""/m/04t36""...",mary_poppins,True,4/5,POSITIVE
20,mary poppins,102272727.0,"{""/m/0hj3myq"": ""Children's/Family"", ""/m/04t36""...",mary_poppins,True,4.5/5,POSITIVE
24,mary poppins,102272727.0,"{""/m/0hj3myq"": ""Children's/Family"", ""/m/04t36""...",mary_poppins,True,3/4,POSITIVE
57,mary poppins,102272727.0,"{""/m/0hj3myq"": ""Children's/Family"", ""/m/04t36""...",mary_poppins,True,5/5,POSITIVE
60,white on rice,,"{""/m/06cvj"": ""Romantic comedy"", ""/m/02l7c8"": ""...",1218013-white_on_rice,True,2.5/5,NEGATIVE


The merge of the initial dataset with the second Rotten tomatoes one gives a more exploitable dataset : 5340 comedies with critic score and reviews. There are still some missing values (=NaN) on the box office column from the initial dataset, thses missing values will be completed with another dataset.

In [None]:
#The critic scores have to be standardized so that a mean could be computed later

def standardize_score(note):
    if isinstance(note, str) and '/' in note:
        try:
            numerator, denominator = map(float, note.split('/'))
            return numerator / denominator
        except ValueError:
            return None
    else:
        return None

df_expert_critics["Note"] = df_expert_critics["Note"].apply(standardize_score)
df_expert_critics.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_expert_critics["Note"] = df_expert_critics["Note"].apply(standardize_score)


Unnamed: 0,id,reviewId,creationDate,criticName,Bool_critic,Note,reviewState,publicatioName,reviewText,scoreSentiment,reviewUrl,Movie name,Box office,Genres
14,10,1331345,2004-10-23,Roger Ebert,True,1.0,fresh,Chicago Sun-Times,Blake Edwards's 10 is perhaps the first comedy...,POSITIVE,http://www.rogerebert.com/reviews/10-1979,10,74865517.0,"{""/m/06cvj"": ""Romantic comedy"", ""/m/02l7c8"": ""..."
28,99,2553588,2019-02-01,Namrata Joshi,True,0.5,rotten,Outlook,"The film sparkles only in a few places, tends ...",NEGATIVE,https://www.outlookindia.com/magazine/story/99...,99,,"{""/m/01chg"": ""Bollywood"", ""/m/04t36"": ""Musical..."
40,1941,1388591,2005-05-09,Vincent Canby,True,0.3,rotten,New York Times,It may possibly be that Mr. Spielberg has chos...,NEGATIVE,http://movies.nytimes.com/movie/review?res=9A0...,1941,,"{""/m/0gf28"": ""Parody"", ""/m/06qm3"": ""Screwball ..."
55,1941,70,2000-01-01,Ian Freer,True,0.6,fresh,Empire Magazine,"Unworkable farce, it maybe, but it is also ble...",POSITIVE,http://www.empireonline.co.uk/reviews/search.a...,1941,,"{""/m/0gf28"": ""Parody"", ""/m/06qm3"": ""Screwball ..."
60,twelve,2027592,2011-11-18,Ben Kenigsberg,True,0.4,rotten,Time Out,,NEGATIVE,,12,,"{""/m/05p553"": ""Comedy film"", ""/m/04t36"": ""Musi..."
61,twelve,2025416,2011-11-17,Joshua Rothkopf,True,0.6,fresh,Time Out,,POSITIVE,,12,,"{""/m/05p553"": ""Comedy film"", ""/m/04t36"": ""Musi..."
74,twelve,1817703,2009-04-30,Calvin Wilson,True,1.0,fresh,St. Louis Post-Dispatch,Those who seek a drama as provocative as it is...,POSITIVE,,12,,"{""/m/05p553"": ""Comedy film"", ""/m/04t36"": ""Musi..."
76,twelve,1814364,2009-04-22,Roger Moore,True,0.8,fresh,Orlando Sentinel,"A fascinating, stylish commentary on Russian j...",POSITIVE,http://www.orlandosentinel.com/entertainment/m...,12,,"{""/m/05p553"": ""Comedy film"", ""/m/04t36"": ""Musi..."
80,twelve,1812210,2009-04-10,Steven Rea,True,0.75,fresh,Philadelphia Inquirer,"Full of passion and speechifying, 12 is unmist...",POSITIVE,http://www.philly.com/inquirer/columnists/stev...,12,,"{""/m/05p553"": ""Comedy film"", ""/m/04t36"": ""Musi..."
81,twelve,1812027,2009-04-09,Marc Savlov,True,0.8,fresh,Austin Chronicle,12 is every bit as much of a moral powerhouse ...,POSITIVE,http://www.austinchronicle.com/gyrobase/Calend...,12,,"{""/m/05p553"": ""Comedy film"", ""/m/04t36"": ""Musi..."


In [112]:
PLOT = DATA_DIR + "plot_summaries.txt"

with open(PLOT, 'r') as file:
    content = file.readlines()[:10]

for line in content:
    print(line.strip())



23890098	Shlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all.
31186339	The nation of Panem consists of a wealthy Capitol and twelve poorer districts. As punishment for a past rebellion, each district must provide a boy and girl  between the ages of 12 and 18 selected by lottery  for the annual Hunger Games. The tributes must fight to the death in an arena; the sole survivor is rewarded with fame and wealth. In her first Reaping, 12-year-old Primrose Everdeen is chosen from District 12. Her older sister Katniss volunteers to take her place. Peeta Mellark, a baker's son who once gave Katniss bread when she was starving, is the other District 12 tribute. Katniss and Peeta are taken to the Capitol, accompanied by their frequently drunk mentor, past victor Haymitch Abernathy. He warns them about the "Career" tributes who train intensively at special academies and alm

In [126]:
sample_path = 'sample_file.csv'

df_plot = pd.read_csv(PLOT, delimiter='\t', header=None)
df_plot.columns = ['ID', 'Plot']
df_plot.head(10)
df_plot.head(50).to_csv(sample_path)
print(len(df_plot))




42303


In [None]:
!pip install tiktoken nltk scikit-learn

In [None]:
import nltk
import tiktoken

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS 
from sklearn.cluster import KMeans 
from nltk.stem import PorterStemmer 
from nltk.corpus import stopwords

nltk.download('punkt_tab') 
nltk.download('averaged_perceptron_tagger_eng')




[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\theol\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\theol\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [None]:
#Steps of text treatment

def normalize_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text

def remove_stopwords(text):
    stop_words = set(stopwords.words('english')).union(ENGLISH_STOP_WORDS)
    words = nltk.word_tokenize(text)
    return ' '.join([word for word in words if word not in stop_words])

#reduce words to their root form
def stem_text(text):
    stemmer = PorterStemmer()
    words = nltk.word_tokenize(text)
    return ' '.join([stemmer.stem(word) for word in words])

def remove_verbs(text):
    words = nltk.word_tokenize(text)
    tagged_words = nltk.pos_tag(words)
    non_verbs = [word for word, pos in tagged_words if pos not in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']]
    return ' '.join(non_verbs)

def tokenize_text(text): 
    enc = tiktoken.encoding_for_model('gpt-4o')
    tokens = enc.encode(str(text))
    return tokens



In [None]:
df_sample_plot = pd.read_csv(sample_path)

df_sample_plot['normalized'] = df_sample_plot['Plot'].apply(normalize_text)
df_sample_plot['no_stopwords'] = df_sample_plot['normalized'].apply(remove_stopwords)
df_sample_plot['stemmed'] = df_sample_plot['no_stopwords'].apply(stem_text)
df_sample_plot['non_verbs'] = df_sample_plot['stemmed'].apply(remove_verbs)
df_sample_plot['tokens'] = df_sample_plot['non_verbs'].apply(tokenize_text)

df_tokenized_plots = df_sample_plot[['ID','tokens']]
df_tokenized_plots.head()


Unnamed: 0,ID,tokens
0,23890098,"[1116, 423, 23924, 3479, 1596, 42795, 8931, 62..."
1,31186339,"[170588, 6389, 347, 19859, 72, 41415, 340, 148..."
2,20663735,"[2519, 25066, 34592, 332, 1762, 270, 3860, 103..."
3,2231378,"[282, 3498, 9440, 620, 68932, 274, 3889, 2766,..."
4,595909,"[344, 81238, 1635, 13572, 421, 11967, 51833, 1..."


In [118]:
df_plot['normalized'] = df_plot['Plot'].apply(normalize_text)
df_plot['no_stopwords'] = df_plot['normalized'].apply(remove_stopwords)
df_plot['stemmed'] = df_plot['no_stopwords'].apply(stem_text)
df_plot['non_verbs'] = df_plot['stemmed'].apply(remove_verbs)
df_plot['tokens'] = df_plot['non_verbs'].apply(tokenize_text)

df_tokenized_plots = df_plot[['ID','tokens']]
df_tokenized_plots.head()

KeyboardInterrupt: 

The tokenization of the complete plot summaries file is completed in :  s.

In [120]:
!pip install llmware

Collecting llmware
  Downloading llmware-0.3.9-py3-none-any.whl.metadata (70 kB)
Collecting boto3>=1.24.53 (from llmware)
  Downloading boto3-1.35.62-py3-none-any.whl.metadata (6.7 kB)
Collecting huggingface-hub>=0.19.4 (from llmware)
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting pymongo>=4.7.0 (from llmware)
  Downloading pymongo-4.10.1-cp311-cp311-win_amd64.whl.metadata (22 kB)
Collecting tokenizers>=0.15.0 (from llmware)
  Downloading tokenizers-0.20.3-cp311-none-win_amd64.whl.metadata (6.9 kB)
Collecting psycopg-binary==3.1.17 (from llmware)
  Downloading psycopg_binary-3.1.17-cp311-cp311-win_amd64.whl.metadata (2.9 kB)
Collecting psycopg==3.1.17 (from llmware)
  Downloading psycopg-3.1.17-py3-none-any.whl.metadata (4.2 kB)
Collecting pgvector==0.2.4 (from llmware)
  Downloading pgvector-0.2.4-py2.py3-none-any.whl.metadata (9.8 kB)
Collecting soundfile>=0.12.0 (from llmware)
  Downloading soundfile-0.12.1-py2.py3-none-win_amd64.whl.metadata (14 k

In [None]:
from llmware.agents import LLMfx

df_1st_plot = df_plot.head(5)

def get_one_sentiment_classification(text):
    agent = LLMfx(verbose=True)
    agent.load_tool("sentiment")
    sentiment = agent.sentiment(text)
    
    sentiment_value = sentiment["llm_response"]["sentiment"]
    confidence_level = sentiment["confidence_score"]
    
    print("Plot:", text)
    print("Sentiment:", sentiment_value)
    print("Confidence level", confidence_level)
    
    return sentiment_value, confidence_level

# Application of the analysis on the plots
df_1st_plot[['Sentiment', 'Confidence']] = df_1st_plot['Plot'].apply(
    lambda x: pd.Series(get_one_sentiment_classification(x))
)

print(df_1st_plot.head())



update: Launching LLMfx process
step - 	1 - 	creating object - ready to start processing.
step - 	2 - 	loading tool - sentiment
step - 	3 - 	executing function call - deploying - sentiment 
step - 	4 - 	loading new processing text - 1 new entries


KeyboardInterrupt: 

The computation of the sentiment analysis for the first 5 plots of the dataset takes 2min44s, which indicates that for the entire dataset the computation time will be important.

In [133]:
df_1st_plot = df_plot.head(5)
words = ' '.join(df_1st_plot["Plot"]).split()
tot_words = ' '.join(df_plot["Plot"]).split()
time = 164
tot_time = ((len(tot_words)/len(words))*time)/(24*3600)
print(f"Linear estimation of the computation time for all the plot summaries: {tot_time} days")

Linear estimation of the computation time for all the plot summaries: 9.804180564651096 days
