In [36]:
#Sentiment analysis on expert textual reviews to get an “expert likeability-score”
#Correlation analysis (statistical tests) between gross sales, likeability score, and expert ratings

#Clustering of plot keywords amongst plots (pre-filtering using a standards tokenization pipileine 
#(normalization, postword removal, stemming, and, in our case, removing verbs))

#Clustering of review keywords for different expert textual reviews/consensus on “qualitative” movies.



In [37]:
import os
from copy import deepcopy
from itertools import permutations

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from scipy.stats import ttest_ind, chi2_contingency

DATA_DIR = "../data/"
INIT_DATABASE = DATA_DIR + "movie.metadata.tsv"
TOP_MOVIES_RT = DATA_DIR + "rotten_tomatoes_top_movies.csv"
OSCARS = DATA_DIR + "the_oscar_award.csv"


In [38]:
# Correlation analysis
# create dataset with box-office, number of nominations, critic score, comedy

df_init_db = pd.read_csv(INIT_DATABASE, sep='\t')
df_raw_rt_top = pd.read_csv(TOP_MOVIES_RT)
df_oscars = pd.read_csv(OSCARS)

print(len(df_init_db))


81740


In [39]:
#selection useful columns
column_names = [
    'Wikipedia movie ID',
    'Freebase movie ID',
    'Movie name',
    'Movie release date',
    'Box office',
    'Movie runtime',
    'Movie languages',
    'Movie countries',
    'Genres'
]

df_init_db.columns = column_names
useful_col = ['Movie name','Box office','Genres']
unfiltered_df_box_office = df_init_db[useful_col]
df_revenue = unfiltered_df_box_office[unfiltered_df_box_office['Genres'].str.contains('comedy', case=False, na=False)]
print(len(df_revenue))
count_na = df_revenue['Box office'].isna().sum()
print(count_na)
#df_box_office = df_revenue[df_revenue["Box office"].notna()].copy()
df_box_office = df_revenue.copy()
df_box_office.head()
print(len(df_box_office))




21696
18209
21696


In [40]:
rows = len(df_raw_rt_top)
df_raw_rt_top = df_raw_rt_top.rename(columns={"title": "Movie name"})
df_rt_top = df_raw_rt_top[["Movie name", "critic_score"]]
print(len(df_rt_top))
df_top_2011 = df_raw_rt_top[(df_raw_rt_top['year'] >= 2000) & (df_raw_rt_top['year'] <= 2011)] 
print(len(df_top_2011))
df_top_2011.head()

1610
388


Unnamed: 0.1,Unnamed: 0,Movie name,year,synopsis,critic_score,people_score,consensus,total_reviews,total_ratings,type,...,release_date_(theaters),release_date_(streaming),box_office_(gross_usa),runtime,production_co,sound_mix,aspect_ratio,view_the_collection,crew,link
23,23,Up,2009,"Carl Fredricksen (Ed Asner), a 78-year-old bal...",98,90.0,"An exciting, funny, and poignant adventure, Up...",298,"250,000+",Action & Adventure,...,"May 29, 2009 wide","Nov 21, 2015",,1h 29m,Pixar Animation Studios,,,Pixar,"Ed Asner, Christopher Plummer, Bob Peterson, D...",http://www.rottentomatoes.com/m/up
25,25,The Dark Knight,2008,With the help of allies Lt. Jim Gordon (Gary O...,94,94.0,"Dark, complex, and unforgettable, The Dark Kni...",344,"250,000+",Action & Adventure,...,"Jul 18, 2008 wide","Jun 14, 2010",$2.0M,2h 32m,Syncopy,"DTS, Dolby Digital, SDDS",Scope (2.35:1),Batman,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",http://www.rottentomatoes.com/m/the_dark_knight
31,31,Harry Potter and the Deathly Hallows: Part 2,2011,A clash between good and evil awaits as young ...,96,89.0,"Thrilling, powerfully acted, and visually dazz...",331,"250,000+",Action & Adventure,...,"Jul 15, 2011 wide","Jul 24, 2014",$381.0M,2h 11m,"Warner Bros., Moving Picture Company, Heyday F...","SDDS, Dolby Digital",,Harry Potter,"Daniel Radcliffe, Rupert Grint, Emma Watson, R...",http://www.rottentomatoes.com/m/harry_potter_a...
36,36,The Hurt Locker,2008,"Staff Sgt. William James (Jeremy Renner), Sgt....",97,84.0,"A well-acted, intensely shot, action filled wa...",289,"50,000+",Action & Adventure,...,,"Jul 22, 2015",$15.7M,2h 7m,"Kingsgate Films, Voltage Pictures, First Light...",,,,"Jeremy Renner, Anthony Mackie, Brian Geraghty,...",http://www.rottentomatoes.com/m/the_hurt_locker
38,38,Star Trek,2009,"Aboard the USS Enterprise, the most-sophistica...",94,91.0,Star Trek reignites a classic franchise with a...,354,"250,000+",Action & Adventure,...,"May 7, 2009 wide","Aug 1, 2013",$257.7M,2h 6m,Bad Robot,,,Star Trek,"Chris Pine, Zachary Quinto, Leonard Nimoy, Eri...",http://www.rottentomatoes.com/m/star_trek_11


In [41]:
df_oscars.head()
df_oscars = df_oscars.rename(columns={"film": "Movie name"})
df_nominations = df_oscars["Movie name"].value_counts()
df_nominations.head()



Movie name
A Star Is Born          25
West Side Story         18
Titanic                 16
Moulin Rouge            15
Mutiny on the Bounty    15
Name: count, dtype: int64

In [42]:
#Merge datafames
merged_dfs = pd.merge(df_box_office, df_rt_top, on="Movie name", how='inner')
df_scores = merged_dfs.drop_duplicates()
#df_expert = pd.merge(df_scores, df_nominations, on="Movie name", how='inner')
#df_expert = df_expert.rename(columns={'count':'Nominations'})
print(len(df_scores))
df_scores.head()

210


Unnamed: 0,Movie name,Box office,Genres,critic_score
0,Mary Poppins,102272727.0,"{""/m/0hj3myq"": ""Children's/Family"", ""/m/04t36""...",98
5,Amy,,"{""/m/07s9rl0"": ""Drama"", ""/m/0219x_"": ""Indie"", ...",95
9,The Lady Vanishes,,"{""/m/01jfsb"": ""Thriller"", ""/m/09blyk"": ""Psycho...",98
11,My Fair Lady,72000000.0,"{""/m/04xvh5"": ""Costume drama"", ""/m/0520lz"": ""R...",95
13,Don't Look Now,,"{""/m/02hmvc"": ""Short Film"", ""/m/01z4y"": ""Comed...",95


In [43]:
#standardize movie names to verify the accuracy of the merge 
import re

def standardize_title(title):
    # Convert to lowercase
    title = title.lower()
    # Remove punctuation and special characters
    title = re.sub(r'[^a-z0-9\s]', '', title)
    # Remove extra whitespace
    title = re.sub(r'\s+', ' ', title).strip()
    return title

df_box_office['Movie name'] = df_box_office['Movie name'].apply(standardize_title)
df_rt_top['Movie name'] = df_rt_top['Movie name'].apply(standardize_title)

std_titles = pd.merge(df_box_office, df_rt_top, on='Movie name', how='inner')
std_df = std_titles.drop_duplicates()
print(len(std_df))
std_df.head()

# we can conclude that the matter of the final dataframe size is not related to the strandardization of the movie titles

221


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rt_top['Movie name'] = df_rt_top['Movie name'].apply(standardize_title)


Unnamed: 0,Movie name,Box office,Genres,critic_score
0,mary poppins,102272727.0,"{""/m/0hj3myq"": ""Children's/Family"", ""/m/04t36""...",98
5,amy,,"{""/m/07s9rl0"": ""Drama"", ""/m/0219x_"": ""Indie"", ...",95
9,the lady vanishes,,"{""/m/01jfsb"": ""Thriller"", ""/m/09blyk"": ""Psycho...",98
11,my fair lady,72000000.0,"{""/m/04xvh5"": ""Costume drama"", ""/m/0520lz"": ""R...",95
13,dont look now,,"{""/m/02hmvc"": ""Short Film"", ""/m/01z4y"": ""Comed...",95


In [None]:
PLOT = DATA_DIR + "plot_summaries.txt"

with open(PLOT, 'r') as file:
    content = file.readlines()[:10]

for line in content:
    print(line.strip())



23890098	Shlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all.
31186339	The nation of Panem consists of a wealthy Capitol and twelve poorer districts. As punishment for a past rebellion, each district must provide a boy and girl  between the ages of 12 and 18 selected by lottery  for the annual Hunger Games. The tributes must fight to the death in an arena; the sole survivor is rewarded with fame and wealth. In her first Reaping, 12-year-old Primrose Everdeen is chosen from District 12. Her older sister Katniss volunteers to take her place. Peeta Mellark, a baker's son who once gave Katniss bread when she was starving, is the other District 12 tribute. Katniss and Peeta are taken to the Capitol, accompanied by their frequently drunk mentor, past victor Haymitch Abernathy. He warns them about the "Career" tributes who train intensively at special academies and alm

In [46]:
try:
    import nltk
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.cluster import KMeans
except ImportError:
    print("Installing necessary packages")
    !pip install nltk scikit-learn

# Download stopwords if not already present
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

import tiktoken
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import pos_tag, word_tokenize

# Initialize objects
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Tokenization function using tiktoken
def tokenize_text(s):
    enc = tiktoken.encoding_for_model('gpt-4')
    tokens = enc.encode(str(s))
    return tokens

# Preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Tokenize text into words
    words = word_tokenize(text)
    
    # Remove stopwords and verbs, then apply stemming
    filtered_words = []
    for word, pos in pos_tag(words):
        if word not in stop_words and pos != 'VB':  # Remove verbs
            stemmed_word = stemmer.stem(word)  # Stem the word
            filtered_words.append(stemmed_word)
    
    return ' '.join(filtered_words)


from sklearn.feature_extraction.text import TfidfVectorizer

def vectorize_texts(texts):
    # Initialize TF-IDF Vectorizer
    vectorizer = TfidfVectorizer()
    # Transform texts to TF-IDF features
    vectors = vectorizer.fit_transform(texts)
    return vectors

from sklearn.cluster import KMeans

def cluster_texts(vectors, num_clusters=5):
    # Initialize K-Means
    kmeans = KMeans(n_clusters=num_clusters, random_state=0)
    # Fit and predict clusters
    labels = kmeans.fit_predict(vectors)
    return labels



def process_and_cluster(texts, num_clusters=5):
    # Step 1: Preprocess each text
    preprocessed_texts = [preprocess_text(text) for text in texts]
    
    # Step 2: Vectorize preprocessed texts
    vectors = vectorize_texts(preprocessed_texts)
    
    # Step 3: Cluster the vectors
    labels = cluster_texts(vectors, num_clusters)
    
    return labels



Installing necessary packages
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp311-cp311-win_amd64.whl.metadata (13 kB)
Collecting click (from nltk)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting joblib (from nltk)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting tqdm (from nltk)
  Downloading tqdm-4.67.0-py3-none-any.whl.metadata (57 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------- ----- 1.3/1.5 MB 6.7 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 6.6 MB/s eta 0:00:00
Downloading scikit_learn-1.5.2-cp311-cp311-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:

NameError: name 'nltk' is not defined