In [3]:
#Sentiment analysis on expert textual reviews to get an “expert likeability-score”
#Correlation analysis (statistical tests) between gross sales, likeability score, and expert ratings

#Clustering of plot keywords amongst plots (pre-filtering using a standards tokenization pipileine 
#(normalization, postword removal, stemming, and, in our case, removing verbs))

#Clustering of review keywords for different expert textual reviews/consensus on “qualitative” movies.



In [117]:
import os
from copy import deepcopy
from itertools import permutations

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from scipy.stats import ttest_ind, chi2_contingency

INIT_DATABASE = "movie.metadata.tsv"
TOP_MOVIES_RT = "rotten_tomatoes_top_movies.csv"
OSCARS = "the_oscar_award.csv"


In [118]:
# Correlation analysis
# create dataset with box-office, number of nominations, critic score, comedy

df_init_db = pd.read_csv(INIT_DATABASE, sep='\t')
df_raw_rt_top = pd.read_csv(TOP_MOVIES_RT)
df_oscars = pd.read_csv(OSCARS)

print(len(df_init_db))


81740


In [119]:
#selection useful columns
column_names = [
    'Wikipedia movie ID',
    'Freebase movie ID',
    'Movie name',
    'Movie release date',
    'Box office',
    'Movie runtime',
    'Movie languages',
    'Movie countries',
    'Genres'
]

df_init_db.columns = column_names
useful_col = ['Movie name','Box office','Genres']
unfiltered_df_box_office = df_init_db[useful_col]
df_revenue = unfiltered_df_box_office[unfiltered_df_box_office['Genres'].str.contains('comedy', case=False, na=False)]
print(len(df_revenue))
count_na = df_revenue['Box office'].isna().sum()
print(count_na)
#df_box_office = df_revenue[df_revenue["Box office"].notna()].copy()
df_box_office = df_revenue.copy()
df_box_office.head()
print(len(df_box_office))




21696
18209
21696


In [120]:
rows = len(df_raw_rt_top)
df_raw_rt_top = df_raw_rt_top.rename(columns={"title": "Movie name"})
df_rt_top = df_raw_rt_top[["Movie name", "critic_score"]]
print(len(df_rt_top))
df_top_2011 = df_raw_rt_top[(df_raw_rt_top['year'] >= 2000) & (df_raw_rt_top['year'] <= 2011)] 
print(len(df_top_2011))
df_top_2011.head()

1610
388


Unnamed: 0.1,Unnamed: 0,Movie name,year,synopsis,critic_score,people_score,consensus,total_reviews,total_ratings,type,...,release_date_(theaters),release_date_(streaming),box_office_(gross_usa),runtime,production_co,sound_mix,aspect_ratio,view_the_collection,crew,link
23,23,Up,2009,"Carl Fredricksen (Ed Asner), a 78-year-old bal...",98,90.0,"An exciting, funny, and poignant adventure, Up...",298,"250,000+",Action & Adventure,...,"May 29, 2009 wide","Nov 21, 2015",,1h 29m,Pixar Animation Studios,,,Pixar,"Ed Asner, Christopher Plummer, Bob Peterson, D...",http://www.rottentomatoes.com/m/up
25,25,The Dark Knight,2008,With the help of allies Lt. Jim Gordon (Gary O...,94,94.0,"Dark, complex, and unforgettable, The Dark Kni...",344,"250,000+",Action & Adventure,...,"Jul 18, 2008 wide","Jun 14, 2010",$2.0M,2h 32m,Syncopy,"DTS, Dolby Digital, SDDS",Scope (2.35:1),Batman,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",http://www.rottentomatoes.com/m/the_dark_knight
31,31,Harry Potter and the Deathly Hallows: Part 2,2011,A clash between good and evil awaits as young ...,96,89.0,"Thrilling, powerfully acted, and visually dazz...",331,"250,000+",Action & Adventure,...,"Jul 15, 2011 wide","Jul 24, 2014",$381.0M,2h 11m,"Warner Bros., Moving Picture Company, Heyday F...","SDDS, Dolby Digital",,Harry Potter,"Daniel Radcliffe, Rupert Grint, Emma Watson, R...",http://www.rottentomatoes.com/m/harry_potter_a...
36,36,The Hurt Locker,2008,"Staff Sgt. William James (Jeremy Renner), Sgt....",97,84.0,"A well-acted, intensely shot, action filled wa...",289,"50,000+",Action & Adventure,...,,"Jul 22, 2015",$15.7M,2h 7m,"Kingsgate Films, Voltage Pictures, First Light...",,,,"Jeremy Renner, Anthony Mackie, Brian Geraghty,...",http://www.rottentomatoes.com/m/the_hurt_locker
38,38,Star Trek,2009,"Aboard the USS Enterprise, the most-sophistica...",94,91.0,Star Trek reignites a classic franchise with a...,354,"250,000+",Action & Adventure,...,"May 7, 2009 wide","Aug 1, 2013",$257.7M,2h 6m,Bad Robot,,,Star Trek,"Chris Pine, Zachary Quinto, Leonard Nimoy, Eri...",http://www.rottentomatoes.com/m/star_trek_11


In [121]:
df_oscars.head()
df_oscars = df_oscars.rename(columns={"film": "Movie name"})
df_nominations = df_oscars["Movie name"].value_counts()
df_nominations.head()



Movie name
A Star Is Born          25
West Side Story         18
Titanic                 16
Moulin Rouge            15
Mutiny on the Bounty    15
Name: count, dtype: int64

In [122]:
#Merge datafames
merged_dfs = pd.merge(df_box_office, df_rt_top, on="Movie name", how='inner')
df_scores = merged_dfs.drop_duplicates()
#df_expert = pd.merge(df_scores, df_nominations, on="Movie name", how='inner')
#df_expert = df_expert.rename(columns={'count':'Nominations'})
print(len(df_scores))
df_scores.head()

210


Unnamed: 0,Movie name,Box office,Genres,critic_score
0,Mary Poppins,102272727.0,"{""/m/0hj3myq"": ""Children's/Family"", ""/m/04t36""...",98
5,Amy,,"{""/m/07s9rl0"": ""Drama"", ""/m/0219x_"": ""Indie"", ...",95
9,The Lady Vanishes,,"{""/m/01jfsb"": ""Thriller"", ""/m/09blyk"": ""Psycho...",98
11,My Fair Lady,72000000.0,"{""/m/04xvh5"": ""Costume drama"", ""/m/0520lz"": ""R...",95
13,Don't Look Now,,"{""/m/02hmvc"": ""Short Film"", ""/m/01z4y"": ""Comed...",95


In [125]:
#standardize movie names to verify the accuracy of the merge 
import re

def standardize_title(title):
    # Convert to lowercase
    title = title.lower()
    # Remove punctuation and special characters
    title = re.sub(r'[^a-z0-9\s]', '', title)
    # Remove extra whitespace
    title = re.sub(r'\s+', ' ', title).strip()
    return title

df_box_office['Movie name'] = df_box_office['Movie name'].apply(standardize_title)
df_rt_top['Movie name'] = df_rt_top['Movie name'].apply(standardize_title)

std_titles = pd.merge(df_box_office, df_rt_top, on='Movie name', how='inner')
std_df = std_titles.drop_duplicates()
print(len(std_df))
std_df.head()

# we can conclude that the matter of the final dataframe size is not related to the strandardization of the movie titles

221


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rt_top['Movie name'] = df_rt_top['Movie name'].apply(standardize_title)


Unnamed: 0,Movie name,Box office,Genres,critic_score
0,mary poppins,102272727.0,"{""/m/0hj3myq"": ""Children's/Family"", ""/m/04t36""...",98
5,amy,,"{""/m/07s9rl0"": ""Drama"", ""/m/0219x_"": ""Indie"", ...",95
9,the lady vanishes,,"{""/m/01jfsb"": ""Thriller"", ""/m/09blyk"": ""Psycho...",98
11,my fair lady,72000000.0,"{""/m/04xvh5"": ""Costume drama"", ""/m/0520lz"": ""R...",95
13,dont look now,,"{""/m/02hmvc"": ""Short Film"", ""/m/01z4y"": ""Comed...",95
