In [None]:
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st
import seaborn as sns
# Use seaborn's "whitegrid" theme for the figures drawn in this notebook.
sns.set_theme(style="whitegrid")

# What is the recipe for the perfect movie in **BOLLYWOOD** vs. in **HOLLYWOOD**?

## Data loading

### CMU dataset

| Column name          | Description                                                                                                                                                                                       |   |   |   |
|----------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---|---|---|
| wikipedia_movie_id | ID of the movie on Wikipedia                                                                                                                                                 |   |   |   |
| freebase_movie_id| ID of the movie on Freebase                                                                                                                                            |   |   |   |
| movie_name | Name of the movie                                                                                                                                                |   |   |   |
| movie_release_date  | Date the movie was released                                                                                                                                      |   |   |   |
| movie_box_office_revenue  | Box office revenue of the movie                                                                                                                           |   |   |   |
| movie_runtime  | Run time of the movie                                                                                                                                                 |   |   |   |
| movie_languages | Languages of the movie                                                                                                                                                  |   |   |   |
| movie_countries | Countries where the movie was produced                                                                                                                                  |   |   |   |
| movie_genres   | Genre of the movie                                                                                                                                              |   |   |   |

The movie data set contains 81741 rows.



In [None]:
# Folder holding the raw data files, relative to the notebook.
data_folder = './data/'

# The file is read as header-less: the column names are supplied explicitly.
names = ['wikipedia_movie_id','freebase_movie_id', 'movie_name', 'movie_release_date', 'movie_box_office_revenue', 
        'movie_runtime', 'movie_languages', 'movie_countries', 'movie_genres']

movies_data = pd.read_csv(data_folder + 'movie.metadata.tsv', names=names, sep='\t')

# Lower-case the text columns so they can later be matched against other sources.
# The per-column string accessor is used instead of `applymap(str.lower)`:
# it keeps missing values as NaN (str.lower raises on them) and does not rely
# on the deprecated DataFrame.applymap.
text_columns = ['movie_name', 'movie_languages', 'movie_countries', 'movie_genres']
movies_data[text_columns] = movies_data[text_columns].apply(lambda column: column.str.lower())

movies_data.head()

### IMdb datasets (ratings, runtimes, isAdult)

In [None]:
# Load the two gzip-compressed, tab-separated IMDb files: ratings and basic title info.
imdb_read_options = dict(sep='\t', compression='gzip')
ratings = pd.read_csv(data_folder + 'title.ratings.tsv.gz', **imdb_read_options)
titles = pd.read_csv(data_folder + 'title.basics.tsv.gz', **imdb_read_options)

# TV episodes are not included in the CMU dataset, so they are filtered out here.
is_tv_episode = titles['titleType'].eq('tvEpisode')
titles = titles.loc[~is_tv_episode]

## Preprocessing

### IMdb and CMU datasets merging

In [None]:
# Attach the rating columns to the title table on the IMDb title id (tconst)
# and keep only the columns used later in the notebook.
rates = titles.merge(ratings, how='left', on='tconst')[['averageRating', 'numVotes', 'originalTitle', 'isAdult', 'runtimeMinutes']]

# Lower-cased titles become the join key with the CMU dataset; the original
# mixed-case column is no longer needed afterwards.
rates['movie_name'] = rates['originalTitle'].astype(str).str.lower()
rates = rates.drop(columns='originalTitle')

# A title that appears more than once cannot be matched unambiguously by name,
# so every occurrence of such a title is removed (keep=False).
rates = rates.drop_duplicates('movie_name', keep=False)

# Convert runtimes to float. '\\N' (the missing-value marker handled by the
# original code) and any other non-numeric token become NaN instead of making
# float() raise a ValueError.
rates['runtimeMinutes'] = pd.to_numeric(rates['runtimeMinutes'], errors='coerce').astype(float)

In [None]:
# Left-join the IMDb information onto the CMU movies using the lower-cased title.
movies_data_merged = movies_data.merge(rates, on='movie_name', how='left')

# The CMU runtime has priority; the IMDb runtime is only used to fill the
# rows where the CMU value is missing.
cmu_runtime = movies_data_merged['movie_runtime']
imdb_runtime = movies_data_merged['runtimeMinutes']
movies_data_merged['movie_runtime'] = cmu_runtime.fillna(imdb_runtime)

# The IMDb runtime column has served its purpose and is removed.
movies_data_merged = movies_data_merged.drop(columns='runtimeMinutes')

In [None]:
# Exact `movie_countries` values identifying movies attributed to a single country.
india_only = '{"/m/03rk0": "india"}'
usa_only = '{"/m/09c7w0": "united states of america"}'

# reset_index() renumbers the rows from 0 and keeps the previous labels in an 'index' column.
produced_in = movies_data_merged['movie_countries']
indian_movies = movies_data_merged.loc[produced_in.eq(india_only)].reset_index()
american_movies = movies_data_merged.loc[produced_in.eq(usa_only)].reset_index()

print('There are %d indian films and %d american films in the dataset' % (len(indian_movies), len(american_movies)))

## Missing values

In [None]:
# Print column dtypes and non-null counts of the Indian subset to spot missing values.
indian_movies.info()

In [None]:
# Print column dtypes and non-null counts of the American subset to spot missing values.
american_movies.info()

We notice that the movie box office revenue is very often unavailable: it is present for only a very limited number of movies in both datasets. Thus we probably won't use it further.

The information merged from the IMDb dataset (averageRating, numVotes and isAdult) is available for almost half of the dataset, which keeps it usable despite a significant loss of data points.

## Analysis of ratings

### Some global infos about ratings

In [None]:
# Share of movies with an IMDb rating, computed as 100 minus the percentage of missing values.
indian_unrated = indian_movies['averageRating'].isna().sum()
american_unrated = american_movies['averageRating'].isna().sum()
indian_rated_pct = 100 - indian_unrated / indian_movies.shape[0] * 100
american_rated_pct = 100 - american_unrated / american_movies.shape[0] * 100
print('%.3f percent of indian movies have a rating and %.3f percent of american movies have a rating' %
      (indian_rated_pct, american_rated_pct))

# Mean rating per country (missing ratings are skipped by the mean).
indian_mean_rating = indian_movies['averageRating'].mean()
american_mean_rating = american_movies['averageRating'].mean()
print('Average rating of indian movies : %.3f \tAverage rating of american movies : %.3f' %
      (indian_mean_rating, american_mean_rating))

### Distributions of movie ratings

In [None]:
# Overlay the rating histograms of both countries on a single set of axes.
fig, ax = plt.subplots(figsize=(14, 5))
sns.histplot(data=american_movies, x='averageRating', bins=25, alpha=0.5, label='American', ax=ax)
sns.histplot(data=indian_movies, x='averageRating', bins=25, alpha=0.5, color='orange', label='Indian', ax=ax)
ax.set_title('Distribution of movies ratings')
ax.legend()
plt.show()

In [None]:
# Normality test on each rating distribution; missing ratings are omitted.
american_pvalue = st.normaltest(american_movies['averageRating'], nan_policy='omit').pvalue
indian_pvalue = st.normaltest(indian_movies['averageRating'], nan_policy='omit').pvalue
print('P-value for normality test of american ratings : %.3f \tP-value for normality test of indian ratings : %.3f ' 
      % (american_pvalue, indian_pvalue))

Indian and American movies have similar rating distributions. This makes it easier to use the ratings as an unbiased tool to estimate the success of a movie in both countries.

### Distribution of number of votes

In [None]:
fig, ax = plt.subplots(figsize=(10, 5))

indian_votes = indian_movies['numVotes']
american_votes = american_movies['numVotes']

# Build log-spaced bin edges for the log-log plot: take the first and last
# edge of a linear binning (30 bins for India, 40 for the USA) and spread the
# same number of edges evenly in log10 space between them.
indian_edges = np.histogram_bin_edges(indian_votes.dropna(), bins=30)
american_edges = np.histogram_bin_edges(american_votes.dropna(), bins=40)
indian_log_edges = np.logspace(np.log10(indian_edges[0]), np.log10(indian_edges[-1]), len(indian_edges))
american_log_edges = np.logspace(np.log10(american_edges[0]), np.log10(american_edges[-1]), len(american_edges))

# Draw both histograms with logarithmic axes.
ax.hist(american_votes, bins=american_log_edges, label='American', alpha=.5)
ax.hist(indian_votes, bins=indian_log_edges, label='Indian', alpha=.5)
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_xlabel('Number of votes')
ax.set_ylabel('Frequency')
ax.set_title('American movies get more votes than Indians')
ax.legend()
plt.show()

### Centering the data around each means for further analysis

In [None]:
#center data on mean
indian_movies.loc[:,'centeredRating']=indian_movies['averageRating'].copy()-np.mean(indian_movies['averageRating'])
american_movies.loc[:,'centeredRating']=american_movies['averageRating'].copy()-np.mean(american_movies['averageRating'])
print('Centered average rating of indian movies : %.3f \tCentered average rating of american movies : %.3f' %
(np.mean(indian_movies['centeredRating']), np.mean(american_movies['centeredRating'])))

## Runtimes analysis

### Some global infos about runtimes

In [None]:
# Share of movies with a known runtime, computed as 100 minus the percentage of missing values.
indian_no_runtime = indian_movies['movie_runtime'].isna().sum()
american_no_runtime = american_movies['movie_runtime'].isna().sum()
indian_runtime_pct = 100 - indian_no_runtime / indian_movies.shape[0] * 100
american_runtime_pct = 100 - american_no_runtime / american_movies.shape[0] * 100
print('%.3f percent of indian movies have a runtime and %.3f percent of american movies have a runtime' %
      (indian_runtime_pct, american_runtime_pct))

# Mean runtime per country (missing runtimes are skipped by the mean).
indian_mean_runtime = indian_movies['movie_runtime'].mean()
american_mean_runtime = american_movies['movie_runtime'].mean()
print('Average runtime of indian movies : %.3f \tAverage runtime of american movies : %.3f' %
      (indian_mean_runtime, american_mean_runtime))


### Distribution of movie runtimes

In [None]:
#defining number of bins to get roughly equal width between the 2 histograms
b = 75
b2 = int(b*indian_movies['movie_runtime'].max()/american_movies['movie_runtime'].drop(index=np.argmax(american_movies['movie_runtime'])).max())

#plotting
plt.figure(figsize=(10,5))
plt.hist(american_movies['movie_runtime'].drop(index=np.argmax(american_movies['movie_runtime'])), density=True, bins = b, alpha = 0.4, label='american')
plt.hist(indian_movies['movie_runtime'], density=True, bins = b2, alpha=.4, label='indian')
plt.yscale('log')
plt.xlabel('minutes')
plt.ylabel('frequency')
plt.title(f'Distribution of runtimes (outlier of removed)')
plt.legend()
plt.show()

**Note**: We found an outlier among the runtimes coming from the CMU dataset: a film of more than 1 million minutes. We believe this outlier is a data-entry error made when the CMU dataset was built, so we can reasonably remove it from our analysis.

### Visualizing the high density range of runtimes (0 to 500 min)

In [None]:
#reduced datasets with movies that have a runtime <500 min
runred_indian = indian_movies[indian_movies['movie_runtime']<500]
runred_american = american_movies[american_movies['movie_runtime']<500]

In [None]:
#defining number of bins to get roughly equal width between the 2 histograms
b = 50
b2 = int(b*indian_movies['movie_runtime'].max()/american_movies['movie_runtime'].drop(index=np.argmax(american_movies['movie_runtime'])).max())

#plotting
plt.figure(figsize=(10,5))
plt.hist(runred_indian['movie_runtime'], density=True, bins = b2, alpha=.4, label='indian')
plt.hist(runred_american['movie_runtime'].drop(np.argmax(runred_american['movie_runtime'])), density=True, bins = b, alpha = 0.4, label='american')
plt.yscale('log')
plt.xlabel('minutes')
plt.ylabel('frequency')
plt.title(f'Distribution of runtimes (zoom on films between 0 and 500 minutes)')
plt.legend()
plt.show()

We notice that Indian movies tend to be slightly longer, by ~20 minutes. It would be interesting to study further whether this tendency comes from recent movies or from older ones. Are runtimes of Indian and American movies converging towards a global equilibrium, revealing a standardization?

## Genre analysis

In [None]:
# Work on a copy with an extra column holding the genre string without its
# first and last character (the surrounding braces).
test_indian = indian_movies.assign(
    cleared_movie_genres=indian_movies['movie_genres'].str.slice(1, -1)
)
# Discard the films whose genre string is empty once the braces are removed.
test_indian = test_indian.loc[test_indian['cleared_movie_genres'] != '']

test_indian.head()

In [None]:
#create list with all genres
all_genres_listed_indian = test_indian['cleared_movie_genres'].str.split(pat=",")
#all_genres_listed_indian=[ele[0].split(':') for ele in all_genres_listed_indian]
ls = []
for i in all_genres_listed_indian:
    ls.extend(i)
 
all_genres_indian = pd.Series(ls)
all_genres_indian = all_genres_indian.str.strip().value_counts()
all_genres_indian.index = [ele.split(':')[1] for ele in all_genres_indian.index]

In [None]:
#ploting taking the first 50 genres
first_genres_indian = all_genres_indian[:50]
f, ax = plt.subplots(figsize=(6, 10))
sns.barplot(y = first_genres_indian.index, x = first_genres_indian).set_title('Movie genres apparition in indian movies')
sns.despine(left=True, bottom=True)

In [None]:
# Work on a copy with an extra column holding the genre string without its
# first and last character (the surrounding braces).
test_american = american_movies.assign(
    cleared_movie_genres=american_movies['movie_genres'].str.slice(1, -1)
)
# Discard the films whose genre string is empty once the braces are removed.
test_american = test_american.loc[test_american['cleared_movie_genres'] != '']
test_american.head()

In [None]:
#create list with all genres
all_genres_listed_american = test_american['cleared_movie_genres'].str.split(pat=",")
#all_genres_listed_american=[ele[0].split(':') for ele in all_genres_listed_american]

ls = []
for i in all_genres_listed_american:
    ls.extend(i)
 
all_genres_american = pd.Series(ls)
all_genres_american = all_genres_american.str.strip().value_counts()
all_genres_american.index=[ele.split(':')[1] for ele in all_genres_american.index]

In [None]:
#ploting taking the first 50 genres
first_genres_american = all_genres_american[:50]

f, ax = plt.subplots(figsize=(6, 10))
sns.barplot(y = first_genres_american.index, x = first_genres_american).set_title('Movie genres apparition in American movies')
sns.despine(left=True, bottom=True)

In [None]:
# Turn genre counts into frequencies by dividing by the number of movies of
# each country, then keep the first 50 entries of each series.
all_genres_american_frequency = all_genres_american.div(len(american_movies))
all_genres_indian_frequency = all_genres_indian.div(len(indian_movies))

first_genres_american_frequency = all_genres_american_frequency.head(50)
first_genres_indian_frequency = all_genres_indian_frequency.head(50)

In [None]:
#ploting taking the first 50 genres
fig, ax = plt.subplots(1,2,figsize=(10, 16),sharex = True, constrained_layout=True)
fig.suptitle('Movie genres frequency in american and indian films')
ax[0].set_title('America')
ax[1].set_title('India')
sns.barplot(ax = ax[0],y = first_genres_american_frequency.index, x = first_genres_american_frequency, label="American", color="b")

sns.barplot(ax = ax[1], y = first_genres_indian_frequency.index, x = first_genres_indian_frequency, label="Indian", color="r")


### Cleaning data 

Neither our Indian nor our American movie dataset contains any duplicates on the Wikipedia movie ID or on the Freebase ID: the counts below match the sizes of the datasets.

In [None]:
# Number of rows left in each frame after removing duplicated ids.
indian_by_wiki = indian_movies.drop_duplicates(subset='wikipedia_movie_id')
indian_by_freebase = indian_movies.drop_duplicates(subset='freebase_movie_id')
american_by_wiki = american_movies.drop_duplicates(subset='wikipedia_movie_id')
american_by_freebase = american_movies.drop_duplicates(subset='freebase_movie_id')

print('wiki ID, indian: ', len(indian_by_wiki), '\nfreebase ID, indian: ', len(indian_by_freebase))
print('wiki ID, american: ', len(american_by_wiki), '\nfreebase ID, american: ', len(american_by_freebase))

## Characters data

450'668 characters in raw data

134'079 different actors

5'794 different actors in Indian movies

59'398 different actors in American movies


In [None]:
# Column names for character.metadata.tsv, which is read as a header-less file.
names = [
    'wikipedia_movie_id',
    'freebase_movie_id',
    'movie_release_date',
    'character_name',
    'actor_dob',
    'actor_gender',
    'actor_height',
    'actor_ethnicity',
    'actor_name',
    'actor_age_at_movie_release',
    'char_act_id',
    'freebase_character_id',
    'freebase_actor_id',
]
characters_data = pd.read_csv(data_folder + 'character.metadata.tsv', sep='\t', names=names)

characters_data.head(n=10)

In [None]:
# Lower-case character and actor names; values that are not strings (such as
# missing values) are left untouched. Done column by column with Series.map so
# the cell does not depend on the deprecated DataFrame.applymap.
for text_column in ('character_name', 'actor_name'):
    characters_data[text_column] = characters_data[text_column].map(
        lambda value: value.lower() if type(value) is str else value
    )

In [None]:
# Preview the first rows of the characters table.
characters_data.head()

In [None]:
# For each country, keep the character rows whose movie belongs to that
# country's subset, then keep one row per distinct actor name.
character_movie_ids = characters_data['wikipedia_movie_id']
in_american_movie = character_movie_ids.isin(american_movies['wikipedia_movie_id'])
in_indian_movie = character_movie_ids.isin(indian_movies['wikipedia_movie_id'])

unique_american_actor = characters_data.loc[in_american_movie].drop_duplicates(subset='actor_name')
unique_indian_actor = characters_data.loc[in_indian_movie].drop_duplicates(subset='actor_name')

In [None]:
# Preview the first rows of the de-duplicated American actor table.
unique_american_actor.head()

In [None]:
# All character rows (duplicated actors included) belonging to the movies of each country.
# The second statement used to repeat the American assignment verbatim, so the
# Indian counterpart was never built.
american_character = characters_data[characters_data['wikipedia_movie_id'].isin(american_movies['wikipedia_movie_id'])]
indian_character = characters_data[characters_data['wikipedia_movie_id'].isin(indian_movies['wikipedia_movie_id'])]

## Name clusters data

In [None]:
# name.clusters.txt is read as a header-less, tab-separated file with two columns.
names = ['character_name', 'actor_reference']
name_clusters_path = data_folder + 'name.clusters.txt'
name_clusters_data = pd.read_csv(name_clusters_path, sep='\t', names=names)

name_clusters_data.head(n=10)

## **Tvtropes clusters data**

In [None]:
# tvtropes.clusters.txt is read as a header-less, tab-separated file:
# a character type followed by a description of one instance of that type.
tvt_columns = ['character_type', 'instances']
tvt_rope = pd.read_csv(data_folder + 'tvtropes.clusters.txt', names=tvt_columns, sep='\t')

# Number of instances, followed by a display of the table.
print(tvt_rope.shape[0])
tvt_rope

### Formatting data

In [None]:
# Each `instances` value is parsed as a JSON object with the keys
# "char", "movie", "id" and "actor" (assumed from the fields the previous
# string-splitting code extracted; confirm against the raw file).
# Parsing it as JSON, instead of stripping the quotes and splitting on ',' and
# ':', keeps values that contain commas or colons (e.g. movie titles) intact
# and does not leave a leading blank in front of every value.
# Note: `tvt_rope['instances']` is no longer modified in place.
parsed_instances = pd.DataFrame(
    [json.loads(instance) for instance in tvt_rope['instances']],
    index=tvt_rope.index,
)

cleaned_tvt = parsed_instances[['char', 'movie', 'id', 'actor']].rename(
    columns={'char': 'character_name', 'movie': 'movie_name', 'id': 'char_act_id', 'actor': 'actor_name'}
)

# Attach the character type of each instance (same row index as the parsed values).
characters = tvt_rope.character_type
final_tvt = cleaned_tvt.join(characters, how='left')

# Case-fold the text columns so they can be matched against the lower-cased
# names of the other tables; missing values stay NaN.
tvt_text_columns = ['character_name', 'movie_name', 'actor_name', 'character_type']
final_tvt[tvt_text_columns] = final_tvt[tvt_text_columns].apply(lambda column: column.str.casefold())

final_tvt.head()

In [None]:
# Names of the actors that appear in American movies. Missing names are
# dropped rather than converted to the literal string 'nan'.
american_actors = unique_american_actor['actor_name'].dropna()

# Keep the tvtropes instances whose actor appears in American movies.
# Rows without an actor name are excluded on a temporary frame: the former
# `final_tvt.actor_name = final_tvt.actor_name.dropna()` re-aligned the NaN
# values back into the column, so it removed nothing while rewriting `final_tvt`.
american_tvt = final_tvt.dropna(subset=['actor_name']).merge(american_actors, on='actor_name')

american_tvt

In [None]:
# Same selection expressed as a boolean filter: tvtropes rows whose actor
# name is among the actors of American movies.
played_by_american_actor = final_tvt['actor_name'].isin(unique_american_actor['actor_name'])
american_tvt2 = final_tvt.loc[played_by_american_actor]

american_tvt2

In [None]:
# Alternative selection by movie title: tvtropes rows whose movie name is
# among the American movie names.
# NOTE(review): this rebinds `american_tvt2`, replacing the actor-based
# selection made in the previous cell.
from_american_movie = final_tvt['movie_name'].isin(american_movies['movie_name'])
american_tvt2 = final_tvt.loc[from_american_movie]

american_tvt2

## Summaries data

In [None]:
# Read the file with '\r' as separator into a single 'summaries' column
# (presumably so that every line ends up as one field; confirm), then split
# each value on tabs into separate columns.
raw_summary_lines = pd.read_csv(data_folder + 'plot_summaries.txt', sep = '\r', names = ['summaries'])
summaries = (
    raw_summary_lines['summaries']
    .str.split("\t", expand = True)
    .rename(columns = {0: 'wikipedia_movie_id', 1: 'summaries'})
)

# Lower-case the text and make the movie id an integer so it can be matched
# against the ids of the movie tables.
summaries = summaries.assign(
    summaries = summaries['summaries'].str.lower(),
    wikipedia_movie_id = summaries['wikipedia_movie_id'].astype(int),
)
summaries.head(10)

In [None]:
# Plot summaries of the movies present in the Indian subset.
is_indian_summary = summaries['wikipedia_movie_id'].isin(indian_movies['wikipedia_movie_id'])
indian_summaries = summaries.loc[is_indian_summary]
indian_summaries

In [None]:
# Plot summaries of the movies present in the American subset.
is_american_summary = summaries['wikipedia_movie_id'].isin(american_movies['wikipedia_movie_id'])
american_summaries = summaries.loc[is_american_summary]
american_summaries

In [None]:
# Split every Indian summary on whitespace and flatten the result into one long series of words.
indian_tokens_per_summary = indian_summaries['summaries'].str.split()
indian_words = pd.Series([word for tokens in indian_tokens_per_summary for word in tokens])

# Keep the words longer than 4 characters and count their occurrences
# (value_counts sorts from most to least frequent).
indian_long_words = indian_words[indian_words.str.len() > 4]
indian_separated_words = indian_long_words.value_counts()

In [None]:
# Horizontal bar chart of the 100 most frequent long words in Indian summaries.
indian_separated_words = indian_separated_words.head(100)
f, ax = plt.subplots(figsize=(6, 20))
sns.barplot(x=indian_separated_words, y=indian_separated_words.index, ax=ax)

In [None]:
# Split every American summary on whitespace and flatten the result into one long series of words.
american_tokens_per_summary = american_summaries['summaries'].str.split()
american_words = pd.Series([word for tokens in american_tokens_per_summary for word in tokens])

# Keep the words longer than 4 characters and count their occurrences
# (value_counts sorts from most to least frequent).
american_long_words = american_words[american_words.str.len() > 4]
american_separated_words = american_long_words.value_counts()

In [None]:
# Horizontal bar chart of the 100 most frequent long words in American summaries.
american_separated_words = american_separated_words.head(100)
f, ax = plt.subplots(figsize=(6, 20))
sns.barplot(x=american_separated_words, y=american_separated_words.index, ax=ax)