In [None]:
import json
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st
import seaborn as sns
sns.set_theme(style="whitegrid")

## Movies Metadata

### **Data description**

| Column name          | Description                                                                                                                                                                                       |   |   |   |
|----------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---|---|---|
| wikipedia_movie_id | ID of the movie from wikipedia                                                                                                                                                 |   |   |   |
| freebase_movie_id| ID of the movie from Freebase                                                                                                                                           |   |   |   |
| movie_name | Name of the movie                                                                                                                                                |   |   |   |
| movie_release_date  | Date the movie was released                                                                                                                                      |   |   |   |
| movie_box_office_revenue  | Box office revenue of the movie                                                                                                                           |   |   |   |
| movie_runtime  | Run time of the movie                                                                                                                                                 |   |   |   |
| movie_languages | Languages of the movie                                                                                                                                                  |   |   |   |
| movie_countries | Countries where the movie was produced                                                                                                                                  |   |   |   |
| movie_genres   | Genre of the movie                                                                                                                                              |   |   |   |

The movie data set contains 81741 rows.



In [None]:
# Folder holding the raw data files; the loading cells below build their paths from it.
data_folder = './data/'

# The TSV file is read without a header row, so the column names are supplied explicitly.
names = ['wikipedia_movie_id', 'freebase_movie_id', 'movie_name', 'movie_release_date',
         'movie_box_office_revenue', 'movie_runtime', 'movie_languages', 'movie_countries',
         'movie_genres']

movies_data = pd.read_csv(data_folder + 'movie.metadata.tsv', names=names, sep='\t')

# Lower-case the text columns so that later string comparisons (country filter,
# title joins) are case-insensitive.
# The vectorised .str accessor keeps missing values as NaN, whereas calling
# str.lower(x) on a NaN cell raises TypeError; it also avoids DataFrame.applymap,
# which is deprecated in recent pandas releases.
text_columns = ['movie_name', 'movie_languages', 'movie_countries', 'movie_genres']
movies_data[text_columns] = movies_data[text_columns].apply(lambda column: column.str.lower())

movies_data.head()

In [None]:
# Read the ratings table and the titles table (both gzip-compressed TSV files).
ratings = pd.read_csv(data_folder + 'title.ratings.tsv.gz', sep='\t', compression='gzip')
titles = pd.read_csv(data_folder + 'title.basics.tsv.gz', sep='\t', compression='gzip')

# Attach each rating to its original title, turn the lower-cased title into the
# `movie_name` join key, and keep only the first row for every distinct key.
rates = (
    ratings.merge(titles, how='left', on='tconst')[['averageRating', 'numVotes', 'originalTitle']]
    .assign(movie_name=lambda frame: frame['originalTitle'].astype(str).str.lower())
    .drop(columns='originalTitle')
    .drop_duplicates('movie_name')
)

In [None]:
# Display the (rows, columns) dimensions of the loaded movie metadata.
movies_data.shape

In [None]:
# Exact (lower-cased) country strings: only movies whose country field is
# precisely this single entry are selected.
india_only = '{"/m/03rk0": "india"}'
usa_only = '{"/m/09c7w0": "united states of america"}'

movie_country = movies_data['movie_countries']

# Left-merge the ratings on the lower-cased title so movies without a rating are kept.
indian_movies = movies_data.loc[movie_country.eq(india_only)].merge(rates, on='movie_name', how='left')
american_movies = movies_data.loc[movie_country.eq(usa_only)].merge(rates, on='movie_name', how='left')

print(len(indian_movies), len(american_movies))

In [None]:
# Percentage of movies that received a rating from the merge.
indian_unrated = indian_movies['averageRating'].isna().sum()
american_unrated = american_movies['averageRating'].isna().sum()
indian_rated_pct = 100 - indian_unrated / indian_movies.shape[0] * 100
american_rated_pct = 100 - american_unrated / american_movies.shape[0] * 100
print('%.3f percent of indian movies have a rating and %.3f percent of american movies have a rating' %
      (indian_rated_pct, american_rated_pct))

# Mean rating per country (the pandas mean skips missing ratings).
indian_mean_rating = indian_movies['averageRating'].mean()
american_mean_rating = american_movies['averageRating'].mean()
print('Average rating of indian movies : %.3f \tAverage rating of american movies : %.3f' %
      (indian_mean_rating, american_mean_rating))

In [None]:
# Side-by-side histograms of the average rating: Indian on the left, American on the right.
fig, (indian_ax, american_ax) = plt.subplots(1, 2, figsize=(14, 5))

sns.histplot(x='averageRating', data=indian_movies, ax=indian_ax)
indian_ax.set_title('Distribution of indian movies ratings')

sns.histplot(x='averageRating', data=american_movies, ax=american_ax)
american_ax.set_title('Distribution of american movies ratings')

plt.show()

In [None]:
# Ratio of rated Indian movies to rated American movies; it scales the number
# of bins used for the American histogram below.
fac = indian_movies['averageRating'].count() / american_movies['averageRating'].count()

fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(indian_movies['numVotes'], density=True, bins=15, alpha=.4, label='indian')
ax.hist(american_movies['numVotes'], density=True, bins=int(15/fac), alpha=0.4, label='american')
ax.set_yscale('log')
ax.set_title('American movies received much more votes than indian movies')
ax.legend()
plt.show()

In [None]:
# Normality test of the ratings, with missing ratings omitted.
american_normality = st.normaltest(american_movies['averageRating'], nan_policy='omit')
indian_normality = st.normaltest(indian_movies['averageRating'], nan_policy='omit')

print('P-value for normality test of american movies : %.3f \tP-value for normality test of indian movies : %.3f '
      % (american_normality.pvalue, indian_normality.pvalue))

In [None]:
# Subtract each country's own mean rating so both distributions are centred on zero.
for frame in (indian_movies, american_movies):
    frame['centeredRating'] = frame['averageRating'] - frame['averageRating'].mean()

indian_centered_mean = indian_movies['centeredRating'].mean()
american_centered_mean = american_movies['centeredRating'].mean()
print('Centered average rating of indian movies : %.3f \tCentered average rating of american movies : %.3f' %
      (indian_centered_mean, american_centered_mean))

In [None]:
# Preview the Indian movies table, which now carries the rating columns and `centeredRating`.
indian_movies.head()

## Numbers of Indian vs American movies per year

In [None]:
indian_movies_years = indian_movies.copy()

# Keep only the first four characters (the year) of each release date and cast
# them to integers. Rows without a release date are dropped before the cast and
# therefore come back as NaN once the result is aligned with the full frame.
indian_movies_years['movie_release_date'] = (
    indian_movies['movie_release_date'].dropna().astype('str').str[:4].astype('int')
)

# Count the movies per release year (groupby ignores the NaN years), expose the
# year as a regular column, restore the integer dtype lost to the NaN alignment
# above, and sort by year.
# Assigning a sorted Series back to the column does not reorder anything
# (pandas re-aligns it on the index), so the frame itself is sorted here.
n_movie_per_year_indian = (
    indian_movies_years
    .groupby('movie_release_date')['movie_name']
    .agg(['count'])
    .reset_index()
    .astype({'movie_release_date': int})
    .sort_values('movie_release_date')
    .reset_index(drop=True)
)

n_movie_per_year_indian


In [None]:
american_movies_years = american_movies.copy()

# Keep only the first four characters (the year) of each release date and cast
# them to integers. Rows without a release date are dropped before the cast and
# therefore come back as NaN once the result is aligned with the full frame.
american_movies_years['movie_release_date'] = (
    american_movies['movie_release_date'].dropna().astype('str').str[:4].astype('int')
)

# Count the movies per release year (groupby ignores the NaN years), expose the
# year as a regular column, restore the integer dtype lost to the NaN alignment
# above, and sort by year.
# Assigning a sorted Series back to the column does not reorder anything
# (pandas re-aligns it on the index), so the frame itself is sorted here.
n_movie_per_year_american = (
    american_movies_years
    .groupby('movie_release_date')['movie_name']
    .agg(['count'])
    .reset_index()
    .astype({'movie_release_date': int})
    .sort_values('movie_release_date')
    .reset_index(drop=True)
)

n_movie_per_year_american

Let us slice the data to have the same time frame: from 1912 to 2014 for both Indian and American movies.

In [None]:
# Keep only the American yearly counts for release years 1912 through 2014 (both ends included).
n_movie_per_year_american = n_movie_per_year_american.loc[
    lambda counts: counts['movie_release_date'].between(1912, 2014)
]

In [None]:
# One line per country: number of movies released each year.
fig, ax = plt.subplots()
sns.lineplot(ax=ax, data=n_movie_per_year_indian, x='movie_release_date', y='count', label='indian')
sns.lineplot(ax=ax, data=n_movie_per_year_american, x='movie_release_date', y='count', label='american')
ax.set_title('Indian vs American movies per year ')
ax.set_xlabel('Movie release year')
ax.set_ylabel('Number of movies per year')

We can observe that, from the 2000s onwards, the number of released movies climbs towards a peak for both Hollywood and Bollywood.

In [None]:
# Same two lines as the previous figure, with the axes limited to 2000-2014 and 0-1000 movies.
fig, ax = plt.subplots()
sns.lineplot(ax=ax, data=n_movie_per_year_indian, x='movie_release_date', y='count', label='indian')
sns.lineplot(ax=ax, data=n_movie_per_year_american, x='movie_release_date', y='count', label='american')
ax.set_title('Indian vs American movies per year, from the 2000s')
ax.set_xlabel('Movie release year')
ax.set_ylabel('Number of movies per year')
ax.axis([2000, 2014, 0, 1000])

The number of released movies peaks in 2006 for Hollywood and in 2008 for Bollywood.

In 2014 the count drops back to nearly 0 released films; we should carefully investigate the data set to explain this lack of data.

## Movie Genre


In [None]:
# Work on a copy of the Indian movies with the first and last character
# (the surrounding {}) removed from the genre field.
test_indian = indian_movies.assign(
    cleared_movie_genres=indian_movies['movie_genres'].str.slice(1, -1)
)
# Discard the films whose genre field is empty once the braces are gone.
test_indian = test_indian[test_indian['cleared_movie_genres'] != '']

test_indian.head()

In [None]:
# One list of raw genre entries per movie, obtained by splitting on ","
all_genres_listed_indian = test_indian['cleared_movie_genres'].str.split(pat=",")

# Flatten the per-movie lists into a single list of entries
ls = [entry for entries in all_genres_listed_indian for entry in entries]

# Count how often each (whitespace-stripped) entry occurs
all_genres_indian = pd.Series(ls).str.strip().value_counts()

# Relabel each entry with the second piece of its ":"-split
all_genres_indian.index = all_genres_indian.index.map(lambda entry: entry.split(':')[1])

In [None]:
# Horizontal bar chart of the first 50 genres (the counts are ordered by value_counts)
first_genres_indian = all_genres_indian.iloc[:50]

f, ax = plt.subplots(figsize=(6, 10))
sns.barplot(ax=ax, y=first_genres_indian.index, x=first_genres_indian)
ax.set_title('Movie genres apparition in indian movies')
sns.despine(left=True, bottom=True)

In [None]:
# Work on a copy of the American movies with the first and last character
# (the surrounding {}) removed from the genre field.
test_american = american_movies.assign(
    cleared_movie_genres=american_movies['movie_genres'].str.slice(1, -1)
)
# Discard the films whose genre field is empty once the braces are gone.
test_american = test_american[test_american['cleared_movie_genres'] != '']

test_american.head()

In [None]:
# One list of raw genre entries per movie, obtained by splitting on ","
all_genres_listed_american = test_american['cleared_movie_genres'].str.split(pat=",")

# Flatten the per-movie lists into a single list of entries
ls = [entry for entries in all_genres_listed_american for entry in entries]

# Count how often each (whitespace-stripped) entry occurs
all_genres_american = pd.Series(ls).str.strip().value_counts()

# Relabel each entry with the second piece of its ":"-split
all_genres_american.index = all_genres_american.index.map(lambda entry: entry.split(':')[1])

In [None]:
# Horizontal bar chart of the first 50 genres (the counts are ordered by value_counts)
first_genres_american = all_genres_american.iloc[:50]

f, ax = plt.subplots(figsize=(6, 10))
sns.barplot(ax=ax, y=first_genres_american.index, x=first_genres_american)
ax.set_title('Movie genres apparition in American movies')
sns.despine(left=True, bottom=True)

In [None]:
# Genre counts divided by the number of movies of the country
# (all movies of that country, including those without a genre).
all_genres_american_frequency = all_genres_american.div(len(american_movies))
all_genres_indian_frequency = all_genres_indian.div(len(indian_movies))

# First 50 genres of each country
first_genres_american_frequency = all_genres_american_frequency.iloc[:50]
first_genres_indian_frequency = all_genres_indian_frequency.iloc[:50]

In [None]:
# Two panels sharing the x axis: American genre frequencies on the left, Indian on the right
fig, ax = plt.subplots(1, 2, figsize=(10, 16), sharex=True, constrained_layout=True)
american_ax, indian_ax = ax

fig.suptitle('Movie genres frequency in american and indian films')
american_ax.set_title('America')
indian_ax.set_title('India')

sns.barplot(ax=american_ax, y=first_genres_american_frequency.index,
            x=first_genres_american_frequency, label="American", color="b")
sns.barplot(ax=indian_ax, y=first_genres_indian_frequency.index,
            x=first_genres_indian_frequency, label="Indian", color="r")


## Movie languages

In this section, we investigate the differences and similarities between the languages available for Indian and American films.

### Indian

In [None]:
# Copy of the Indian movies with the first and last character (the surrounding {})
# removed from the language field.
indian_languages = indian_movies.assign(
    cleared_movie_languages=indian_movies['movie_languages'].str.slice(1, -1)
)

# Discard the films whose language field is empty once the braces are gone.
indian_languages = indian_languages[indian_languages['cleared_movie_languages'] != '']

indian_languages.head()

In [None]:
# One list of raw language entries per movie (split on ","), missing rows removed
all_languages_listed_indian = (
    indian_languages['cleared_movie_languages'].str.split(pat=",").dropna()
)

# Flatten the per-movie lists into a single list of entries
ls = [entry for entries in all_languages_listed_indian for entry in entries]

# Strip surrounding whitespace and count the occurrences of each entry
all_languages_indian = pd.Series(ls).str.strip().value_counts()

all_languages_indian.head()

In [None]:
# Turn the counts into a two-column table: the former index becomes `languages`,
# the counts become `count` (the rename covers both "index"/0 column labels).
all_languages_indian_code = (
    pd.DataFrame(all_languages_indian.copy())
    .reset_index()
    .rename(columns={"index": "languages", 0: "count"})
)

# Cut out the identifier part of each entry: everything from "/m" up to the last ' "'
all_languages_indian_code['languages'] = [
    re.sub(r'/m.+ "', '', entry) for entry in all_languages_indian_code['languages']
]

# The first 20 rows are the 20 most frequent languages; they are used for plotting
top_20_ind = all_languages_indian_code.head(20)

top_20_ind.head()

In [None]:
# Bar chart of the top 20 languages on a logarithmic count axis
# (39 in total for indian languages)
f, ax = plt.subplots(figsize=(6, 10))
sns.barplot(ax=ax, data=top_20_ind, y='languages', x='count')
ax.set_title('Languages apparition in Indian movies')
ax.set_xscale('log')
sns.despine(left=True, bottom=True)

### American

In [None]:
# Copy of the American movies with the first and last character (the surrounding {})
# removed from the language field.
american_languages = american_movies.assign(
    cleared_movie_languages=american_movies['movie_languages'].str.slice(1, -1)
)

# Discard the films whose language field is empty once the braces are gone.
american_languages = american_languages[american_languages['cleared_movie_languages'] != '']

american_languages.head()

In [None]:
# One list of raw language entries per movie (split on ","), missing rows removed
all_languages_listed_american = (
    american_languages['cleared_movie_languages'].str.split(pat=",").dropna()
)

# Flatten the per-movie lists into a single list of entries
ls = [entry for entries in all_languages_listed_american for entry in entries]

# Strip surrounding whitespace and count the occurrences of each entry
all_languages_american = pd.Series(ls).str.strip().value_counts()

all_languages_american.head()

In [None]:
# Turn the counts into a two-column table: the former index becomes `languages`,
# the counts become `count` (the rename covers both "index"/0 column labels).
all_languages_american_code = (
    pd.DataFrame(all_languages_american.copy())
    .reset_index()
    .rename(columns={"index": "languages", 0: "count"})
)

# Cut out the identifier part of each entry: everything from "/m" up to the last ' "'
all_languages_american_code['languages'] = [
    re.sub(r'/m.+ "', '', entry) for entry in all_languages_american_code['languages']
]

# The first 20 rows are the 20 most frequent languages; they are used for plotting
top_20_american = all_languages_american_code.head(20)

top_20_american.head()

In [None]:
# Bar chart of the top 20 languages on a logarithmic count axis
# (111 in total for American movies)
f, ax = plt.subplots(figsize=(6, 10))
sns.barplot(ax=ax, data=top_20_american, y='languages', x='count')
ax.set_title('Languages apparition in American movies')
ax.set_xscale('log')
sns.despine(left=True, bottom=True)

In [None]:
# Share of each language among all language mentions of the country,
# then keep the first 20 rows (the most frequent languages) of each table.
american_language_total = all_languages_american_code['count'].sum()
all_languages_american_code['frequency'] = all_languages_american_code['count'] / american_language_total
first_languages_american_frequency = all_languages_american_code.iloc[:20]

indian_language_total = all_languages_indian_code['count'].sum()
all_languages_indian_code['frequency'] = all_languages_indian_code['count'] / indian_language_total
first_languages_indian_frequency = all_languages_indian_code.iloc[:20]


In [None]:
# Two panels sharing the x axis: American language frequencies on the left, Indian on the right
fig, ax = plt.subplots(1, 2, figsize=(10, 16), sharex=True, constrained_layout=True)
american_ax, indian_ax = ax

fig.suptitle('Top 20 movie languages frequency in American and Indian movies')
american_ax.set_title('American movies')
indian_ax.set_title('Indian movies')

sns.barplot(ax=american_ax, data=first_languages_american_frequency,
            y='languages', x='frequency', label="American", color="b")
sns.barplot(ax=indian_ax, data=first_languages_indian_frequency,
            y='languages', x='frequency', label="Indian", color="r")

# Logarithmic frequency axis on both panels
for panel in (american_ax, indian_ax):
    panel.set_xscale('log')


#### How many languages are there per movie, on average?

In [None]:
# Each language entry contains one ':' (between its identifier and its name),
# so counting ':' in a row gives that movie's number of languages.
american_languages['n_languages'] = american_languages['cleared_movie_languages'].str.count(':')

american_languages.head()

In [None]:
# Each language entry contains one ':' (between its identifier and its name),
# so counting ':' in a row gives that movie's number of languages.
indian_languages['n_languages'] = indian_languages['cleared_movie_languages'].str.count(':')

indian_languages.head()

In [None]:
# Number of languages per movie for each country. dropna() only guards against
# rows whose count is missing; movies without any language were already removed
# when the language tables were built.
n_indian_lan = indian_languages['n_languages'].dropna()
n_american_lan = american_languages['n_languages'].dropna()

# Compare the mean number of languages per movie between the two countries.
# Welch's t-test (equal_var=False) is used because the two samples have
# different sizes and nothing guarantees equal variances, which the default
# Student's t-test assumes.
t_test_languages = st.ttest_ind(n_indian_lan, n_american_lan, equal_var=False)

print('p-value for languages: ',t_test_languages.pvalue, '\t', 'p-value is smaller than 0.05: ', t_test_languages.pvalue <0.05)

There is a significant difference in the number of languages per movie between American and Indian movies. This could reflect underlying effects, for example Hollywood movies being exported abroad more than Bollywood movies.

### Missing values

We can already see that the movie_box_office_revenue column contains a lot of missing data for both Indian and American movies, followed by the movie_runtime column.

In [None]:
# Print column dtypes and non-null counts of the Indian movies data set
# (used to spot the columns with missing values).
indian_movies.info()

In [None]:
# Print column dtypes and non-null counts of the American movies data set
# (used to spot the columns with missing values).
american_movies.info()

### Cleaning data 

Neither our Indian nor our American movie data set contains duplicates on the Wikipedia movie ID or on the Freebase ID.

In [None]:
# Row counts after de-duplicating on each identifier; compare them with the
# table sizes printed earlier to confirm there are no duplicated IDs.
indian_unique_wiki = len(indian_movies.drop_duplicates('wikipedia_movie_id'))
indian_unique_freebase = len(indian_movies.drop_duplicates('freebase_movie_id'))
american_unique_wiki = len(american_movies.drop_duplicates('wikipedia_movie_id'))
american_unique_freebase = len(american_movies.drop_duplicates('freebase_movie_id'))

print('wiki ID, indian: ', indian_unique_wiki, '\nfreebase ID, indian: ', indian_unique_freebase)
print('wiki ID, american: ', american_unique_wiki, '\nfreebase ID, american: ', american_unique_freebase)

## Characters data

450,668 characters in the raw data

134,079 different actors

5,794 different actors in Indian movies

59,398 different actors in American movies


In [None]:
# Column names for the character table (the TSV is read without a header row).
names = [
    'wikipedia_movie_id',
    'freebase_movie_id',
    'movie_release_date',
    'character_name',
    'actor_dob',
    'actor_gender',
    'actor_height',
    'actor_ethnicity',
    'actor_name',
    'actor_age_at_movie_release',
    'char_act_id',
    'freebase_character_id',
    'freebase_actor_id',
]
characters_data = pd.read_csv(
    data_folder + 'character.metadata.tsv',
    sep='\t',
    names=names,
)

characters_data.head(10)

In [None]:
# Lower-case the character and actor names; non-string cells (missing values)
# are left untouched. Series.map is applied per column instead of
# DataFrame.applymap, which is deprecated in recent pandas releases.
for name_column in ['character_name', 'actor_name']:
    characters_data[name_column] = characters_data[name_column].map(
        lambda value: value if type(value) != str else value.lower()
    )

In [None]:
# Preview the character table after the names were lower-cased.
characters_data.head()

In [None]:
# Character rows whose movie belongs to the American / Indian selection
in_american_movie = characters_data['wikipedia_movie_id'].isin(american_movies['wikipedia_movie_id'])
in_indian_movie = characters_data['wikipedia_movie_id'].isin(indian_movies['wikipedia_movie_id'])

# Keep one row per actor name in each selection
unique_american_actor = characters_data.loc[in_american_movie].drop_duplicates('actor_name')
unique_indian_actor = characters_data.loc[in_indian_movie].drop_duplicates('actor_name')

In [None]:
# Preview the de-duplicated (one row per actor name) American actor table.
unique_american_actor.head()

In [None]:
# All character rows (no de-duplication) for the movies of each country.
# The Indian subset replaces what was an accidental second copy of the American statement.
american_character = characters_data[characters_data['wikipedia_movie_id'].isin(american_movies['wikipedia_movie_id'])]
indian_character = characters_data[characters_data['wikipedia_movie_id'].isin(indian_movies['wikipedia_movie_id'])]

## Name clusters data

In [None]:
# Two-column, tab-separated file read without a header row.
names = ['character_name', 'actor_reference']
name_clusters_data = pd.read_csv(
    data_folder + 'name.clusters.txt',
    sep='\t',
    names=names,
)

name_clusters_data.head(10)

## **Tv tropes clusters data**

In [None]:
# Load the tv tropes clusters: a tab-separated file without a header row,
# giving a character type and its instance description per line.
tv_trope = pd.read_csv(
    data_folder + 'tvtropes.clusters.txt',
    names=['character_type', 'instances'],
    sep='\t',
)

tv_trope.head()

### Formatting data

In [None]:
# The `instances` column holds brace-delimited `"key": "value"` text, i.e. one
# JSON object per row, whose values are (in order) the character name, the
# movie name, the character/actor id and the actor name.
# Parsing it as JSON replaces the earlier split on ',' and ':', which shifted
# the columns for any value containing one of those characters and left every
# value padded with spaces (quotes were replaced by spaces), so names could
# never match the other tables. `tv_trope` itself is no longer modified, which
# makes this cell safe to re-run.
instance_fields = ['character_name', 'movie_name', 'char_act_id', 'actor_name']
parsed_instances = [
    list(json.loads(instance).values())[:len(instance_fields)]
    for instance in tv_trope['instances']
]
cleaned_tv = pd.DataFrame(parsed_instances, columns=instance_fields, index=tv_trope.index)

# Strip stray whitespace and lower-case every field, consistent with the
# lower-casing applied to the movie and character tables.
cleaned_tv = cleaned_tv.apply(lambda column: column.str.strip().str.lower())

characters = tv_trope.character_type

# Join on the index to bring the character type back next to its instance
final_tv = cleaned_tv.join(characters, how='left')

final_tv.head()

In [None]:
# Check whether any movie of the tv tropes data is present in our movie data set.
# The variable is not called `list`, so the builtin stays usable in later cells,
# and only the matching titles are shown instead of a full-length, mostly-NaN Series.
tv_trope_movie_names = final_tv['movie_name']

movies_data.loc[movies_data['movie_name'].isin(tv_trope_movie_names), 'movie_name']

In [None]:
# Check whether any actor of the tv tropes data is present in our character data set.
# Actor names are compared against `characters_data['actor_name']`; comparing
# them against movie titles, as before, could not find actors.
tv_trope_actor_names = final_tv['actor_name']

characters_data.loc[characters_data['actor_name'].isin(tv_trope_actor_names), 'actor_name']

It seems that the data set is too small and does not contain information on any of the films or actors found in the movie and character data...

## Summaries data

In [None]:
# Read every record as a single `summaries` field by using '\r' as separator
# (presumably so that pandas does not split the text itself; confirm against the raw file).
summaries = pd.read_csv(data_folder + 'plot_summaries.txt', sep = '\r', names = ['summaries'])

# Split on the first tab only: the part before it is the movie id, everything
# after it is the summary. Limiting the split to one tab keeps a summary that
# itself contains a tab from being cut short and spread over extra columns.
summaries = summaries['summaries'].str.split("\t", n = 1, expand = True)
summaries = summaries.rename(columns= {0:'wikipedia_movie_id',1: 'summaries'})

# Lower-case the text and make the id an integer so it matches the movie tables
summaries['summaries'] = summaries['summaries'].str.lower()
summaries['wikipedia_movie_id'] = summaries['wikipedia_movie_id'].astype(int)
summaries.head(10)

In [None]:
# Summaries whose movie id belongs to the Indian movie selection
is_indian_summary = summaries['wikipedia_movie_id'].isin(indian_movies['wikipedia_movie_id'])
indian_summaries = summaries.loc[is_indian_summary]
indian_summaries

In [None]:
# Summaries whose movie id belongs to the American movie selection
is_american_summary = summaries['wikipedia_movie_id'].isin(american_movies['wikipedia_movie_id'])
american_summaries = summaries.loc[is_american_summary]
american_summaries

In [None]:
# Split every Indian summary on whitespace: one list of words per summary
indian_separated_words = indian_summaries['summaries'].str.split()

# Flatten the per-summary lists into a single list of words
ls = [word for words in indian_separated_words for word in words]

# Keep the words longer than 4 characters and count their occurrences
indian_separated_words = pd.Series(ls)
is_long_word = indian_separated_words.str.len() > 4
indian_separated_words = indian_separated_words[is_long_word].value_counts()

In [None]:
# Bar chart of the 100 most frequent long words in the Indian summaries
indian_separated_words = indian_separated_words.iloc[:100]
f, ax = plt.subplots(figsize=(6, 20))
sns.barplot(ax=ax, y=indian_separated_words.index, x=indian_separated_words)

In [None]:
# Split every American summary on whitespace: one list of words per summary
american_separated_words = american_summaries['summaries'].str.split()

# Flatten the per-summary lists into a single list of words
ls = [word for words in american_separated_words for word in words]

# Keep the words longer than 4 characters and count their occurrences
american_separated_words = pd.Series(ls)
is_long_word = american_separated_words.str.len() > 4
american_separated_words = american_separated_words[is_long_word].value_counts()

In [None]:
# Bar chart of the 100 most frequent long words in the American summaries
american_separated_words = american_separated_words.iloc[:100]
f, ax = plt.subplots(figsize=(6, 20))
sns.barplot(ax=ax, y=american_separated_words.index, x=american_separated_words)