In [None]:
import pandas as pd
import matplotlib.pyplot as plt

## Movies Metadata

### **Data description**

| Column name | Description |
|---|---|
| wikipedia_movie_id | ID of the movie on Wikipedia |
| freebase_movie_id | ID of the movie on Freebase |
| movie_name | Name of the movie |
| movie_release_date | Date the movie was released |
| movie_box_office_revenue | Box office revenue of the movie |
| movie_runtime | Runtime of the movie |
| movie_languages | Languages of the movie |
| movie_countries | Countries where the movie was produced |
| movie_genres | Genres of the movie |

The movie data set contains 81741 rows.



In [None]:
# Folder that holds the raw dataset files read by this notebook.
data_folder = './data/'

# Column names are passed explicitly to read_csv, so the first line of the
# file is read as data rather than as a header.
names = [
    'wikipedia_movie_id', 'freebase_movie_id', 'movie_name', 'movie_release_date',
    'movie_box_office_revenue', 'movie_runtime', 'movie_languages',
    'movie_countries', 'movie_genres',
]

movies_data = pd.read_csv(data_folder + 'movie.metadata.tsv', names=names, sep='\t')

# Lower-case the text columns so later string comparisons ignore case.
# Series.map is used column by column instead of the deprecated
# DataFrame.applymap, and non-string cells (e.g. NaN) are passed through
# unchanged instead of raising TypeError inside str.lower.
movie_text_columns = ['movie_name', 'movie_languages', 'movie_countries', 'movie_genres']
movies_data[movie_text_columns] = movies_data[movie_text_columns].apply(
    lambda column: column.map(lambda value: value.lower() if isinstance(value, str) else value)
)

movies_data.head()

In [None]:
# Select movies whose (lower-cased) country field is exactly one country.
# The comparison is plain string equality on the serialized field, so a movie
# listing any additional country is not selected.
is_indian = movies_data['movie_countries'].eq('{"/m/03rk0": "india"}')
is_american = movies_data['movie_countries'].eq('{"/m/09c7w0": "united states of america"}')

indian_movies = movies_data.loc[is_indian]
american_movies = movies_data.loc[is_american]

# Row counts of the two subsets.
print(len(indian_movies), len(american_movies))

In [None]:
# Preview the first rows of the Indian subset.
indian_movies.head()

### Missing values

We can already see that the `movie_box_office_revenue` column has the most missing values in both the Indian and the American movies, followed by `movie_runtime`.

In [None]:
# Per-column dtypes and non-null counts for the Indian subset.
indian_movies.info()

### Cleaning data 

Neither the Indian nor the American movie subset contains duplicates on the Wikipedia movie ID or the Freebase movie ID.

In [None]:
# For each subset, print how many rows remain after de-duplicating on each ID
# column; compare with the subset sizes to see whether any ID repeats.
for label, subset in (('indian', indian_movies), ('american', american_movies)):
    wiki_rows = len(subset.drop_duplicates('wikipedia_movie_id'))
    freebase_rows = len(subset.drop_duplicates('freebase_movie_id'))
    print(f'wiki ID, {label}: ', wiki_rows, f'\nfreebase ID, {label}: ', freebase_rows)

## Characters data

450668 characters in the raw data

134079 different actors

5794 different actors in Indian movies

59398 different actors in American movies


In [None]:
# Column names for the character metadata file, in file order.
names = [
    'wikipedia_movie_id',
    'freebase_movie_id',
    'movie_release_date',
    'character_name',
    'actor_dob',
    'actor_gender',
    'actor_height',
    'actor_ethnicity',
    'actor_name',
    'actor_age_at_movie_release',
    'char_act_id',
    'freebase_character_id',
    'freebase_actor_id',
]

# header=None is what read_csv already does when `names` is given; it is
# spelled out here to make clear that the first line is data.
characters_data = pd.read_csv(
    data_folder + 'character.metadata.tsv',
    sep='\t',
    header=None,
    names=names,
)

characters_data.head(n=10)

In [None]:
# Lower-case character and actor names so they can be compared with other
# lower-cased name columns. Non-string cells (missing names) are left as they
# are. Series.map is applied per column because DataFrame.applymap is
# deprecated in recent pandas releases.
characters_data[['character_name', 'actor_name']] = characters_data[['character_name', 'actor_name']].apply(
    lambda column: column.map(lambda value: value.lower() if isinstance(value, str) else value)
)

In [None]:
# Preview the character table after lower-casing the name columns.
characters_data.head()

In [None]:
# Character rows that belong to each country's movies (matched on the
# Wikipedia movie ID), reduced to the first row seen for every actor name.
in_american_movie = characters_data['wikipedia_movie_id'].isin(american_movies['wikipedia_movie_id'])
in_indian_movie = characters_data['wikipedia_movie_id'].isin(indian_movies['wikipedia_movie_id'])

unique_american_actor = characters_data.loc[in_american_movie].drop_duplicates(subset='actor_name')
unique_indian_actor = characters_data.loc[in_indian_movie].drop_duplicates(subset='actor_name')

In [None]:
# Preview the de-duplicated American actor rows.
unique_american_actor.head()

In [None]:
# All character rows (not de-duplicated) for each country's movies, matched on
# the Wikipedia movie ID. The original cell assigned american_character twice
# and never built the Indian counterpart.
american_character = characters_data[characters_data['wikipedia_movie_id'].isin(american_movies['wikipedia_movie_id'])]
indian_character = characters_data[characters_data['wikipedia_movie_id'].isin(indian_movies['wikipedia_movie_id'])]

## Name clusters data

In [None]:
# Two tab-separated columns; names are supplied because the first line of the
# file is read as data (header=None is the implied default when names is set).
names = ['character_name', 'actor_reference']
name_clusters_data = pd.read_csv(
    data_folder + 'name.clusters.txt',
    sep='\t',
    header=None,
    names=names,
)

name_clusters_data.head(n=10)

## **Tvtropes clusters data**

In [None]:
# One row per trope instance: the character type and the raw instance string.
tvt_rope = pd.read_csv(
    data_folder + 'tvtropes.clusters.txt',
    names=['character_type', 'instances'],
    header=None,
    sep='\t',
)

print(len(tvt_rope))
tvt_rope

### Formatting data

In [None]:
# Strip the braces and double quotes from the raw instance strings.
# regex=False makes the replacement literal on every pandas version (the
# default for `regex` changed between pandas 1.x and 2.0).
tvt_rope['instances'] = (
    tvt_rope['instances']
    .str.replace('{', '', regex=False)
    .str.replace('}', '', regex=False)
    .str.replace('"', '', regex=False)
)

# Pull the four fields out with one anchored pattern instead of splitting on
# every ',' or ':'. Splitting misaligned every row whose value itself contains
# a comma or colon (e.g. a movie title with a subtitle) and left a leading
# space on each value, so later matches on names could not succeed.
# NOTE(review): assumes each instance lists the keys char, movie, id, actor in
# this order — confirm against tvtropes.clusters.txt.
instance_pattern = (
    r'^\s*char:\s*(?P<character_name>.*?)\s*,'
    r'\s*movie:\s*(?P<movie_name>.*?)\s*,'
    r'\s*id:\s*(?P<char_act_id>.*?)\s*,'
    r'\s*actor:\s*(?P<actor_name>.*?)\s*$'
)
cleaned_tvt = tvt_rope['instances'].str.extract(instance_pattern)

# Rows that do not match the pattern come back as all-NaN; fail loudly rather
# than carry silently broken rows forward.
assert cleaned_tvt.notna().all().all(), 'some tvtropes instances did not match the expected format'

# Attach the character type of each instance (index-aligned join).
characters = tvt_rope.character_type

final_tvt = cleaned_tvt.join(characters, how='left')

# Case-fold the text columns so they can be compared with the other name
# columns. Non-string cells are left untouched; Series.map is applied per
# column because DataFrame.applymap is deprecated in recent pandas releases.
tvt_text_columns = ['character_name', 'movie_name', 'actor_name', 'character_type']
final_tvt[tvt_text_columns] = final_tvt[tvt_text_columns].apply(
    lambda column: column.map(lambda value: value.casefold() if isinstance(value, str) else value)
)

final_tvt.head()

In [None]:
# Names of the actors who appear in American movies. Missing names are dropped
# instead of being turned into the literal string 'nan' by astype('str').
american_actors = unique_american_actor['actor_name'].dropna().astype('str')

# Keep the trope instances whose actor is one of those names (inner merge on
# actor_name). Rows without an actor name are removed on a copy returned by
# dropna: the original `final_tvt.actor_name = final_tvt.actor_name.dropna()`
# did nothing, because the assignment realigns on the index and restores the
# missing entries.
american_tvt = final_tvt.dropna(subset=['actor_name']).merge(american_actors, on='actor_name')

american_tvt

In [None]:
# Trope instances whose actor name appears among the American actors; unlike
# a merge, this is a plain row filter and adds no columns.
american_tvt2 = final_tvt.loc[
    lambda frame: frame['actor_name'].isin(unique_american_actor['actor_name'])
]

american_tvt2

In [None]:
# Trope instances whose movie name appears among the American movies.
# This reassigns american_tvt2, replacing any value it held before.
american_tvt2 = final_tvt.loc[
    lambda frame: frame['movie_name'].isin(american_movies['movie_name'])
]

american_tvt2