In [None]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt

## Import processed data

In [None]:
folder_processed_data_path = './data/processed_data/'


def _load_processed_table(table_name, index_columns=None):
    """Read `<table_name>.csv` from the processed-data folder.

    When `index_columns` is given, those columns become the frame's index.
    The uniqueness of the resulting index is printed (not enforced).

    Returns the loaded DataFrame.
    """
    csv_path = os.path.join(folder_processed_data_path, f'{table_name}.csv')
    table = pd.read_csv(csv_path)
    if index_columns is not None:
        table = table.set_index(index_columns)
    # Verify the indexes are unique
    print(f"Is the indexing unique in {table_name} ? {table.index.is_unique}")
    return table


movie_df = _load_processed_table('movie_df', ['wiki_ID'])
character_df = _load_processed_table('character_df', ['wiki_ID', 'char_name'])
# NOTE(review): no index columns are set for this table, so the uniqueness
# check above only inspects the default row index produced by read_csv.
name_by_movie_df = _load_processed_table('name_by_movie_df')
baby_name_df = _load_processed_table('baby_name_df', ['name', 'year'])
rating_df = _load_processed_table('rating_df', ['tconst'])

# Preview the first rows of every table, each preceded by its name.
loaded_tables = (
    ('movie_df', movie_df),
    ('character_df', character_df),
    ('name_by_movie_df', name_by_movie_df),
    ('baby_name_df', baby_name_df),
    ('rating_df', rating_df),
)
for caption, table in loaded_tables:
    print(f"{caption} :")
    display(table.head())

## Visualisation for a specific name

Let's choose a name and visualize how its popularity as a baby name varies over time. <br>
To do so, we first find all the movies in which a character has the chosen name.

In [None]:
chosen_name = "Mia"

# Keep only the rows of name_by_movie_df whose `char_words` value equals the
# chosen name; that column is then constant, so it is dropped.
# drop=True discards the old row labels instead of turning them into a stray
# `index` column that would be shown here and carried into any later merge.
filt_name_by_movie_df = (
    name_by_movie_df
    .query("char_words == @chosen_name")
    .reset_index(drop=True)
    .drop(columns='char_words')
)
display(filt_name_by_movie_df.head())
print(f"There are {len(filt_name_by_movie_df)} movies with a character named {chosen_name}.")

In [None]:
# Inner join on `wiki_ID`: keeps only the movies that appear in the
# name-filtered table. `merge` already returns a new frame, so no extra copy
# is needed.
filt_movie_df = movie_df.merge(
    filt_name_by_movie_df,
    how='inner',
    on='wiki_ID',
)
display(filt_movie_df.head())

There are too many movies. Let's keep only the 10 with the largest number of ratings.

In [None]:
# Rank the matching movies from the highest to the lowest `numVotes`.
# `sort_values` returns a new frame, so the unsorted input is left untouched.
sorted_filt_movie_df = filt_movie_df.sort_values('numVotes', ascending=False)

# Keep the ten top-ranked rows; the copy detaches the selection from the
# sorted frame.
filt_movie_df = sorted_filt_movie_df.iloc[:10].copy()
display(filt_movie_df)

In [None]:
# Plot how the share of births given the chosen name evolves over the years,
# and mark the release of each selected movie with a vertical line.

# Selecting one name from the (name, year) index leaves a frame keyed by year.
display(baby_name_df.loc[chosen_name].sort_values(by=['year']))

# Year / percentage pairs for the chosen name, in chronological order.
percentage_df = (
    baby_name_df.loc[chosen_name]['percentage']
    .to_frame()
    .reset_index()
    .sort_values(by=['year'], ascending=True)
)

x_values = percentage_df['year'].values
y_values = percentage_df['percentage'].values

# Create a line plot on an explicit figure/axes pair
fig, ax = plt.subplots()
ax.plot(x_values, y_values, label=f'Babies named {chosen_name}')
ax.set_xlabel('Years')
ax.set_ylabel('Percentage of total births')
ax.set_title(f'Given name {chosen_name} with ten most reviewed \n movies with a character of the same name')

# Draw one vertical line per distinct movie release; missing release values
# are skipped because they cannot be placed on the axis.
vertical_lines_series = filt_movie_df['release'].dropna().unique()
for line_number, x_value in enumerate(vertical_lines_series):
    # Only the first line gets a legend entry; labels starting with an
    # underscore are ignored by the legend, which avoids repeated entries.
    line_label = 'Movie release' if line_number == 0 else '_nolegend_'
    ax.axvline(x=x_value, color='r', linestyle='--', label=line_label)

# Without this call the labels set above would never be displayed.
ax.legend()

plt.show()