In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
raw_folder = './raw_data/'
imdb_rating_folder = raw_folder + 'imdb_rating/'
cmu_folder = raw_folder + 'cmu/'
baby_names_folder = raw_folder + 'baby_names_national/'

## Data importation

### CMU data

Character metadata :

In [3]:
# Import character metadata
character_df = pd.read_csv(cmu_folder + 'character.metadata.tsv', sep='\t', header=None)

# Add column names deduced from README
character_df.columns = ['wiki_ID', 'free_ID', 'release', 'char_name', 'DOB', 'gender', 'height', 'ethnicity', 'act_name', 'age_at_release', 'free_char_map1', 'free_char_map2', 'free_char_map3']

display(character_df.head(2))

Unnamed: 0,wiki_ID,free_ID,release,char_name,DOB,gender,height,ethnicity,act_name,age_at_release,free_char_map1,free_char_map2,free_char_map3
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4


Movie metaData

In [4]:
# Import movie metadata
movie_df = pd.read_csv(cmu_folder + 'movie.metadata.tsv', sep='\t', header=None)

# Add column names deduced from README
movie_df.columns = ['wiki_ID', 'free_ID', 'mov_name', 'release', 'revenue', 'runtime', 'languages', 'countries', 'genres']
display(movie_df.head(2))
print(f"Is the wiki_ID unique ? {movie_df.wiki_ID.is_unique}")

Unnamed: 0,wiki_ID,free_ID,mov_name,release,revenue,runtime,languages,countries,genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."


Is the wiki_ID unique ? True


### IMDB data

In [5]:
# Import imdb data
rating_df = pd.read_csv(imdb_rating_folder + 'imdb_rating.tsv', sep='\t', index_col='tconst')
display(rating_df.head(2))

Unnamed: 0_level_0,averageRating,numVotes
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0000001,5.7,2004
tt0000002,5.8,269


In [6]:
# Verify the indexes are unique
print(f"Is the indexing unique ? {rating_df.index.is_unique}")

Is the indexing unique ? True


### Given name data

In [7]:
# import name dataset

# Define the path to your dataset folder
folder_path = 'raw_data/baby_names_national/'

# Create an empty list to store individual DataFrames
data_frames = []

# Iterate through each file in the folder
for filename in os.listdir(folder_path):
    if filename.startswith('yob') and filename.endswith('.txt'):
        # Extract the year from the filename
        year = int(filename[3:-4])

        # Read the data from the file into a DataFrame
        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path, header=None, names=['name', 'gender', 'number'])

        # Add the 'year' column to the DataFrame
        df['year'] = year

        # Append the current DataFrame to the list
        data_frames.append(df)

# Concatenate all DataFrames in the list into one DataFrame
baby_name_df = pd.concat(data_frames, ignore_index=True)

# Set the 'name' column as the index
# combined_data.set_index('name', inplace=True)

# Display the resulting DataFrame
baby_name_df.head(2)

Unnamed: 0,name,gender,number,year
0,Mary,F,7065,1880
1,Anna,F,2604,1880


## Movie metadata exploration

In [8]:
movie_df.head(2)

Unnamed: 0,wiki_ID,free_ID,mov_name,release,revenue,runtime,languages,countries,genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."


In [9]:
lengths_of_strings = movie_df['release'].apply(lambda x: len(str(x)))
counts_lengths = lengths_of_strings.value_counts()
print(f"In the release column of movie_df the string have the following lengths with their frequency : \n\n{counts_lengths}")

In the release column of movie_df the string have the following lengths with their frequency : 

release
10    39373
4     32172
3      6902
7      3294
Name: count, dtype: int64


In [10]:
length3_test = movie_df[movie_df['release'].apply(lambda x: len(str(x)) == 3)].iloc[0]['release']
print(type(length3_test))

<class 'float'>


In [11]:
movie_df.dtypes

wiki_ID        int64
free_ID       object
mov_name      object
release       object
revenue      float64
runtime      float64
languages     object
countries     object
genres        object
dtype: object

lets check the percentage of missing  values in the revenue attribut of the movie dataframe

In [12]:
nb_revenue_missing = movie_df['revenue'].isna().sum()
total_movies = len(movie_df)
perc_missing = (nb_revenue_missing / total_movies)*100
print(f"Percentage of missing values in column 'revenue': {perc_missing:.2f}%")

Percentage of missing values in column 'revenue': 89.72%


Lets check the values of the genre attribute

In [13]:
# Check for NaN values in the specified column
is_there_nan = movie_df['genres'].isna().any()
print("There are NaN values : ", is_there_nan)

There are NaN values :  False


## Baby name data preprocessing

Combine the rows with the same name and same year in order to ignore the gender in the dataset

In [14]:
baby_name_filtered_df = baby_name_df.groupby(['name', 'year'])['number'].sum().to_frame()

display(baby_name_filtered_df.head())

Unnamed: 0_level_0,Unnamed: 1_level_0,number
name,year,Unnamed: 2_level_1
Aaban,2007,5
Aaban,2009,6
Aaban,2010,9
Aaban,2011,11
Aaban,2012,11


For each datapoint, compute the percentage of the total births in the year

In [15]:
# group the baby name dataframe by year to get the number of birth per year.
birth_per_year_df = baby_name_filtered_df.groupby('year')['number'].sum().to_frame()
birth_per_year_df.reset_index(inplace=True)

birth_per_year_df = birth_per_year_df.rename(columns={'number': 'total_number'})

print("birth_per_year_df:")
display(birth_per_year_df.head())


# Merge dataframes
merged_df = pd.merge(baby_name_filtered_df.reset_index(), birth_per_year_df, on='year')

# Calculate the percentage and add it as a new column to dataframe
merged_df['percentage'] = (merged_df['number'] / merged_df['total_number']) * 100

print("merged_df:")
display(merged_df.head())

baby_name_with_percentage_df = merged_df.drop('total_number', axis=1)
baby_name_with_percentage_df.set_index(['name', 'year'], inplace=True)

print("baby_name_with_percentage_df:")
display(baby_name_with_percentage_df.head())


birth_per_year_df:


Unnamed: 0,year,total_number
0,1880,201484
1,1881,192690
2,1882,221533
3,1883,216945
4,1884,243461


merged_df:


Unnamed: 0,name,year,number,total_number,percentage
0,Aaban,2007,5,3996648,0.000125
1,Aadam,2007,8,3996648,0.0002
2,Aadan,2007,8,3996648,0.0002
3,Aadarsh,2007,13,3996648,0.000325
4,Aaden,2007,157,3996648,0.003928


baby_name_with_percentage_df:


Unnamed: 0_level_0,Unnamed: 1_level_0,number,percentage
name,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Aaban,2007,5,0.000125
Aadam,2007,8,0.0002
Aadan,2007,8,0.0002
Aadarsh,2007,13,0.000325
Aaden,2007,157,0.003928


## Character metadata preprocessing

The character dataframe has not a unique index. It is due to the several Nan values present in the same movie. To tackles this we can drop the rows that have a NaN as name and see if it solve the issue

In [16]:
character_filtered = character_df.copy(deep=True)

# drop the rows with a NaN as the character name
character_filtered = character_filtered.dropna(subset=['char_name'])
# drop the rows with the same character name
character_filtered = character_filtered.drop_duplicates(subset=['wiki_ID','char_name'])
character_filtered = character_filtered.set_index(['wiki_ID','char_name'])

display(character_filtered.head())

Unnamed: 0_level_0,Unnamed: 1_level_0,free_ID,release,DOB,gender,height,ethnicity,act_name,age_at_release,free_char_map1,free_char_map2,free_char_map3
wiki_ID,char_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
975900,Akooshay,/m/03vyhn,2001-08-24,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
975900,Lieutenant Melanie Ballard,/m/03vyhn,2001-08-24,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
975900,Desolation Williams,/m/03vyhn,2001-08-24,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
975900,Sgt Jericho Butler,/m/03vyhn,2001-08-24,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
975900,Bashira Kincaid,/m/03vyhn,2001-08-24,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg


We drop all the columns related to FreeBase as we won't be using them

In [58]:
columns_to_drop = ['free_ID', 'release', 'ethnicity', 'free_char_map1', 'free_char_map2', 'free_char_map3']
columns_to_drop_existing = [col for col in columns_to_drop if col in character_filtered.columns]

character_filtered.drop(columns_to_drop_existing, axis=1, inplace=True)

display(character_filtered.head())

# Verify the indexes are unique
print(f"Is the indexing unique ? {character_filtered.index.is_unique}")

Unnamed: 0_level_0,Unnamed: 1_level_0,DOB,gender,height,act_name,age_at_release
wiki_ID,char_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
975900,Akooshay,1958-08-26,F,1.62,Wanda De Jesus,42.0
975900,Lieutenant Melanie Ballard,1974-08-15,F,1.78,Natasha Henstridge,27.0
975900,Desolation Williams,1969-06-15,M,1.727,Ice Cube,32.0
975900,Sgt Jericho Butler,1967-09-12,M,1.75,Jason Statham,33.0
975900,Bashira Kincaid,1977-09-25,F,1.65,Clea DuVall,23.0


Is the indexing unique ? True


## New dataframe character name

lets create a new dataframe containing the character names with their respective movie.

In [59]:
name_df = character_filtered.reset_index()[['wiki_ID', 'char_name']].copy(deep=True)
display(name_df.head())

Unnamed: 0,wiki_ID,char_name
0,975900,Akooshay
1,975900,Lieutenant Melanie Ballard
2,975900,Desolation Williams
3,975900,Sgt Jericho Butler
4,975900,Bashira Kincaid


Lets split the character names and explode it

In [60]:
# Split the character names into words and explode the lists
exploded_name_df = name_df.assign(char_words=name_df['char_name'].str.split()).explode('char_words')
word_name_df = exploded_name_df.drop(columns=['char_name'])

word_name_df.head()

Unnamed: 0,wiki_ID,char_words
0,975900,Akooshay
1,975900,Lieutenant
1,975900,Melanie
1,975900,Ballard
2,975900,Desolation


### Filter the names to keep only the ones available in the baby name dataset

In [61]:
# does the word_name_df has duplicates?
duplicates = word_name_df.duplicated()
print(f"number of duplicates in word_name_df = {duplicates.sum()}")
word_name_df = word_name_df.drop_duplicates()
duplicates_after = word_name_df.duplicated()
print(f"number of duplicates in word_name_df = {duplicates_after.sum()}")

number of duplicates in word_name_df = 28595
number of duplicates in word_name_df = 0


In [62]:
baby_name_only_df = baby_name_df[['name']].copy(deep=True)

# does the baby_name_only_df has duplicates?
duplicates = baby_name_only_df.duplicated()
print(f"number of duplicates in baby_name_only_df = {duplicates.sum()}")
# drop these duplicates
baby_name_only_df = baby_name_only_df.drop_duplicates()
duplicates = baby_name_only_df.duplicated()
print(f"number of duplicates in baby_name_only_df = {duplicates.sum()}")

print("\nbaby_name_only_df :")
display(baby_name_only_df.head())

number of duplicates in baby_name_only_df = 1982709
number of duplicates in baby_name_only_df = 0

baby_name_only_df :


Unnamed: 0,name
0,Mary
1,Anna
2,Emma
3,Elizabeth
4,Minnie


In [63]:
# Use pd.merge to filter rows based on 'names' column
word_name_filtered_df = pd.merge(word_name_df.reset_index(), baby_name_only_df, left_on='char_words', right_on='name', how='inner')
# word_name_filtered_df = word_name_filtered_df.drop_duplicates()
# word_name_filtered_df.set_index(['wiki_ID', 'char_words'], inplace=True)
word_name_filtered_df.drop(columns=['index', 'name'], inplace=True)
word_name_filtered_df.set_index(['wiki_ID', 'char_words'], inplace=True)
print("word_name_filtered_df :")
display(word_name_filtered_df.head())

# Verify the indexes are unique
print(f"Is the indexing unique ? {word_name_filtered_df.index.is_unique}")

word_name_filtered_df :


wiki_ID,char_words
975900,Lieutenant
7668793,Lieutenant
24226493,Lieutenant
3388805,Lieutenant
8231713,Lieutenant


Is the indexing unique ? True


In [64]:
# check for the first movie of the CMU dataset, seems ok
word_name_filtered_df.loc[975900]

Lieutenant
Melanie
Ballard
Williams
Jericho
Butler
Bashira
Kincaid
Helena
Braddock
Daddy


The name Lieutenant is still in the filtered dataframe. Let's check if this name is in the baby name dataset

In [65]:
name_to_search = 'Lieutenant'

# Check if the name is present
is_name_present = name_to_search in baby_name_only_df['name'].values

if is_name_present:
    print(f"{name_to_search} is present in the DataFrame.")
else:
    print(f"{name_to_search} is not present in the DataFrame.")

Lieutenant is present in the DataFrame.


## Movie metadata preprocessing

In [66]:
movie_df.head()

Unnamed: 0,wiki_ID,free_ID,mov_name,release,revenue,runtime,languages,countries,genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


### Release date column

Lets compute the missing values and remove the NaN value in the release column

In [67]:
nb_release_missing = movie_df['release'].isna().sum()
total_movies = len(movie_df)
perc_missing = (nb_release_missing / total_movies)*100
print(f"Percentage of missing values in column 'release': {perc_missing:.2f}%")

Percentage of missing values in column 'release': 0.00%


In [68]:
print(f"Before removing NaN : len(movie_df) = {len(movie_df)}")
movie_df.dropna(subset=['release'], inplace=True)
print(f"After removing NaN : len(movie_df) = {len(movie_df)}")

Before removing NaN : len(movie_df) = 74839
After removing NaN : len(movie_df) = 74839


Let's see what are the format present in the release date column

In [69]:
counts_lengths = movie_df['release'].apply(lambda x: len(str(x))).value_counts()
print(f"In the release column of movie_df the string have the following lengths with their frequency : \n\n{counts_lengths}")

In the release column of movie_df the string have the following lengths with their frequency : 

release
10    39373
4     32172
7      3294
Name: count, dtype: int64


In [70]:
length4_test = movie_df[movie_df['release'].apply(lambda x: len(str(x)) == 4)].iloc[0]['release']
print(f"example of a value of length 4: {length4_test}")
length7_test = movie_df[movie_df['release'].apply(lambda x: len(str(x)) == 7)].iloc[0]['release']
print(f"example of a value of length 7: {length7_test}")
length10_test = movie_df[movie_df['release'].apply(lambda x: len(str(x)) == 10)].iloc[0]['release']
print(f"example of a value of length 10: {length10_test}")

example of a value of length 4: 1988
example of a value of length 7: 2003-10
example of a value of length 10: 2001-08-24


We should convert the release date to keep only the year as we have only the year in the baby name dataset

In [71]:
movie_datetime_df = movie_df.copy(deep=True)
print(f"Before conversion : type of the column release in movie_df is : {type(movie_datetime_df['release'].iloc[0])}")

# movie_datetime_df['release'] = movie_df['release'].apply(convert_to_datetime)
movie_datetime_df['release'] = movie_df['release'].apply(lambda x: str(x)[:4]).astype(np.int64)

filtered_movie_datetime_df = movie_datetime_df.dropna(subset=['release']).copy(deep=True)
filtered_movie_datetime_df = filtered_movie_datetime_df[filtered_movie_datetime_df['release'] >= 1800]
print(f"Number of rows dropped due to NaN value as release attribut : {len(movie_datetime_df) - len(filtered_movie_datetime_df)}")

print(f"After conversion : type of the column release in movie_df is : {type(filtered_movie_datetime_df['release'].iloc[0])}")

Before conversion : type of the column release in movie_df is : <class 'str'>
Number of rows dropped due to NaN value as release attribut : 1
After conversion : type of the column release in movie_df is : <class 'numpy.int64'>


Check for NaT values

In [72]:
nb_nat_values = filtered_movie_datetime_df['release'].isna().sum()
print(nb_nat_values)

0


In [73]:
filtered_movie_datetime_df.head()

Unnamed: 0,wiki_ID,free_ID,mov_name,release,revenue,runtime,languages,countries,genres
0,975900,/m/03vyhn,Ghosts of Mars,2001,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


In [76]:
del movie_df
movie_df = filtered_movie_datetime_df.copy(deep=True)

### Add rating to movie_df

In [77]:
folder_processed_data_path = './processed_data/'

In [78]:
# Import idmb with wikidata index mapping
movies_to_imdb_id_df = pd.read_csv(folder_processed_data_path + 'movies_to_imdb_id.csv')
display(movies_to_imdb_id_df.head())

Unnamed: 0,wikipedia_ID,IMDB_ID
0,77856,tt0058331
1,156558,tt0255819
2,171005,tt0097499
3,175024,tt0020823
4,175026,tt0021335


Let's check if we have the rating of all the movies. To do so we carry out a left merge and count the number of missing values

In [79]:
left_merged_imdb_movie_df = pd.merge(movie_df.reset_index(), movies_to_imdb_id_df, left_on='wiki_ID', right_on='wikipedia_ID', how='left').copy(deep=True)
display(left_merged_imdb_movie_df.head(2))
print(f"length of the dataframe : {len(left_merged_imdb_movie_df)}")

Unnamed: 0,index,wiki_ID,free_ID,mov_name,release,revenue,runtime,languages,countries,genres,wikipedia_ID,IMDB_ID
0,0,975900,/m/03vyhn,Ghosts of Mars,2001,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",975900.0,tt0228333
1,1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp...",3196793.0,tt0245916


length of the dataframe : 74838


In [80]:
nb_rating_missing = left_merged_imdb_movie_df['IMDB_ID'].isna().sum()
total_movies = len(left_merged_imdb_movie_df)
perc_missing = (nb_rating_missing / total_movies)*100
print(f"Percentage of missing values in column 'averageRating': {perc_missing:.2f}%")

Percentage of missing values in column 'averageRating': 5.05%


We miss about 5% of the IMDB index. Therefore, the number of movie is reduced of 5% as well. Let's make the inner merge to keep only the movie for which we know the rating

In [81]:
merged_imdb_movie_df = pd.merge(movie_df.reset_index(), movies_to_imdb_id_df, left_on='wiki_ID', right_on='wikipedia_ID', how='inner').copy(deep=True)
display(merged_imdb_movie_df.head(2))
print(f"length of the dataframe : {len(merged_imdb_movie_df)}")

Unnamed: 0,index,wiki_ID,free_ID,mov_name,release,revenue,runtime,languages,countries,genres,wikipedia_ID,IMDB_ID
0,0,975900,/m/03vyhn,Ghosts of Mars,2001,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",975900,tt0228333
1,1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp...",3196793,tt0245916


length of the dataframe : 71062


In [82]:
merged_rating_movie_df = pd.merge(merged_imdb_movie_df, rating_df, left_on='IMDB_ID', right_on='tconst', how='inner').copy(deep=True)
display(merged_rating_movie_df.head(2))
print(f"length of the dataframe : {len(merged_rating_movie_df)}")

Unnamed: 0,index,wiki_ID,free_ID,mov_name,release,revenue,runtime,languages,countries,genres,wikipedia_ID,IMDB_ID,averageRating,numVotes
0,0,975900,/m/03vyhn,Ghosts of Mars,2001,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",975900,tt0228333,4.9,56880
1,1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp...",3196793,tt0245916,6.0,69


length of the dataframe : 65501


Lets remove the columns that we don't need

In [83]:
movie_rating_df = merged_rating_movie_df.drop(['wikipedia_ID', 'IMDB_ID'], axis=1).copy(deep=True)
display(movie_rating_df.head(2))
print(f"length of the dataframe : {len(movie_rating_df)}")

Unnamed: 0,index,wiki_ID,free_ID,mov_name,release,revenue,runtime,languages,countries,genres,averageRating,numVotes
0,0,975900,/m/03vyhn,Ghosts of Mars,2001,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",4.9,56880
1,1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp...",6.0,69


length of the dataframe : 65501


### Column genre

In the genre column we have dictionaries. Let's keep only the name of the genre

In [84]:
# Get unique values (excluding NaN)
unique_values = movie_df['genres'].unique()

# Print the unique values
print("Unique values:", unique_values)

Unique values: ['{"/m/01jfsb": "Thriller", "/m/06n90": "Science Fiction", "/m/03npn": "Horror", "/m/03k9fj": "Adventure", "/m/0fdjb": "Supernatural", "/m/02kdv5l": "Action", "/m/09zvmj": "Space western"}'
 '{"/m/02n4kr": "Mystery", "/m/03bxz7": "Biographical film", "/m/07s9rl0": "Drama", "/m/0hj3n01": "Crime Drama"}'
 '{"/m/0lsxr": "Crime Fiction", "/m/07s9rl0": "Drama"}' ...
 '{"/m/0lsxr": "Crime Fiction", "/m/01jfsb": "Thriller", "/m/01z4y": "Comedy", "/m/0fdjb": "Supernatural"}'
 '{"/m/01z02hx": "Sports", "/m/0lsxr": "Crime Fiction", "/m/02kdv5l": "Action", "/m/07s9rl0": "Drama"}'
 '{"/m/06n90": "Science Fiction", "/m/0gw5n2f": "Japanese Movies", "/m/03k9fj": "Adventure", "/m/0hcr": "Animation", "/m/02hmvc": "Short Film", "/m/0jxy": "Anime", "/m/07s9rl0": "Drama"}']


TODO : preprocess the genres in order to keep only the values and discard the freebase key

## Exportation

In [85]:
folder_processed_data_path = './processed_data/'

In [86]:
display(movie_rating_df.head())
# Export DataFrame to a CSV file in the processed data folder
movie_rating_df.to_csv(os.path.join(folder_processed_data_path, 'movie_df.csv'), index=False)

Unnamed: 0,index,wiki_ID,free_ID,mov_name,release,revenue,runtime,languages,countries,genres,averageRating,numVotes
0,0,975900,/m/03vyhn,Ghosts of Mars,2001,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",4.9,56880
1,1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp...",6.0,69
2,2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D...",5.6,40
3,3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...",6.1,2891
4,4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}",5.9,623


In [87]:
display(character_filtered.reset_index().head())
# Export DataFrame to a CSV file in the processed data folder
character_filtered.reset_index().to_csv(os.path.join(folder_processed_data_path, 'character_df.csv'), index=False)

Unnamed: 0,wiki_ID,char_name,DOB,gender,height,act_name,age_at_release
0,975900,Akooshay,1958-08-26,F,1.62,Wanda De Jesus,42.0
1,975900,Lieutenant Melanie Ballard,1974-08-15,F,1.78,Natasha Henstridge,27.0
2,975900,Desolation Williams,1969-06-15,M,1.727,Ice Cube,32.0
3,975900,Sgt Jericho Butler,1967-09-12,M,1.75,Jason Statham,33.0
4,975900,Bashira Kincaid,1977-09-25,F,1.65,Clea DuVall,23.0


In [88]:
display(word_name_filtered_df.reset_index().head())
# Export DataFrame to a CSV file in the processed data folder
word_name_filtered_df.reset_index().to_csv(os.path.join(folder_processed_data_path, 'name_by_movie_df.csv'), index=False)

Unnamed: 0,wiki_ID,char_words
0,975900,Lieutenant
1,7668793,Lieutenant
2,24226493,Lieutenant
3,3388805,Lieutenant
4,8231713,Lieutenant


In [89]:
display(baby_name_with_percentage_df.reset_index().head())
# Export DataFrame to a CSV file in the processed data folder
baby_name_with_percentage_df.reset_index().to_csv(os.path.join(folder_processed_data_path, 'baby_name_df.csv'), index=False)

Unnamed: 0,name,year,number,percentage
0,Aaban,2007,5,0.000125
1,Aadam,2007,8,0.0002
2,Aadan,2007,8,0.0002
3,Aadarsh,2007,13,0.000325
4,Aaden,2007,157,0.003928


In [90]:
display(rating_df.reset_index().head())
# Export DataFrame to a CSV file in the processed data folder
rating_df.reset_index().to_csv(os.path.join(folder_processed_data_path, 'rating_df.csv'), index=False)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2004
1,tt0000002,5.8,269
2,tt0000003,6.5,1904
3,tt0000004,5.5,178
4,tt0000005,6.2,2685
