In [1]:
"""
File name: cleaning_data.py
Author: ImportNumpyAsPd
Date created: 05/11/2024
Date last modified: 05/11/2024
"""
# final version of the file

'\nFile name: cleaning_data.py\nAuthor: ImportNumpyAsPd\nDate created: 05/11/2024\nDate last modified: 05/11/2024\n'

In [2]:
# some basic imports
import os
import pandas as pd
import numpy as np

In [3]:
# Folders containing the raw CMU and IMDb files (relative to this notebook)
data_path_CMU = './CMU/'
data_path_IMDb = './IMDb/'

# Column names given to the header-less CMU files
NAMES_MOVIES = [
    'Wikipedia_movie_ID', 'Freebase_movie_ID', 'Movie_name',
    'Movie_release_date', 'Movie_box_office_revenue', 'Movie_runtime',
    'Movie_languages', 'Movie_countries', 'Movie_genres',
]
NAMES_CHARACTER = [
    'Character_Name', 'Actor_DOB', 'Actor_gender', 'Actor_height',
    'Actor_ethnicity', 'Actor_Name', 'Actor_age_at_movie_release',
    'Freebase_character_actor_map_ID', 'Freebase_character_ID',
    'Freebase_actor_ID',
]

# Column names given to the header-less IMDb files
NAMES_BASICS = ['IMDb_director_ID', 'Name_of_the_director']
NAMES_BASICS_2 = ['IMDb_title_ID', 'Type', 'Primary_title', 'Original_title']
NAMES_CREW = ['IMDb_title_ID', 'IMDb_director_ID', 'IMDb_writers_ID']
NAMES_NEW = ['IMDb_title_ID', 'Release_date', 'Runtime']


def _read_tsv(folder, file_name, column_names=None):
    """Read a tab-separated file located in ``folder``.

    When ``column_names`` is given, the file is read as header-less and the
    names are passed to pandas; otherwise the file's own first row is used
    as the header.
    """
    path = folder + file_name
    if column_names is None:
        return pd.read_csv(path, sep='\t')
    return pd.read_csv(path, sep='\t', names=column_names, header=None)


# CMU data
# NOTE(review): if character.metadata.tsv holds more than the 10 columns named
# here, pandas uses the leading unnamed columns as the index - TODO confirm
# against the raw file.
df_CMU_character = _read_tsv(data_path_CMU, 'character.metadata.tsv', NAMES_CHARACTER)
df_CMU_movies = _read_tsv(data_path_CMU, 'movie.metadata.tsv', NAMES_MOVIES)

# IMDb data
df_IMDb_name = _read_tsv(data_path_IMDb, 'name.basics_Movie.tsv', NAMES_BASICS)
df_IMDb_basics = _read_tsv(data_path_IMDb, 'title.basics_Movie.tsv', NAMES_BASICS_2)
df_IMDb_crew = _read_tsv(data_path_IMDb, 'title.crew_Movie.tsv', NAMES_CREW)
# The ratings file keeps its own header; only the title key is renamed.
# NOTE(review): the new name 'IMDb title ID' contains spaces whereas the other
# IMDb frames use 'IMDb_title_ID' - confirm this is intended before merging on it.
df_IMDb_ratings = _read_tsv(data_path_IMDb, 'title.ratings.tsv').rename(
    columns={'tconst': 'IMDb title ID'}
)
df_IMDb_new = _read_tsv(data_path_IMDb, 'title.basics_Thomas.tsv', NAMES_NEW)

In [4]:
# Map each dataframe's variable name to the dataframe itself
df_all = dict(
    df_CMU_character=df_CMU_character,
    df_CMU_movies=df_CMU_movies,
    df_IMDb_name=df_IMDb_name,
    df_IMDb_basics=df_IMDb_basics,
    df_IMDb_crew=df_IMDb_crew,
    df_IMDb_ratings=df_IMDb_ratings,
    df_IMDb_new=df_IMDb_new,
)
# Names of the datasets, in the same order as the dictionary keys
datasets = list(df_all)

In [5]:
# Before cleaning the data, first take a detailed look at it

In [6]:
# Report the element count (.size) and the shape of every loaded dataframe
for name, df_i in df_all.items():
    summary = f"Name of the dataframe : {name}, its size : {df_i.size}, its shape : {df_i.shape}"
    print(summary)

Name of the dataframe : df_CMU_character, its size : 4506690, its shape : (450669, 10)
Name of the dataframe : df_CMU_movies, its size : 735669, its shape : (81741, 9)
Name of the dataframe : df_IMDb_name, its size : 413024, its shape : (206512, 2)
Name of the dataframe : df_IMDb_basics, its size : 2783768, its shape : (695942, 4)
Name of the dataframe : df_IMDb_crew, its size : 2039205, its shape : (679735, 3)
Name of the dataframe : df_IMDb_ratings, its size : 4480260, its shape : (1493420, 3)
Name of the dataframe : df_IMDb_new, its size : 2088990, its shape : (696330, 3)


In [7]:
# Show the column dtypes of every dataframe, one separated section per frame
for name, df_i in df_all.items():
    print(
        f"Types in the dataframe: {name}",
        df_i.dtypes,
        "-----------------------------------------",
        sep="\n",
    )

Types in the dataframe: df_CMU_character
Character_Name                      object
Actor_DOB                           object
Actor_gender                        object
Actor_height                       float64
Actor_ethnicity                     object
Actor_Name                          object
Actor_age_at_movie_release         float64
Freebase_character_actor_map_ID     object
Freebase_character_ID               object
Freebase_actor_ID                   object
dtype: object
-----------------------------------------
Types in the dataframe: df_CMU_movies
Wikipedia_movie_ID            int64
Freebase_movie_ID            object
Movie_name                   object
Movie_release_date           object
Movie_box_office_revenue    float64
Movie_runtime               float64
Movie_languages              object
Movie_countries              object
Movie_genres                 object
dtype: object
-----------------------------------------
Types in the dataframe: df_IMDb_name
IMDb_director_ID  

In [8]:
# Count the missing values per column in every dataframe
print("Look for any NaN values")
for name, df_i in df_all.items():
    missing_per_column = df_i.isna().sum()
    print(
        f"Name of the dataframe : {name}",
        missing_per_column,
        "----------------------",
        sep="\n",
    )

Look for any NaN values
Name of the dataframe : df_CMU_character
Character_Name                     257875
Actor_DOB                          106145
Actor_gender                        45609
Actor_height                       295845
Actor_ethnicity                    344611
Actor_Name                           1228
Actor_age_at_movie_release         158113
Freebase_character_actor_map_ID         0
Freebase_character_ID              257865
Freebase_actor_ID                     815
dtype: int64
----------------------
Name of the dataframe : df_CMU_movies
Wikipedia_movie_ID              0
Freebase_movie_ID               0
Movie_name                      0
Movie_release_date           6902
Movie_box_office_revenue    73340
Movie_runtime               20450
Movie_languages                 0
Movie_countries                 0
Movie_genres                    0
dtype: int64
----------------------
Name of the dataframe : df_IMDb_name
IMDb_director_ID        0
Name_of_the_director    0
dtype: int

In [9]:
# The IMDb datasets showing no NaN values is suspicious.
# For df_IMDb_name and df_IMDb_crew this is expected, since they are keyed by unique IDs.
# For df_IMDb_basics, NaN values seem to appear where expected, only in very small numbers.
# Let's look at the distinct primary titles to check for anything odd.
# (The printed label now names df_IMDb_basics, the frame that is actually inspected.)
print("Looking for strange outcome in df_IMDb_basics dataset")
primary_title_counts = df_IMDb_basics.Primary_title.value_counts()
print(primary_title_counts.index)

Looking for strange outcome in df_IMDb_name dataset
Index(['Broken', 'Home', 'Alone', 'Mother', 'Trapped', 'Homecoming', 'Hamlet',
       'Paradise', 'Love', 'Untitled',
       ...
       'Al eeteraf al akhir', 'Dreams Awake', 'Viking Warrior Women',
       'Csillag a máglyán', 'La colo', 'Il canto di Circe', 'Butch Jamie',
       'Buried in Tucson', 'The Big Year', 'Chico Albuquerque - Revelações'],
      dtype='object', name='Primary_title', length=598900)


In [10]:
# The titles look fine; next, inspect the distinct values in df_IMDb_ratings
print("Looking for strange outcome in df_IMDb_ratings dataset")
rating_values = df_IMDb_ratings['averageRating'].value_counts().index
vote_values = df_IMDb_ratings['numVotes'].value_counts().index
print(
    "see averageRatings column",
    rating_values,
    "-----------------------",
    "numVotes column",
    vote_values,
    sep="\n",
)


Looking for strange outcome in df_IMDb_ratings dataset
see averageRatings column
Index([ 7.2,  7.4,  7.6,  7.8,  7.0,  7.5,  7.3,  8.0,  6.8,  7.7,  7.1,  7.9,
        8.2,  6.6,  6.9,  6.7,  8.1,  6.4,  6.2,  6.5,  6.3,  6.0,  8.3,  8.4,
        5.8,  6.1,  8.5,  8.6,  5.9,  5.6,  5.7,  8.7,  8.8,  5.4,  5.5,  5.2,
        5.3,  9.0,  5.0,  8.9,  5.1,  4.8,  9.2,  9.1,  4.9,  4.6,  4.7,  4.4,
        9.4,  4.5,  4.2,  9.3, 10.0,  4.3,  4.0,  4.1,  9.6,  9.5,  3.8,  3.9,
        3.6,  3.7,  9.8,  9.7,  3.4,  3.5,  3.2,  3.3,  3.0,  3.1,  2.8,  9.9,
        2.9,  2.7,  1.0,  2.6,  2.5,  2.4,  2.3,  2.2,  2.0,  2.1,  1.8,  1.9,
        1.7,  1.5,  1.2,  1.6,  1.4,  1.3,  1.1],
      dtype='float64', name='averageRating')
-----------------------
numVotes column
Index([     7,      8,      9,      6,     10,     11,     12,     13,      5,
           14,
       ...
        45291,  14961,  28910, 107406, 102425,  50145,  45412, 269095,  25650,
       138123],
      dtype='int64', name='numV

In [11]:
# The ratings dtypes are int or float, which looks good.
# Now inspect the distinct values of df_IMDb_new.
# (The labels below name the columns actually printed, Release_date and Runtime.)
print("Looking for strange outcome in df_IMDb_new dataset")
release_date_counts = df_IMDb_new.Release_date.value_counts()
runtime_counts = df_IMDb_new.Runtime.value_counts()
print("see Release_date column")
print(release_date_counts.index)
print("-----------------------")
print("Runtime column")
print(runtime_counts.index)


Looking for strange outcome in df_IMDb_new dataset
see averageRatings column
Index(['\N', '2022', '2023', '2018', '2019', '2017', '2021', '2016', '2015',
       '2014',
       ...
       '2029', '1903', '1902', '1897', '1904', '2030', '1894', '1896', '2028',
       '2031'],
      dtype='object', name='Release_date', length=138)
-----------------------
numVotes column
Index(['\N', '90', '80', '60', '85', '95', '100', '88', '92', '93',
       ...
       '950', '468', '1320', '435', '28643', '1325', '384', '570', '368',
       '990'],
      dtype='object', name='Runtime', length=519)


In [12]:
# The columns above have dtype=object where int or float was expected, and the
# index contains '\N': these look like "false" NaN values, so turn them into
# real NaN values.

# Literal backslash-N marker used for missing entries in df_IMDb_new
MISSING_MARKER = '\\N'
# Replaced in place so that the object referenced from df_all is updated too
df_IMDb_new.replace(MISSING_MARKER, np.nan, inplace=True)

In [13]:
# Did it work? Count the missing values per column of df_IMDb_new
nan_per_column = df_IMDb_new.isna().sum()
print(nan_per_column)

IMDb_title_ID         0
Release_date     101163
Runtime          257955
dtype: int64


In [14]:
# Now, we can work on the data

In [16]:
# 1)
# Trim the large IMDb datasets a bit.
# First, check that CMU contains no movie released after 2016: the release
# dates are compared as strings, and an empty result confirms it.
released_after_2016 = df_CMU_movies['Movie_release_date'] >= '2017'
df_CMU_movies.loc[released_after_2016]

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres


In [26]:
# Keep only the IMDb titles released in 2016 or earlier.
# Release_date holds year strings, so this is a string comparison; rows whose
# Release_date is NaN fail the comparison and are therefore dropped as well.
up_to_2016 = df_IMDb_new['Release_date'] <= '2016'
df_IMDb_date = df_IMDb_new.loc[up_to_2016]
n_removed = len(df_IMDb_new) - len(df_IMDb_date)
print(f"We have deleted : {n_removed} nbr of rows")

We have deleted : 253208 nbr of rows


In [33]:
# 2)
# Merge IMDb_date with IMDb_basics.

# Inner join on the title ID. The key and join type are stated explicitly so
# the merge does not silently change if the two frames ever come to share
# another column name.
df_IMDb_basic = pd.merge(df_IMDb_date, df_IMDb_basics, on='IMDb_title_ID', how='inner')
# Row-count difference between each input frame and the merged result
print(df_IMDb_date.shape[0]-df_IMDb_basic.shape[0])
print(df_IMDb_basics.shape[0]-df_IMDb_basic.shape[0])
# is_unique is True when no title ID appears more than once after the merge
print(f"Are they any duplicates ? If true it means no duplicated : {df_IMDb_basic['IMDb_title_ID'].is_unique}")

90
252910
Are they any duplicates ? If true it means no duplicated : True


In [None]:
# Note that only a reasonable number of films was lost in the merge.
# The two datasets were not downloaded at the same time,
# which explains why 90+252910 != 253208

In [None]:
# 3) Merge CMU movies with IMDb basic dataset

# Here, we will try to merge them based on the movie names.
# Since the movie names can differ between the two databases due to punctuation or
# style of writings, lets first try to modify the title of both dataset to merge them on