In [1]:
"""
File name: cleaning_data.py
Author: ImportNumpyAsPd
Date created: 05/11/2024
Date last modified: 05/11/2024
"""
#fichier final 

'\nFile name: cleaning_data.py\nAuthor: ImportNumpyAsPd\nDate created: 05/11/2024\nDate last modified: 05/11/2024\n'

In [2]:
# some basic imports
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
#path data 
data_path_CMU = './CMU/'
data_path_IMDb = './IMDb/'

#Names of columns for the data CMU
NAMES_MOVIES = ['Wikipedia_movie_ID','Freebase_movie_ID','Movie_name','Movie_release_date','Movie_box_office_revenue','Movie_runtime','Movie_languages','Movie_countries','Movie_genres']
NAMES_CHARACTER = ['Character_Name','Actor_DOB','Actor_gender','Actor_height','Actor_ethnicity','Actor_Name','Actor_age_at_movie_release','Freebase_character_actor_map_ID', 'Freebase_character_ID', 'Freebase_actor_ID']

#Names of columns for the data IMDb
NAMES_BASICS = ['IMDb_director_ID', 'Name_of_the_director']
NAMES_BASICS_2 = ['IMDb_title_ID', 'Type', 'Primary_title', 'Original_title']
NAMES_CREW = ['IMDb_title_ID','IMDb_director_ID', 'IMDb_writers_ID']
NAMES_NEW = ['IMDb_title_ID', 'Release_date','Runtime']

#Read data CMU 
df_CMU_character = pd.read_csv(data_path_CMU+'character.metadata.tsv', sep='\t', names = NAMES_CHARACTER, header = None)
df_CMU_movies = pd.read_csv(data_path_CMU+'movie.metadata.tsv', sep='\t', names = NAMES_MOVIES, header = None)

#Read data IMDb
df_IMDb_name = pd.read_csv(data_path_IMDb+'name.basics_Movie.tsv', sep='\t', names = NAMES_BASICS, header = None)
df_IMDb_basics = pd.read_csv(data_path_IMDb+'title.basics_Movie.tsv', sep='\t', names = NAMES_BASICS_2, header = None)
df_IMDb_crew = pd.read_csv(data_path_IMDb+'title.crew_Movie.tsv', sep='\t', names = NAMES_CREW, header = None)
df_IMDb_ratings = pd.read_csv(data_path_IMDb+'title.ratings.tsv', sep='\t')
df_IMDb_ratings.rename(columns={'tconst': 'IMDb_title_ID'}, inplace=True)
df_IMDb_new = pd.read_csv(data_path_IMDb+'title.basics_Thomas.tsv', sep='\t', names = NAMES_NEW, header = None)

In [4]:
#Name of the datasets
datasets = ['df_CMU_character','df_CMU_movies','df_IMDb_name','df_IMDb_basics','df_IMDb_crew',
            'df_IMDb_ratings', 'df_IMDb_new']
#Make it a dictionary
df_all = {'df_CMU_character': df_CMU_character,'df_CMU_movies': df_CMU_movies,
          'df_IMDb_name': df_IMDb_name,'df_IMDb_basics': df_IMDb_basics,
          'df_IMDb_crew': df_IMDb_crew,'df_IMDb_ratings': df_IMDb_ratings,'df_IMDb_new':df_IMDb_new}

In [5]:
# before cleaning the data, first look in detail to data

In [6]:
# printing the size and shape of the different dataframes :
for name, df_i in df_all.items():
    print(f"Name of the dataframe : {name}, its size : {df_i.size}, its shape : {df_i.shape}")

Name of the dataframe : df_CMU_character, its size : 4506690, its shape : (450669, 10)
Name of the dataframe : df_CMU_movies, its size : 735669, its shape : (81741, 9)
Name of the dataframe : df_IMDb_name, its size : 413024, its shape : (206512, 2)
Name of the dataframe : df_IMDb_basics, its size : 2783768, its shape : (695942, 4)
Name of the dataframe : df_IMDb_crew, its size : 2039205, its shape : (679735, 3)
Name of the dataframe : df_IMDb_ratings, its size : 4480260, its shape : (1493420, 3)
Name of the dataframe : df_IMDb_new, its size : 2088990, its shape : (696330, 3)


In [7]:
# printing the type in the different dataframes : 
for name, df_i in df_all.items():
    print(f"Types in the dataframe: {name}")
    print(df_i.dtypes)
    print("-----------------------------------------")

Types in the dataframe: df_CMU_character
Character_Name                      object
Actor_DOB                           object
Actor_gender                        object
Actor_height                       float64
Actor_ethnicity                     object
Actor_Name                          object
Actor_age_at_movie_release         float64
Freebase_character_actor_map_ID     object
Freebase_character_ID               object
Freebase_actor_ID                   object
dtype: object
-----------------------------------------
Types in the dataframe: df_CMU_movies
Wikipedia_movie_ID            int64
Freebase_movie_ID            object
Movie_name                   object
Movie_release_date           object
Movie_box_office_revenue    float64
Movie_runtime               float64
Movie_languages              object
Movie_countries              object
Movie_genres                 object
dtype: object
-----------------------------------------
Types in the dataframe: df_IMDb_name
IMDb_director_ID  

In [8]:
# look for any NaN values
print("Look for any NaN values")
for name, df_i in df_all.items():
    print(f"Name of the dataframe : {name}")
    print(df_i.isnull().sum())
    print("----------------------")

Look for any NaN values
Name of the dataframe : df_CMU_character
Character_Name                     257875
Actor_DOB                          106145
Actor_gender                        45609
Actor_height                       295845
Actor_ethnicity                    344611
Actor_Name                           1228
Actor_age_at_movie_release         158113
Freebase_character_actor_map_ID         0
Freebase_character_ID              257865
Freebase_actor_ID                     815
dtype: int64
----------------------
Name of the dataframe : df_CMU_movies
Wikipedia_movie_ID              0
Freebase_movie_ID               0
Movie_name                      0
Movie_release_date           6902
Movie_box_office_revenue    73340
Movie_runtime               20450
Movie_languages                 0
Movie_countries                 0
Movie_genres                    0
dtype: int64
----------------------
Name of the dataframe : df_IMDb_name
IMDb_director_ID        0
Name_of_the_director    0
dtype: int

In [9]:
#The fact the IMDb datasets do not have any NaN is strange, 
#for df_IMDb_name and df_IMDb_crew it is normal, since ID is unique
#for df_IMDb_basics, its seem that there is NaN value where it should be, just very low
#let's have a look
print("Looking for strange outcome in df_IMDb_name dataset")
i = df_IMDb_basics.Primary_title.value_counts()
print(i.index)

Looking for strange outcome in df_IMDb_name dataset
Index(['Broken', 'Home', 'Alone', 'Mother', 'Trapped', 'Homecoming', 'Hamlet',
       'Paradise', 'Love', 'Untitled',
       ...
       'Al eeteraf al akhir', 'Dreams Awake', 'Viking Warrior Women',
       'Csillag a máglyán', 'La colo', 'Il canto di Circe', 'Butch Jamie',
       'Buried in Tucson', 'The Big Year', 'Chico Albuquerque - Revelações'],
      dtype='object', name='Primary_title', length=598900)


In [10]:
#looks okay
#now for df_IMDb_ratings
print("Looking for strange outcome in df_IMDb_ratings dataset")
a = df_IMDb_ratings.averageRating.value_counts()
b = df_IMDb_ratings.numVotes.value_counts()
print("see averageRatings column")
print(a.index)
print("-----------------------")
print("numVotes column")
print(b.index)


Looking for strange outcome in df_IMDb_ratings dataset
see averageRatings column
Index([ 7.2,  7.4,  7.6,  7.8,  7.0,  7.5,  7.3,  8.0,  6.8,  7.7,  7.1,  7.9,
        8.2,  6.6,  6.9,  6.7,  8.1,  6.4,  6.2,  6.5,  6.3,  6.0,  8.3,  8.4,
        5.8,  6.1,  8.5,  8.6,  5.9,  5.6,  5.7,  8.7,  8.8,  5.4,  5.5,  5.2,
        5.3,  9.0,  5.0,  8.9,  5.1,  4.8,  9.2,  9.1,  4.9,  4.6,  4.7,  4.4,
        9.4,  4.5,  4.2,  9.3, 10.0,  4.3,  4.0,  4.1,  9.6,  9.5,  3.8,  3.9,
        3.6,  3.7,  9.8,  9.7,  3.4,  3.5,  3.2,  3.3,  3.0,  3.1,  2.8,  9.9,
        2.9,  2.7,  1.0,  2.6,  2.5,  2.4,  2.3,  2.2,  2.0,  2.1,  1.8,  1.9,
        1.7,  1.5,  1.2,  1.6,  1.4,  1.3,  1.1],
      dtype='float64', name='averageRating')
-----------------------
numVotes column
Index([     7,      8,      9,      6,     10,     11,     12,     13,      5,
           14,
       ...
        45291,  14961,  28910, 107406, 102425,  50145,  45412, 269095,  25650,
       138123],
      dtype='int64', name='numV

In [11]:
#dtype are int or float, looks good
#now lets see df_IMDb_new
print("Looking for strange outcome in df_IMDb_new dataset")
c = df_IMDb_new.Release_date.value_counts()
d = df_IMDb_new.Runtime.value_counts()
print("see averageRatings column")
print(c.index)
print("-----------------------")
print("numVotes column")
print(d.index)


Looking for strange outcome in df_IMDb_new dataset
see averageRatings column
Index(['\N', '2022', '2023', '2018', '2019', '2017', '2021', '2016', '2015',
       '2014',
       ...
       '2029', '1903', '1902', '1897', '1904', '2030', '1894', '1896', '2028',
       '2031'],
      dtype='object', name='Release_date', length=138)
-----------------------
numVotes column
Index(['\N', '90', '80', '60', '85', '95', '100', '88', '92', '93',
       ...
       '950', '468', '1320', '435', '28643', '1325', '384', '570', '368',
       '990'],
      dtype='object', name='Runtime', length=519)


In [12]:
# oh oh, we have dtype = object, where it should be int or float here, 
# and we see that there is \N that appears in the index -> looks like "false" NaN values, 
# so lets replace them by reel NaN values

# replace \N with NaN in df_IMDb_new
df_IMDb_new.replace(to_replace='\\N', value=np.nan, inplace=True)

In [13]:
# Did it worked ?
print(df_IMDb_new.isnull().sum())

IMDb_title_ID         0
Release_date     101163
Runtime          257955
dtype: int64


In [14]:
# Now, we can work on the data

In [15]:
# 1) 
# Let's cut a little bit our huge IMDb datasets
# First, we see that there is no movies in CMU that are above 2016 
df_CMU_movies[df_CMU_movies['Movie_release_date']>='2017']

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres


In [16]:
# So, lets cut IMDb_new for any movies above 2016
df_IMDb_date = df_IMDb_new[df_IMDb_new['Release_date']<='2016']
print(f"We have deleted : {df_IMDb_new.Release_date.shape[0] - df_IMDb_date.Release_date.shape[0]} nbr of rows")

We have deleted : 253208 nbr of rows


In [17]:
# 2) 
# Now, lets merge IMDb_date with IMDb_basics

# merge on IMDb title ID (by default)
df_IMDb_big = pd.merge(df_IMDb_date, df_IMDb_basics)
print(df_IMDb_date.shape[0]-df_IMDb_big.shape[0])
print(df_IMDb_basics.shape[0]-df_IMDb_big.shape[0])
print(f"Are they any duplicates ? If true it means no duplicated : {df_IMDb_big['IMDb_title_ID'].is_unique}")

90
252910
Are they any duplicates ? If true it means no duplicated : True


In [18]:
# See that we have lost a reasonable amount of film 
# the fact that the two dataset have not been downloaded at the same time 
# explains why 90+252910 != 253208

In [19]:
df_IMDb_big.sample(5)

Unnamed: 0,IMDb_title_ID,Release_date,Runtime,Type,Primary_title,Original_title
216439,tt0805625,2006,117,movie,Sun Scarred,Taiyô no kizu
120480,tt0200971,1962,109,movie,I Never Forgot You,Pote de se xehasa
432905,tt8020560,2011,46,movie,The Nip Tuck Trip,The Nip Tuck Trip
108591,tt0179072,1998,125,movie,Amazing Sex Talk,Amazing Sex Talk
26629,tt0038211,1945,84,movie,The Invisible Army,Den usynlige hær


In [20]:
df_IMDb_big['Type'].value_counts()

Type
movie    443032
Name: count, dtype: int64

In [21]:
# we can see that this column does not give any infos
# we can delete it
df_IMDb_big = df_IMDb_big.drop('Type', axis=1)
df_IMDb_big.sample(5)

Unnamed: 0,IMDb_title_ID,Release_date,Runtime,Primary_title,Original_title
439384,tt9232252,1926,,Shinshu danjî no ikî,Shinshu danjî no ikî
83456,tt0122482,1954,93.0,Et eventyr om tre,Et eventyr om tre
15082,tt0023646,1932,80.0,Uptown New York,Uptown New York
1110,tt0005464,1915,,Helene of the North,Helene of the North
238451,tt11422360,1970,,The City of Sin,Shahr-e gonah


In [22]:
# 3) Merge CMU movies with IMDb basic dataset

# Here, we will try to merge them based on the movie names 
# Since the movies names can be different from both database due to ponctuation or 
# style of writings, lets first try to modify the title of both dataset

In [23]:
# Make a function for that

# needed imports
import string

# Change title to put it in a other new column called "Movie_title"
def change_title(df, title_column='Movie_name') : 
    table = str.maketrans('', '', string.punctuation) 
    #make sur everything is in string
    df[title_column] = df[title_column].astype(str)
    df['Movie_title'] = df[title_column].apply(lambda r: r.lower().translate(table).replace(' ',''))
    return df

In [24]:
change_title(df_IMDb_big, 'Primary_title')
df_IMDb_big.sample(5)

Unnamed: 0,IMDb_title_ID,Release_date,Runtime,Primary_title,Original_title,Movie_title
395154,tt4800024,2015,52.0,"Hubert de Givenchy, un destin Haute Couture","Hubert de Givenchy, un destin Haute Couture",hubertdegivenchyundestinhautecouture
7624,tt0014355,1923,,Petit hôtel à louer,Petit hôtel à louer,petithôtelàlouer
177498,tt0349122,1989,,Bahurani,Bahurani,bahurani
393511,tt4700538,2015,100.0,Hollywood Road Trip,Hollywood Road Trip,hollywoodroadtrip
415440,tt6009274,1976,,Bondini,Bondini,bondini


In [25]:
change_title(df_CMU_movies, 'Movie_name')
df_CMU_movies.sample(5)

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres,Movie_title
64156,15325765,/m/03m506k,Third Man on the Mountain,1959-11-10,,107.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/07s9rl0"": ""Drama"", ""/m/0hqxf"": ""Family Fi...",thirdmanonthemountain
5618,16168156,/m/03wbs0z,Kudaikul Mazhai,2004,,,"{""/m/07c9s"": ""Tamil Language""}","{""/m/03rk0"": ""India""}","{""/m/02l7c8"": ""Romance Film""}",kudaikulmazhai
22417,15210434,/m/03hlzbp,Where in the World is Osama Bin Laden?,2008-04-18,384955.0,93.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/0cshrf"": ""Political cinema"", ""/m/0hj3n07""...",whereintheworldisosamabinladen
80249,9796711,/m/02psgy6,Seven Years in Tibet,1997-09-13,131457682.0,136.0,"{""/m/01kbdv"": ""Tibetan languages"", ""/m/02h40lc...","{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/017fp"": ""Biography"", ""/m/03k9fj"": ""Advent...",sevenyearsintibet
61077,5315403,/m/0df3kl,Full Circle,1996,,22.3,{},{},"{""/m/02hmvc"": ""Short Film"", ""/m/0jtdp"": ""Docum...",fullcircle


In [26]:
# Now, merge them on 'Movie_title' and drop the new column since not useful
merge_CMU_IMDb = pd.merge(df_CMU_movies, df_IMDb_big, on = 'Movie_title').drop('Movie_title', axis=1)

In [27]:
# Now, lets see what we got 

print(f"The shape of the CMU dataset was: {df_CMU_movies.shape[0]}, after merging we have : {merge_CMU_IMDb.shape[0]}.")
print(f"Is the merge files contains only unique films ? : {merge_CMU_IMDb['Wikipedia_movie_ID'].is_unique}")
print("If false, mean that there is duplicates")

The shape of the CMU dataset was: 81741, after merging we have : 116158.
Is the merge files contains only unique films ? : False
If false, mean that there is duplicates


In [28]:
# look for duplicates 
duplicates = merge_CMU_IMDb[merge_CMU_IMDb.duplicated(subset = 'Wikipedia_movie_ID', keep = False)]
duplicates.shape


(76194, 14)

In [29]:
# we have a lot of duplicates, pas ouf ouf 

# before dealing with duplicates, that are stored, lets remove them from the 
# datasets, so we have at least something to work on

first_dataset = merge_CMU_IMDb.drop_duplicates(['Wikipedia_movie_ID'], keep = False)
first_dataset['Wikipedia_movie_ID'].is_unique

True

In [30]:
first_dataset.shape

(39964, 14)

In [31]:
first_dataset.sample(3)

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres,IMDb_title_ID,Release_date,Runtime,Primary_title,Original_title
57933,9104281,/m/027xrtf,Casanova Brown,1944,,94.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06cvj"": ""Romantic comedy"", ""/m/02l7c8"": ""...",tt0036699,1944,94,Casanova Brown,Casanova Brown
72883,2448004,/m/07dsw1,The Day the Clown Cried,1972,,,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/07s9rl0"": ""Drama""}",tt0068451,1972,90,The Day the Clown Cried,The Day the Clown Cried
52138,1631192,/m/05j2wd,Billy the Kid and the Green Baize Vampire,1985,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/04t36"": ""Musical"", ""/m/01lrrt"": ""Melodram...",tt0088807,1985,93,Billy the Kid and the Green Baize Vampire,Billy the Kid and the Green Baize Vampire


In [32]:
# bon, on a perdu la moitié du dataset mais pour l'instant je peux
# rien faire de plus, j'ai des idées mais j'ai plus le temps
# je vous laisse travailler avec ce 'petit' dataset

In [33]:
# juste quelques manips pour le rendre plus joli (j avais mis les runtime et tout pour
# faire des comparaisons, mais vu que j'ai pas le temps je les tej du dataframe pour 
# l'instant, mais ca devait etre utile a la base)

In [34]:
"""
# tej des colonnes inutiles car redondantes
column_to_drop = ['Release_date','Runtime','Primary_title','Original_title']
for i in column_to_drop : 
    first_dataset = first_dataset.drop(i, axis =1)
first_dataset.sample(2)
"""

"\n# tej des colonnes inutiles car redondantes\ncolumn_to_drop = ['Release_date','Runtime','Primary_title','Original_title']\nfor i in column_to_drop : \n    first_dataset = first_dataset.drop(i, axis =1)\nfirst_dataset.sample(2)\n"

In [35]:
# mtn vous avez le CMU corpus avec le film IMDb ID en plus, vous pouvez 
# donc faire des liens avec le reste des datasets df_(IMDb_crew, IMDb_ratings et IMDb_name)

In [36]:
first_dataset['IMDb_title_ID'].is_unique

False

In [37]:
#dupli = duplicates.copy()
#dupli['Runtime'] = dupli['Runtime'].astype(float)


In [38]:
"""
test2 = pd.DataFrame({'value':[32000219, 14443291, 25325943, 31316162, 5187766, 22804261, 30473086, 18732394,14904858]})

test4=pd.DataFrame()
count = 0
no_match_found = []
no_match_found_empty = []
for i in test2['value']:
    test3 = pd.DataFrame()
    if not duplicates[duplicates['Wikipedia_movie_ID']==i].empty :
        for index, row in dupli[dupli['Wikipedia_movie_ID']==i].iterrows() :
            year_d = pd.to_datetime(row['Movie_release_date'], errors = 'coerce')
            year_x = pd.to_datetime(row['Release_date'], errors = 'coerce')
            no_match = row['Wikipedia_movie_ID']
            if str(row['Movie_runtime'])==str(row['Runtime']) and year_d.year==year_x.year :
                test3 = pd.concat([test3, row.to_frame().T], ignore_index=True)
                break
            elif year_d.year==year_x.year :
                test3 = pd.concat([test3, row.to_frame().T], ignore_index=True) 
        if test3.empty :
            no_match_found_empty.append(no_match)
        elif not test3['Wikipedia_movie_ID'].is_unique :
            test3 = pd.DataFrame()
            no_match_found.append(no_match)
    else : 
        count +=1
        
    test4 = pd.concat([test4, test3], ignore_index=True)


print(count)
print(no_match_found)
print(no_match_found_empty)
test4.head(10)
"""

"\ntest2 = pd.DataFrame({'value':[32000219, 14443291, 25325943, 31316162, 5187766, 22804261, 30473086, 18732394,14904858]})\n\ntest4=pd.DataFrame()\ncount = 0\nno_match_found = []\nno_match_found_empty = []\nfor i in test2['value']:\n    test3 = pd.DataFrame()\n    if not duplicates[duplicates['Wikipedia_movie_ID']==i].empty :\n        for index, row in dupli[dupli['Wikipedia_movie_ID']==i].iterrows() :\n            year_d = pd.to_datetime(row['Movie_release_date'], errors = 'coerce')\n            year_x = pd.to_datetime(row['Release_date'], errors = 'coerce')\n            no_match = row['Wikipedia_movie_ID']\n            if str(row['Movie_runtime'])==str(row['Runtime']) and year_d.year==year_x.year :\n                test3 = pd.concat([test3, row.to_frame().T], ignore_index=True)\n                break\n            elif year_d.year==year_x.year :\n                test3 = pd.concat([test3, row.to_frame().T], ignore_index=True) \n        if test3.empty :\n            no_match_found_empt

In [39]:

# Loop finale
test=pd.DataFrame()
osefe = 0
count = 0
no_match_found_wiki = []
no_match_found_empty_wiki = []
no_match_found_imdb = []
no_match_found_empty_imdb = []
for i in df_CMU_movies['Wikipedia_movie_ID']:
    test2 = pd.DataFrame()
    test3 = pd.DataFrame()
    if not duplicates[duplicates['Wikipedia_movie_ID']==i].empty :
        for index, row in duplicates[duplicates['Wikipedia_movie_ID']==i].iterrows() :
            year_d = pd.to_datetime(row['Movie_release_date'], errors = 'coerce')
            year_x = pd.to_datetime(row['Release_date'], errors = 'coerce')
            no_match_wiki = row['Wikipedia_movie_ID']
            no_match_imdb = row['IMDb_title_ID']
            if str(float(row['Movie_runtime']))==str(float(row['Runtime'])) and year_d.year==year_x.year :
                test = pd.concat([test, row.to_frame().T], ignore_index=True)
                test3 = pd.concat([test3, row.to_frame().T], ignore_index=True)
                break
            elif year_d.year==year_x.year :
                test2 = pd.concat([test2, row.to_frame().T], ignore_index=True) 
        if test2.empty and test3.empty :
            no_match_found_empty_wiki.append(no_match_wiki)
            no_match_found_empty_imdb.append(no_match_imdb)
        elif test2.empty : 
            osefe+=1
        elif not test3.empty : 
            test2 = pd.DataFrame()
        elif not test2['Wikipedia_movie_ID'].is_unique and test3.empty:
            test2 = pd.DataFrame()
            no_match_found_wiki.append(no_match_wiki)
            no_match_found_imdb.append(no_match_imdb)
        test = pd.concat([test, test2], ignore_index=True)
    else : 
        count +=1


In [40]:
print(count)
print(len(no_match_found_wiki))
print(len(no_match_found_empty_wiki))
print(len(no_match_found_imdb))
print(len(no_match_found_empty_imdb))
print(test.shape)
test.head(10)


64225
262
4391
262
4391
(12863, 14)


Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres,IMDb_title_ID,Release_date,Runtime,Primary_title,Original_title
0,171005,/m/016ywb,Henry V,1989-11-08,10161099.0,137.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/04xvh5"": ""Costume drama"", ""/m/082gq"": ""Wa...",tt0097499,1989,137.0,Henry V,Henry V
1,31983669,/m/0g4_n3m,Road to Life,1931-09-30,,104.0,"{""/m/06b_j"": ""Russian Language""}","{""/m/05vz3zq"": ""Soviet Union""}","{""/m/07s9rl0"": ""Drama""}",tt0022289,1931,105.0,Road to Life,Putyovka v zhizn
2,156558,/m/014k4y,Baby Boy,2001-06-27,29381649.0,123.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D...",tt0255819,2001,130.0,Baby Boy,Baby Boy
3,26878691,/m/0f400r,Mysterious Island,1982,,100.0,"{""/m/0653m"": ""Standard Mandarin""}","{""/m/03h64"": ""Hong Kong""}","{""/m/03btsm8"": ""Action/Adventure"", ""/m/08322"":...",tt0084373,1982,,Mysterious Island,Mysterious Island
4,11633165,/m/02rm6l8,Innocence,1997,,110.0,"{""/m/02hwyss"": ""Turkish Language""}","{""/m/01znc_"": ""Turkey""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D...",tt0128332,1997,110.0,Innocence,Masumiyet
5,12053509,/m/02vn81r,Loverboy,1989-04-28,3960327.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06cvj"": ""Romantic comedy"", ""/m/02l7c8"": ""...",tt0097790,1989,98.0,Loverboy,Loverboy
6,29198000,/m/0dll97s,Chandra Mukhi,1993-10-22,,,"{""/m/03k50"": ""Hindi Language""}",{},"{""/m/02l7c8"": ""Romance Film"", ""/m/07s9rl0"": ""D...",tt0106541,1993,152.0,Chandra Mukhi,Chandra Mukhi
7,5915856,/m/0fdc85,Ivar,2003-10,,,"{""/m/0999q"": ""Malayalam Language""}","{""/m/03rk0"": ""India""}","{""/m/0n6m8vw"": ""Malayalam Cinema"", ""/m/02kdv5l...",tt0378118,2003,,Ivar,Ivar
8,2647998,/m/07v6f3,The Human Tornado,1976,,86.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/03btsm8"": ""Action/Adventure"", ""/m/0hj3l_y...",tt0074653,1976,96.0,The Human Tornado,The Human Tornado
9,1254263,/m/04mhwd,Shivers,1975-10-10,,89.0,"{""/m/02h40lc"": ""English Language""}","{""/m/0d060g"": ""Canada""}","{""/m/06n90"": ""Science Fiction"", ""/m/03npn"": ""H...",tt0073705,1975,87.0,Shivers,Shivers


In [42]:
#On refait la même pour les IMDb duplicates

In [43]:
duplicates_2 = first_dataset[first_dataset.duplicated(subset = 'IMDb_title_ID', keep = False)]
duplicates_3 = test[test.duplicated(subset = 'IMDb_title_ID', keep = False)]

In [44]:
print(duplicates_2.shape)
print(duplicates_3.shape)

(1259, 14)
(101, 14)


In [46]:
duplicates_2 = pd.concat([duplicates_2, duplicates_3], ignore_index=True)
print(duplicates_2.shape)

(1360, 14)


In [47]:
#mtn on drop les duplicates IMDb 
second_dataset = first_dataset.drop_duplicates(['IMDb_title_ID'], keep = False)
second_dataset.shape

(38705, 14)

In [48]:
#mtn on drop les duplicates IMDb 
second_test = test.drop_duplicates(['IMDb_title_ID'], keep = False)
second_test.shape

(12762, 14)

In [49]:
second_test['IMDb_title_ID'].is_unique

True

In [50]:
#on fait la même 

# Loop finale
test_2=pd.DataFrame()
count_2 = 0
osef = 0
no_match_found_2_wiki = []
no_match_found_empty_2_wiki = []
no_match_found_2_imdb = []
no_match_found_empty_2_imdb = []
for i in df_IMDb_big['IMDb_title_ID']:
    test2 = pd.DataFrame()
    test3 = pd.DataFrame()
    if not duplicates_2[duplicates_2['IMDb_title_ID']==i].empty :
        for index, row in duplicates_2[duplicates_2['IMDb_title_ID']==i].iterrows() :
            year_d = pd.to_datetime(row['Movie_release_date'], errors = 'coerce')
            year_x = pd.to_datetime(row['Release_date'], errors = 'coerce')
            no_match_wiki = row['Wikipedia_movie_ID']
            no_match_imdb = row['IMDb_title_ID']
            if str(float(row['Movie_runtime']))==str(float(row['Runtime'])) and year_d.year==year_x.year :
                test_2 = pd.concat([test_2, row.to_frame().T], ignore_index=True)
                test3 = pd.concat([test3, row.to_frame().T], ignore_index=True)
                break
            elif year_d.year==year_x.year :
                test2 = pd.concat([test2, row.to_frame().T], ignore_index=True)
        if test2.empty and test3.empty :
            no_match_found_empty_2_wiki.append(no_match_wiki)
            no_match_found_empty_2_imdb.append(no_match_imdb)
        elif test2.empty : 
            osef+=1
        elif not test3.empty: 
            test2 = pd.DataFrame()
        elif not test2['IMDb_title_ID'].is_unique and test3.empty:
            test2 = pd.DataFrame()
            no_match_found_2_wiki.append(no_match_wiki)
            no_match_found_2_imdb.append(no_match_imdb)
        test_2 = pd.concat([test_2, test2], ignore_index=True)
    else : 
        count_2 +=1

In [51]:
print(count_2)
print(len(no_match_found_2_wiki))
print(len(no_match_found_empty_2_wiki))
print(len(no_match_found_2_imdb))
print(len(no_match_found_empty_2_imdb))
print(test_2.shape)
test_2.head(10)

442370
29
99
29
99
(534, 14)


Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres,IMDb_title_ID,Release_date,Runtime,Primary_title,Original_title
0,18037145,/m/047sr_y,The Battle of Gettysburg,1913-06-01,,50.0,"{""/m/06ppq"": ""Silent film"", ""/m/02h40lc"": ""Eng...","{""/m/09c7w0"": ""United States of America""}","{""/m/06ppq"": ""Silent film"", ""/m/07s9rl0"": ""Dra...",tt0002669,1913,48.0,The Battle of Gettysburg,The Battle of Gettysburg
1,32172436,/m/0gwygmm,Enoch Arden,1914,,,{},{},"{""/m/07s9rl0"": ""Drama""}",tt0003886,1914,,Enoch Arden,Enoch Arden
2,15130491,/m/03hjgx1,The Man Without a Country,1917-09-09,,60.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/07s9rl0"": ""Drama"", ""/m/0219x_"": ""Indie""}",tt0008259,1917,60.0,The Man Without a Country,The Man Without a Country
3,14335129,/m/03d0whx,Back to the Woods,1918-11,,,"{""/m/06ppq"": ""Silent film"", ""/m/02h40lc"": ""Eng...","{""/m/09c7w0"": ""United States of America""}","{""/m/02hmvc"": ""Short Film"", ""/m/06ppq"": ""Silen...",tt0008861,1918,50.0,Back to the Woods,Back to the Woods
4,4791224,/m/0cnh99,Tarzan of the Apes,1918,,73.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06ppq"": ""Silent film"", ""/m/02kdv5l"": ""Act...",tt0009682,1918,73.0,Tarzan of the Apes,Tarzan of the Apes
5,26943525,/m/0bm8_br,Under the Greenwood Tree,1918-12-08,,,"{""/m/06ppq"": ""Silent film""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06ppq"": ""Silent film""}",tt0009744,1918,50.0,Under the Greenwood Tree,Under the Greenwood Tree
6,13690044,/m/03cf5nt,J'accuse!,1919-04-25,,166.0,"{""/m/064_8sq"": ""French Language"", ""/m/06ppq"": ...","{""/m/0f8l9c"": ""France""}","{""/m/0jb4p32"": ""Zombie Film"", ""/m/06ppq"": ""Sil...",tt0010307,1919,166.0,J'accuse!,J'accuse
7,5307081,/m/0ddp_r,The Lost Battalion,1919-07-02,,,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06ppq"": ""Silent film"", ""/m/07s9rl0"": ""Dra...",tt0010386,1919,70.0,The Lost Battalion,The Lost Battalion
8,34459451,/m/0h_c4hx,Bleak House,1920-01-15,,,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/06ppq"": ""Sil...",tt0011001,1920,,Bleak House,Bleak House
9,33654948,/m/0hgn1yv,The Elusive Pimpernel,1919-12,,,{},"{""/m/07ssc"": ""United Kingdom""}","{""/m/06ppq"": ""Silent film"", ""/m/07s9rl0"": ""Dra...",tt0011149,1919,,The Elusive Pimpernel,The Elusive Pimpernel


In [52]:
# add new dataset in first_merge: 

#on verifie que tout est ok avant de merge 

In [60]:
print(f"Tout est ok avec second_test ? {second_test['Wikipedia_movie_ID'].is_unique} and {second_test['IMDb_title_ID'].is_unique}, il a une taille de {second_test.shape[0]}")
print(f"Tout est ok avec test_2 ? {test_2['Wikipedia_movie_ID'].is_unique} and {test_2['IMDb_title_ID'].is_unique}, il a une taille de {test_2.shape[0]}")
print(f"Tout est ok avec second_dataset ? {second_dataset['Wikipedia_movie_ID'].is_unique} and {second_dataset['IMDb_title_ID'].is_unique}, il a une taille de {second_dataset.shape[0]}")

Tout est ok avec second_test ? True and True, il a une taille de 12762
Tout est ok avec test_2 ? True and True, il a une taille de 534
Tout est ok avec second_dataset ? True and True, il a une taille de 38705


In [61]:
#add them to our big dataset 
final_dataset = pd.DataFrame()
final_dataset = pd.concat([final_dataset, second_dataset], ignore_index=True)
final_dataset = pd.concat([final_dataset, second_test], ignore_index=True)
final_dataset = pd.concat([final_dataset, test_2], ignore_index=True)
print(f"Tout est ok avec final_dataset ? {final_dataset['Wikipedia_movie_ID'].is_unique} and {final_dataset['IMDb_title_ID'].is_unique}, il a une taille de {final_dataset.shape[0]}")

Tout est ok avec final_dataset ? True and True, il a une taille de 52001


In [78]:
# petit résumé de ce que j'ai fait : 
print(f"Au premier merge, j'avais un dataset de taille : {merge_CMU_IMDb.shape[0]}, qui contenait des duplicatas et il n'était pas unique")
print(f"Si je supprimais tous les duplicatas j'avais un dataset de {second_dataset.shape[0]}")
print(f"Dans le process, j'ai du éliminer {len(no_match_found_2_wiki)+len(no_match_found_wiki)+len(no_match_found_empty_wiki)+len(no_match_found_empty_2_wiki)} lignes, mais j'ai gagné {test_2.shape[0]+second_test.shape[0]} lignes")
print(f"Au final, mon dataset a une taille de {final_dataset.shape[0]}, et au départ le dataset CMU a une taille de {df_CMU_movies.shape[0]}")
print(f"On a donc perdu environ : {100-(final_dataset.shape[0]/df_CMU_movies.shape[0])*100}, %, ce qui est mieux que >50%")
print(f"Si on ajoute a notre taille finale la taille de ce que l'on a du enlever, on obtient que : {final_dataset.shape[0]+len(no_match_found_2_wiki)+len(no_match_found_wiki)+len(no_match_found_empty_wiki)+len(no_match_found_empty_2_wiki)}, ce qui veut dire qu'il y a eu {100-((final_dataset.shape[0]+len(no_match_found_2_wiki)+len(no_match_found_wiki)+len(no_match_found_empty_wiki)+len(no_match_found_empty_2_wiki))/df_CMU_movies.shape[0])*100} % de perte que car il y a des noms de films qui ne matchent pas, voir discussion ci dessous")

Au premier merge, j'avais un dataset de taille : 116158, qui contenait des duplicatas et il n'était pas unique
Si je supprimais tous les duplicatas j'avais un dataset de 38705
Dans le process, j'ai du éliminer 4781 lignes, mais j'ai gagné 13296 lignes
Au final, mon dataset a une taille de 52001, et au départ le dataset CMU a une taille de 81741
On a donc perdu environ : 36.383210384017815, %, ce qui est mieux que >50%
Si on ajoute a notre taille finale la taille de ce que l'on a du enlever, on obtient que : 56782, ce qui veut dire qu'il y a eu 30.534248418786163 % de perte que car il y a des noms de films qui ne matchent pas, voir discussion ci dessous


In [None]:
# ok, j'arrive a mon max, j'ai du tej des lignes dans mes boucles 
# car ca veut dire que pour un nom de film similaire, les deux 
# datasets CMU et IMDb ont des films différents, mais pas le même 
# (ou en tout cas pas similaire au niveau de la date et runtime, ce qui est très bizarre 
# si on perd des films comme cela c'est la vie) 
# l'autre gross perte correspond au fait que lors du premier merge, il y a des noms de 
# films qui ne matchent pas les deux données, et c'est la qu'on perd le plus et je pense
# que on peut gagner le plus si qqn trouve une idée de génie
# par exemple, j'ai remarqué que CMU donne se titre de film 
#"Indiana Jones and the Raiders of the Lost Ark"
# alors que IMDb donne celui la 
# "The Raiders of the Lost Ark" 
# du coup si qqn trouve une idée pour les mettres ensemble je suis preneur, 
# j'ai pensé au query ou de mémoire, avec des mots cléfs il peut retrouver, donc si 
# on transforme les datasets ca peut etre une bonne idée, a voir si possible 
# une autre idée serait de faire du ML, avec le final_dataset pour train les datas et ensuite 
# on lui demande de merger ce qui reste, ca peut etre pas dégueu, de nouveau a voir la 
# feasabilité 