In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import requests

## DATA SET MERGED

In this Jupyter Lab notebook, all necessary data from external datasets will be merged with the given dataset to support the analysis.


In [4]:
# Loading the original dataset 
data_folder = '/Users/mariacherchouri/Documents/MA3/ADA/ADA2024/ProjetADA/Data/'

# Headers 
column_names = ['Wiki_ID', 'Movie_ID', 'title','release_date', 'BoxOfficeRevenue', 'Runtime', 'Languages', 'Countries', 'Genres' ]

df = pd.read_csv(data_folder + 'movie.metadata.tsv', sep='\t', names=column_names, header=None)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/mariacherchouri/Documents/MA3/ADA/ADA2024/ProjetADA/Data/movie.metadata.tsv'

In [None]:
df.head(2)

Unnamed: 0,Wiki_ID,Freebase_ID,title,release_date,BoxOfficeRevenue,Runtime,Languages,Countries,Genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."


The values in the columns "Languages," "Countries," and "Genres" are structured as dictionaries—for example, `{"/m/02h40lc": "English Language"}`, where `"/m/02h40lc"` is the key and `"English Language"` is the value. To simplify future use of these values, they will be converted into lists of strings.

In [None]:
# Extract the values inside the second set of quotes for each key-value pair
def extract_values(_str):
    return re.findall(r': "([^"]+)"', _str)

# Apply the extraction function to the relevant columns
df['Languages'] = df['Languages'].apply(extract_values)
df['Countries'] = df['Countries'].apply(extract_values)
df['Genres'] = df['Genres'].apply(extract_values)

# Displaying the DataFrame to see the result
df.head(2)

Unnamed: 0,Wiki_ID,Freebase_ID,title,release_date,BoxOfficeRevenue,Runtime,Languages,Countries,Genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,[English Language],[United States of America],"[Mystery, Biographical film, Drama, Crime Drama]"


# IMDb Datasets

The 'title.ratings.tsv' and 'title.akas.tsv' datasets come from the IMDb Non-Commercial Datasets, available at https://developer.imdb.com/non-commercial-datasets/. 

The 'title.ratings.tsv' file contains:
- tconst (string): alphanumeric unique identifier of the title
- averageRating: weighted average of all the individual user ratings
- numVotes: number of votes the title has received

and the 'title.akas.tsv' file contains:

- titleId (string): a tconst, an alphanumeric unique identifier of the title
- ordering (integer): a number to uniquely identify rows for a given titleId
- title (string): the localized title
- region (string):the region for this version of the title
- language (string): the language of the title
- types (array): Enumerated set of attributes for this alternative title. One or more of the following: "alternative", "dvd", "festival", "tv", "video", "working", "original", "imdbDisplay". New values may be added in the future without warning
- attributes (array): Additional terms to describe this alternative title, not enumerated
- isOriginalTitle (boolean): 0: not original title; 1: original title,

These datasets must be merged to connect the 'title' with its corresponding 'averageRating' using the 'titleId'/'tconst'. Once merged, this data will be integrated into the initial dataset. 
IMDb rating data is essential for creating a metric called 'SuccessMetric,' which will be used to classify a movie's success.

In [None]:
#Load IMDb Datasets
data_folder = '/Users/mariacherchouri/Documents/MA3/ADA/ADA2024/ProjetADA/Data/'
df_rating = pd.read_csv(data_folder + 'title.ratings.tsv', sep='\t')
df_rating_title = pd.read_csv(data_folder + 'title.akas.tsv', sep='\t')

In [None]:
# Displaying the DataFrame
df_rating.head(2)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2096
1,tt0000002,5.6,282


In [None]:
#Displaying the DataFrame
df_rating_title.head(2)

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Carmencita,\N,\N,original,\N,1
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0


In [None]:
df_rating = df_rating.rename(columns={'tconst': 'titleId'})
df_rating.head(2)

Unnamed: 0,titleId,averageRating,numVotes
0,tt0000001,5.7,2096
1,tt0000002,5.6,282


In [None]:
# Check if there are duplicate movies with the same title ID
# This ensures that each title ID is associated with only one movie entry
df_rating_title[df_rating_title['titleId']=='tt0228333']

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
1493874,tt0228333,1,Ghosts of Mars,\N,\N,original,\N,1
1493875,tt0228333,10,Ghosts of Mars,SE,\N,imdbDisplay,\N,0
1493876,tt0228333,11,Ghosts of Mars,US,\N,imdbDisplay,\N,0
1493877,tt0228333,12,Ghosts of Mars,ZA,en,imdbDisplay,\N,0
1493878,tt0228333,13,Fantasmas de Marte,BR,\N,\N,\N,0
1493879,tt0228333,14,Fantasmas de Marte,MX,\N,\N,\N,0
1493880,tt0228333,15,Fantasmas de Marte,PE,\N,imdbDisplay,\N,0
1493881,tt0228333,16,Fantasmas de Marte,US,es,imdbDisplay,\N,0
1493882,tt0228333,17,A Mars szelleme,HU,\N,\N,\N,0
1493883,tt0228333,18,Apeili ston Ari,GR,\N,\N,transliterated title,0


In [None]:
# For each title ID, keep only the original title.
df_rating_title=df_rating_title[df_rating_title['isOriginalTitle'] == 1.0]

In [None]:
# Count the occurrences of each unique value in the title 
value_count = df_rating_title['title'].value_counts()
print(value_count)

title
Episode #1.1                   53140
Episode #1.2                   47989
Episode #1.3                   45430
Episode #1.4                   42165
Episode #1.5                   38656
                               ...  
Astronomy in World Religion        1
Does Hers Like Pumpkin?            1
Sticking to the Script             1
Slung Your Time                    1
Horrid Henry Knows It All          1
Name: count, Length: 5043401, dtype: int64


It seems that there are still duplicates...

In [None]:
# Count the occurrences of each unique value in the titleId
value_count1 = df_rating_title['titleId'].value_counts()
print(value_count1)

titleId
tt3880980     2
tt0000001     1
tt31107355    1
tt31107345    1
tt31107346    1
             ..
tt15067248    1
tt1506725     1
tt15067252    1
tt15067254    1
tt9916880     1
Name: count, Length: 11178939, dtype: int64


Apart from one movie, the duplicates appear to be based on the title rather than the title ID. 
This suggests that they could be different movies with the same title, rather than exact duplicates. 

Let's focus on observing the exception.

In [None]:
df_rating_title[df_rating_title['titleId'] == 'tt3880980']

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
38672995,tt3880980,1,Deadly Dreams,\N,\N,original,\N,1
38672996,tt3880980,2,マックのハッスル刑事\tJP\tja\timdbDisplay\t\N\t0\ntt3880...,\N,\N,original,\N,1


It seems to be the same movie in this case, but with different titles considered as each the original title.

In [None]:
#Merged title and rates
df_merged = pd.merge(df_rating, df_rating_title, on='titleId', how='inner')

In [None]:
df_merged

Unnamed: 0,titleId,averageRating,numVotes,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,5.7,2096,1,Carmencita,\N,\N,original,\N,1
1,tt0000002,5.6,282,1,Le clown et ses chiens,\N,\N,original,\N,1
2,tt0000003,6.5,2114,1,Pauvre Pierrot,\N,\N,original,\N,1
3,tt0000004,5.4,182,1,Un bon bock,\N,\N,original,\N,1
4,tt0000005,6.2,2844,1,Blacksmith Scene,\N,\N,original,\N,1
...,...,...,...,...,...,...,...,...,...,...
1492858,tt9916730,7.0,12,1,6 Gunn,\N,\N,original,\N,1
1492859,tt9916766,7.1,24,1,Episode #10.15,\N,\N,original,\N,1
1492860,tt9916778,7.2,37,1,Escape,\N,\N,original,\N,1
1492861,tt9916840,6.9,11,1,Horrid Henry's Comic Caper,\N,\N,original,\N,1


In [None]:
value_count1 = df_merged['titleId'].value_counts()
print(value_count1)

titleId
tt0000001     1
tt2565978     1
tt2565972     1
tt2565964     1
tt2565962     1
             ..
tt1055718     1
tt1055717     1
tt10557162    1
tt1055716     1
tt9916880     1
Name: count, Length: 1492863, dtype: int64


There are no more duplicates based on the title ID. However, to differentiate between movies with the same title, additional information that varies between these movies is needed.   
The release year will be used for this purpose.

To achieve this, the 'title.basics.tsv'file from the same IMDb webpage as before will be used to add the release year.  
The 'title.basics.tsv' file contains:
- tconst (string): alphanumeric unique identifier of the title
- titleType (string): the type/format of the title (e.g. movie, short, tvseries, tvepisode, video, etc)
- primaryTitle (string): the more popular title / the title used by the filmmakers on promotional materials at the point of release
- originalTitle (string): original title, in the original language
- isAdult (boolean): 0: non-adult title; 1: adult title
- startYear (YYYY): represents the release year of a title. In the case of TV Series, it is the series start year
- endYear (YYYY): TV Series end year. '\N' for all other title types
- runtimeMinutes: primary runtime of the title, in minutes
- genres (string array): includes up to three genres associated with the title

In [None]:
# Load the new dataset
df_date = pd.read_csv(data_folder + 'title.basics.tsv', sep='\t')

  df_date = pd.read_csv(data_folder + 'title.basics.tsv', sep='\t')


In [None]:
# Merged the two datasets
df_date = df_date.rename(columns={'tconst': 'titleId'})
df_merged_final = pd.merge(df_merged, df_date, on='titleId', how='inner')

In [None]:
# Observed the final dataset 
df_merged_final.head(2)

Unnamed: 0,titleId,averageRating,numVotes,ordering,title,region,language,types,attributes,isOriginalTitle,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,5.7,2096,1,Carmencita,\N,\N,original,\N,1,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,5.6,282,1,Le clown et ses chiens,\N,\N,original,\N,1,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"


The final IMDb dataset will now be merged with the original dataset, utilizing the normalized title and release year to avoid any confusion.

In [None]:
df['Normalized_Title'] = df['title'].str.replace(" ", "").str.lower()
df_merged_final['Normalized_Title'] = df_merged_final['title'].str.replace(" ", "").str.lower()

In [None]:
df['releaseYear'] = df['release_date'].astype(str).str[:4]
df_merged_final['releaseYear'] = df_merged_final['startYear'].astype(str).str[:4]

In [None]:
df_merged_final['titleType'].unique()

array(['short', 'movie', 'tvShort', 'tvMovie', 'tvEpisode', 'tvSeries',
       'tvMiniSeries', 'tvSpecial', 'video', 'videoGame'], dtype=object)

Examining the 'titleType' category reveals entries labeled as 'videoGame' and 'tvSpecial'.  
Since these are not considered cinema categories, they will be removed from the dataset.

In [None]:
df_merged_final = df_merged_final[~df_merged_final['titleType'].isin(['videoGame', 'tvSpecial'])]

In [None]:
# Merged the original dataset with the treated IMDb dataset
df_IMDb = pd.merge(df, df_merged_final , on=['Normalized_Title', 'releaseYear'], how='inner')

In [None]:
# Check for duplicate titles
print(df_IMDb['title_x'].value_counts())

title_x
Home                       23
Love                       21
Legacy                     19
Alice in Wonderland        18
Macbeth                    18
                           ..
Naan                        1
Muqaddar Ka Badshaah        1
The Vanishing Rider         1
The Great Train Robbery     1
Another Nice Mess           1
Name: count, Length: 43429, dtype: int64


In [None]:
#Verify for one duplicate title whether duplicates also exist for the release year.
df_IMDb[df_IMDb['title_x'] == 'Macbeth']

Unnamed: 0,Wiki_ID,Freebase_ID,title_x,release_date,BoxOfficeRevenue,Runtime,Languages,Countries,Genres,Normalized_Title,...,attributes,isOriginalTitle,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
5202,16786364,/m/04062sb,Macbeth,1954-11-28,,103.0,[English Language],"[United States of America, United Kingdom]",[Comedy-drama],macbeth,...,\N,1,tvMovie,Macbeth,Macbeth,0,1954,\N,103,Drama
14810,15097368,/m/03hh969,Macbeth,1909-11-27,,16.0,[Italian Language],[Italy],"[Short Film, Silent film]",macbeth,...,\N,1,short,Macbeth,Macbeth,0,1909,\N,\N,"Drama,Short"
14811,15097368,/m/03hh969,Macbeth,1909-11-27,,16.0,[Italian Language],[Italy],"[Short Film, Silent film]",macbeth,...,\N,1,short,Macbeth,Macbeth,0,1909,\N,\N,"Drama,Short"
20206,15097470,/m/03hh99d,Macbeth,1913,,,[],[Germany],"[Silent film, Indie]",macbeth,...,\N,1,movie,Macbeth,Macbeth,0,1913,\N,50,"Crime,Drama"
20855,6652508,/m/0gg5jd,Macbeth,2006-09-21,,109.0,[English Language],"[Australia, New Zealand]","[Crime Fiction, Thriller, Indie, World cinema,...",macbeth,...,\N,1,movie,Macbeth,Macbeth,0,2006,\N,109,"Crime,Drama,Fantasy"
20856,6652508,/m/0gg5jd,Macbeth,2006-09-21,,109.0,[English Language],"[Australia, New Zealand]","[Crime Fiction, Thriller, Indie, World cinema,...",macbeth,...,\N,1,movie,Macbeth,Macbeth,0,2006,\N,118,Drama
20857,6652508,/m/0gg5jd,Macbeth,2006-09-21,,109.0,[English Language],"[Australia, New Zealand]","[Crime Fiction, Thriller, Indie, World cinema,...",macbeth,...,\N,1,tvMovie,Macbeth,Macbeth,0,2006,\N,156,Music
24236,1966825,/m/069mfr,Macbeth,1948-10-01,,107.0,[English Language],[United States of America],"[Costume drama, Drama]",macbeth,...,\N,1,movie,Macbeth,Macbeth,0,1948,\N,107,"Drama,History,War"
30944,533378,/m/02mfrg,Macbeth,1971-10-13,,141.0,[English Language],"[United States of America, United Kingdom]","[Costume drama, Drama, Film adaptation]",macbeth,...,\N,1,tvMovie,Macbeth,Macbeth,0,1971,\N,60,Drama
32865,15097405,/m/03hh97b,Macbeth,1911-04,,14.0,[Silent film],[United Kingdom],"[Short Film, Silent film, Drama, Indie]",macbeth,...,\N,1,short,Macbeth,Macbeth,0,1911,\N,14,"Drama,Short"


Duplicate entries for year and title were observed, corresponding to different ratings, possibly from different webpages for the same movie. To address this, a weighted average will be calculated.

In [None]:
# Compute the average rating
df_IMDb['weighted_rating'] = df_IMDb['averageRating'] * df_IMDb['numVotes']

df_IMDb_f = df_IMDb.groupby(['Normalized_Title', 'releaseYear']).agg(
    Wiki_ID = ('Wiki_ID', 'first'),
    Freebase_ID = ('Freebase_ID', 'first'),
    title=('title_x', 'first'),  
    totRating=('weighted_rating', 'sum'),
    numVotes=('numVotes', 'sum'),
    release_date = ('release_date', 'first'),
    Runtime =  ('Runtime', 'first'),
    BoxOfficeRevenue = ('BoxOfficeRevenue', 'first'),
    Languages = ('Languages', 'first' ),
    Countries = ('Countries', 'first'),
    Genres=('Genres', 'first')

).reset_index()

df_IMDb_f['Rating'] = df_IMDb_f['totRating'] / df_IMDb_f['numVotes']

df_IMDb_f= df_IMDb_f.drop(columns=['totRating'])

In [None]:
df_IMDb_f.head(2)

Unnamed: 0,Normalized_Title,releaseYear,Wiki_ID,Freebase_ID,title,numVotes,release_date,Runtime,BoxOfficeRevenue,Languages,Countries,Genres,Rating
0,#1cheerleadercamp,2010,30332673,/m/0crs0hx,#1 Cheerleader Camp,3424,2010,90.0,,[],[United States of America],"[Sports, Sex comedy, Comedy film, Comedy, Teen]",3.7
1,$,1971,4213160,/m/0bq8q8,$,2948,1971-12-17,119.0,,[English Language],[United States of America],"[Crime Fiction, Heist, Action/Adventure, Thril...",6.3


In [None]:
# Save the DataFrame to a new CSV file
df_IMDb_f.to_csv('IMDb.csv', index=False)

# BechdelTest Dataset

The 'df_bechdeltest' DataFrame is sourced from http://bechdeltest.com/api/v1/getAllMovies Web page, which provides data on movies rated based on the Bechdel Test.

The Bechdel Test is a measure of the representation of women in films. A movie passes the test if it meets three criteria:
1. It features at least two named female characters,
2. who talk to each other,
3. about something other than a man.

While passing the Bechdel Test doesn’t fully capture the complexity of female representation, it offers a useful baseline for understanding gender dynamics in cinema.
This dataset will be used to observe whether the roles of women in films are significant and to analyze how this representation has evolved over the years.


In [None]:
url = 'http://bechdeltest.com/api/v1/getAllMovies'

response = requests.get(url)

if response.status_code == 200:
    print('Request was succesful')
    movie_data = response.json()
    df_bechdel = pd.DataFrame(movie_data)
    print(df_bechdel.sample())

else:
    print('Request failed')

Request was succesful
        id title  rating   imdbid  year
7886  5862  Fury       1  2713180  2014


In [None]:
df_bechdel.sample(2)

Unnamed: 0,id,title,rating,imdbid,year
9672,9008,Birds of Prey,3,7713068,2020
5755,3711,Defiance,3,1034303,2008


In [None]:
df_bechdel['Normalized_Title'] = df_bechdel['title'].str.replace(" ", "").str.lower()

In [None]:
df['year'] = df['release_date'].astype(str).str[:4]

In [None]:
df_bechdel['year'] = df_bechdel['year'].astype(str)

In [None]:
# Merge the original dataset with the Bechdel Test rating dataset 
df_bech_merged = pd.merge(df_bechdel,df , on=['Normalized_Title', 'year'], how='inner')

In [None]:
# Save the DataFrame to a new CSV file
df_bech_merged.to_csv('bechdeltest.csv', index=False)

The IMDb rating dataset is merged with the Bechdel Test dataset to combine information about movie ratings with data on gender representation in films. 

This allows for a more comprehensive analysis of both the quality of the movies and their portrayal of female characters, based on the Bechdel Test criteria.


In [None]:
df_imdb_bech = pd.read_csv(data_folder + 'IMDb_data.csv')

In [None]:
df_imdb_bech = df_imdb_bech.rename(columns={'Year': 'year'})

In [None]:
df_imdb_bech['year'] = df_imdb_bech['year'].astype(str)

In [None]:
# Merge the IMDb dataset with the Bechdel Test rating dataset 
df_bech_imdb_merged = pd.merge(df_bechdel,df_imdb_bech, on=['Normalized_Title', 'year'], how='inner')

In [None]:
# Rename and remove some columns to make the DataFrame clearer
df_bech_imdb_merged = df_bech_imdb_merged.rename(columns={'rating': 'ratingBechtest'})
df_bech_imdb_merged = df_bech_imdb_merged.loc[:, ~df_bech_imdb_merged.columns.str.endswith('_x')]
df_bech_imdb_merged.columns = df_bech_imdb_merged.columns.str.replace('_y$', '', regex=True)

In [5]:
# Save the DataFrame to a new CSV file
df_bech_imdb_merged.to_csv('whole_data.csv', index=False)

NameError: name 'df_bech_imdb_merged' is not defined

Mahdi's part

In [None]:
# In order to get the information on the budget of movie, we searched for a dataset online containing this information.
# We found on this website : https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset a dataset that might be a good fit 
# This dataset is a file called movies_metadata_TMDB.csv and its collumns contains information on 45,000 movies featured in the Full MovieLens dataset.
# Features include posters, backdrops, budget, revenue, release dates, languages, production countries and companies.
budget_data = pd.read_csv(data_folder+'movies_metadata_TMDB.csv')

# We can drop duplicates by Wiki_ID in our original dataset since it will bias the results
df= df.drop_duplicates(subset='Wiki_ID')

df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/Users/mariacherchouri/Documents/MA3/ADA/ADA2024/ProjetADA/Data/movies_metadata_TMDB.csv'

In [None]:
# remove the lines where the budget is equal to 0
budget_data['budget'] = pd.to_numeric(budget_data['budget'], errors='coerce')
budget_data_clean = budget_data[budget_data['budget'].notna() & (budget_data['budget'] != 0)]

# removing the duplicates from this dataset also by the imdb_ID
budget_data_clean = budget_data_clean.drop_duplicates(subset='imdb_id')

budget_data_clean.head()

In [None]:
# We want first of all to ensure that the release_date column is in string format for both datasets
df['release_date'] = df['release_date'].astype(str)
budget_data_clean['release_date'] = budget_data_clean['release_date'].astype(str)

# Extracting the year part safely, cleaning it for the merge
df['Year'] = df['release_date'].str[:4]
df['Year'] = pd.to_numeric(df['Year'], errors='coerce').astype('Int64')

budget_data_clean['Year'] = budget_data_clean['release_date'].str[:4]
budget_data_clean['Year'] = pd.to_numeric(budget_data_clean['Year'], errors='coerce').astype('Int64')

In [None]:
# now we want to merge the two data sets by the normalized title of the movie and the normalized release year, 
# then add the budget + vote_average to the movie dataset


df['Normalized_Title'] = df['title'].str.replace(" ", "").str.lower()
budget_data_clean['Normalized_Title'] = budget_data_clean['title'].str.replace(" ", "").str.lower()


merged_data_budget = df.merge(budget_data_clean[['Normalized_Title', 'Year', 'budget','vote_average']], 
                              on=['Normalized_Title', 'Year'], 
                              how='inner')


merged_data_budget.shape

In [None]:
# Now we use the IMDb dataset to get the ratings 
# since we also have ratings in the budget dataset, we take the mean of both of them 

df_IMDb.rename(columns={'BoxOfficeRevenue':'BoxOfficeRevenueImdb'}, inplace=True)


df_IMDb['Normalized_Title'] = df_IMDb['title'].str.replace(" ", "").str.lower()



merged_data_imdb = merged_data_budget.merge(df_IMDb[['Normalized_Title','BoxOfficeRevenueImdb','Rating','Year']],
                                            on=['Normalized_Title','Year'], 
                                            how='inner')

merged_data_imdb.shape

In [None]:
# renaming 
merged_data_imdb_clean = merged_data_imdb

# now for all the lines where there is a value for BoxOfficeRevenueImdb and not for BoxOfficeRevenue we will copy the value from BoxOfficeRevenueImdb to BoxOfficeRevenue and vice versa
merged_data_imdb_clean['BoxOfficeRevenue'] = merged_data_imdb_clean['BoxOfficeRevenue'].fillna(merged_data_imdb_clean['BoxOfficeRevenueImdb'])
merged_data_imdb_clean['BoxOfficeRevenueImdb'] = merged_data_imdb_clean['BoxOfficeRevenueImdb'].fillna(merged_data_imdb_clean['BoxOfficeRevenue'])

# now for all the lines where there is a value for BoxOfficeRevenueImdb and for BoxOfficeRevenue we will calculate the difference in percentage between the two values and put it in a new column called BoxOfficeRevenueDifference
def percentage_difference(X, Y):                                           

    return np.abs((X-Y))/np.maximum(X,Y) * 100

merged_data_imdb_clean['BoxOfficeRevenueDifference'] = percentage_difference(merged_data_imdb_clean['BoxOfficeRevenue'], merged_data_imdb_clean['BoxOfficeRevenueImdb'])

# print the mean of the BoxOfficeRevenueDifference column
# we do this step to ensure that there is no big difference between the revenues in both datasets 

print(merged_data_imdb_clean['BoxOfficeRevenueDifference'].mean())


merged_data_imdb_clean.sample(5)

In [None]:
# Removing again every duplicate line with the help of the Wiki_ID column
merged_data_imdb_clean = merged_data_imdb_clean.drop_duplicates(subset='Wiki_ID')


# adding a new column called Net_revenue that is  basically the difference between the BoxOfficeRevenue and the budget
merged_data_imdb_clean['Net_revenue'] = merged_data_imdb_clean['BoxOfficeRevenue'] - merged_data_imdb_clean['budget']


merged_data_imdb_clean.sample(5)

In [None]:

# Creating a new column 'pondered_rating' as the mean of 'Rating' and 'vote_average'
#merged_data_imdb_clean['pondered_rating'] = merged_data_imdb_clean[['Rating', 'vote_average']].mean(axis=1)

# removing a few useless columns for the final dataframe. 
#merged_data_imdb_clean.drop(columns=['vote_average', 'Rating'], inplace=True)
#merged_data_imdb_clean.drop(columns=['BoxOfficeRevenueImdb', 'Normalized_Title', 'BoxOfficeRevenueDifference'], inplace=True)


merged_data_imdb_clean.shape

#merged_data_imdb_clean.to_csv('movie_data_successmetric_rating.csv', index=False)