## Merge of the dataset of oscar and movies budet/popu/rating 
Oscar dataset from kaggle : https://www.kaggle.com/datasets/unanimad/the-oscar-award

IMDb Data with ratings, popularity, budget, revenue etc : https://datasets.imdbws.com/


In [2]:
import pandas as pd

In [3]:
# oscars dataset : drop useless columns, change name to match the other datasets, keep only 4 categories of nominations
oscars = pd.read_csv('data/the_oscar_award.csv')
oscars = oscars[oscars['category'].str.contains('ACTOR|ACTOR IN A SUPPORTING ROLE|ACTRESS|ACTRESS IN A SUPPORTING ROLE|DIRECTING', regex=True)]
oscars.rename(columns={'film': 'movie_name'}, inplace=True)
oscars.rename(columns={'year_film': 'Movie release date'}, inplace=True)
oscars

Unnamed: 0,Movie release date,year_ceremony,ceremony,category,name,movie_name,winner
0,1927,1928,1,ACTOR,Richard Barthelmess,The Noose,False
1,1927,1928,1,ACTOR,Emil Jannings,The Last Command,True
2,1927,1928,1,ACTRESS,Louise Dresser,A Ship Comes In,False
3,1927,1928,1,ACTRESS,Janet Gaynor,7th Heaven,True
4,1927,1928,1,ACTRESS,Gloria Swanson,Sadie Thompson,False
...,...,...,...,...,...,...,...
10800,2023,2024,96,DIRECTING,Justine Triet,Anatomy of a Fall,False
10801,2023,2024,96,DIRECTING,Martin Scorsese,Killers of the Flower Moon,False
10802,2023,2024,96,DIRECTING,Christopher Nolan,Oppenheimer,True
10803,2023,2024,96,DIRECTING,Yorgos Lanthimos,Poor Things,False


In [4]:
# need to have only one row per movie: addition of a 'num_nominations' column to count the number of nominations per movie, 
# and a 'winner' column to know if the movie won the category and by who --> will be matched with the actors we are interested in

oscars_to_merge = oscars.groupby(['movie_name'], as_index=False).agg({
    'Movie release date': 'first',
    'category': lambda x: ', '.join(f"{cat}: {name}" for cat, name in zip(x, oscars.loc[x.index, 'name'])),
    'winner': lambda x: ', '.join(oscars.loc[x.index, 'name'][oscars.loc[x.index, 'winner']]),
    'category': 'count'
}).rename(columns={'category': 'num_nominations'})

oscars_to_merge

Unnamed: 0,movie_name,Movie release date,num_nominations,winner
0,'Round Midnight,1986,1,
1,...And Justice for All,1979,1,
2,12 Angry Men,1957,1,
3,12 Monkeys,1995,1,
4,12 Years a Slave,2013,4,Lupita Nyong'o
...,...,...,...,...
1298,You're a Big Boy Now,1966,1,
1299,Z,1969,1,
1300,Zero Dark Thirty,2012,1,
1301,Zorba the Greek,1964,3,Lila Kedrova


In [5]:
# tmdb dataset : drop useless columns, change name to match the other datasets

data_tmdb = pd.read_csv('data/movie_data_tmbd.csv', sep='|')
data_tmdb.drop(['adult', 'id', 'imdb_id', 'backdrop_path', 'belongs_to_collection', 'genres', 'homepage', 'original_language', 'original_title', 'overview', 'runtime', 'spoken_languages', 'video', 'poster_path', 'production_companies', 'production_countries', 'tagline'], axis=1, inplace=True)

data_tmdb.rename(columns={'revenue': 'revenue_tmdb'}, inplace=True)
data_tmdb.rename(columns={'title': 'movie_name'}, inplace=True)
data_tmdb.rename(columns={'release_date': 'Movie release date'}, inplace=True)

data_tmdb[['movie_name', 'Movie release date']]

Unnamed: 0,movie_name,Movie release date
0,The Elusive Corporal,1962-05-23
1,Sundays and Cybele,1962-11-12
2,Lonely Are the Brave,1962-05-24
3,F for Fake,1975-03-12
4,Long Day's Journey Into Night,1962-10-09
...,...,...
119933,The Enemies,2017-05-16
119934,The Last Rodriguez,
119935,Space,2020-03-31
119936,Room 441,2020-02-14


In [6]:
# CMU Movie dataset : kept all columns, change name to match the other datasets

original_data = pd.read_csv('data/movie.metadata.tsv', sep='\t', names= ['Wikipedia movie ID', 'Freebase movie ID', 'Movie name', 'Movie release date', 'Movie box office revenue', 'Movie runtime', 'Movie languages', 'Movie countries', 'Movie genres'])
original_data.rename(columns={'Movie name': 'movie_name'}, inplace=True)

original_data[['movie_name', 'Movie release date']]


Unnamed: 0,movie_name,Movie release date
0,Ghosts of Mars,2001-08-24
1,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16
2,Brun bitter,1988
3,White Of The Eye,1987
4,A Woman in Flames,1983
...,...,...
81736,Mermaids: The Body Found,2011-03-19
81737,Knuckle,2011-01-21
81738,Another Nice Mess,1972-09-22
81739,The Super Dimension Fortress Macross II: Lover...,1992-05-21


In [None]:
# merge the 3 datasets always on the original_data dataset rows

merged_data = original_data.merge(data_tmdb, on=['movie_name', 'Movie release date'], how='left')
final_merged_data = merged_data.merge(oscars_to_merge, on='movie_name', how='left')

final_merged_data.to_csv('data/final_dataset.csv', index=False)




(1303, 4)
(119938, 10)
(81741, 9)
(81742, 17)
(81742, 20)
