In [None]:
# AI chat in this link: https://gemini.google.com/share/e62a45d84646

In [None]:
import pandas as pd

# 1. Load the datasets
# Assuming the files are in the current working directory
oscar_df = pd.read_csv('the_oscar_nominees.csv')
imdb_df = pd.read_csv('imdb_top_1000.csv')

# 2. Filter Oscar nominees for movies from 1990 to the present
# We create a copy to avoid SettingWithCopy warnings
oscar_filtered = oscar_df[oscar_df['year_film'] >= 1990].copy()

# 3. Select the specific columns from the Oscar nominees list
oscar_subset = oscar_filtered[['year_film', 'release_date', 'year_ceremony', 'film', 'winner']]

# 4. Select the specific columns from the IMDB list
# We include 'Series_Title' to perform the merge.
# The join key is the title alone, so a title that appears more than once in
# the IMDB list would duplicate every matching Oscar row. Keep one IMDB row
# per title so the left join preserves the Oscar row count.
# NOTE(review): title-only matching can still pair a nominee with a different
# film that shares its name -- TODO consider also comparing the release year.
imdb_subset = (
    imdb_df[['Series_Title', 'Genre', 'Runtime', 'Meta_score']]
    .drop_duplicates(subset='Series_Title', keep='first')
)

# 5. Merge the lists
# We use a 'left' join to keep all Oscar nominees.
# Matching is done on the movie title ('film' in Oscar list, 'Series_Title' in IMDB list).
# validate='many_to_one' makes pandas raise if the IMDB side ever has repeated titles.
merged_df = pd.merge(
    oscar_subset,
    imdb_subset,
    left_on='film',
    right_on='Series_Title',
    how='left',
    validate='many_to_one',
)

# 6. Clean up
# Remove the extra title column from IMDB
merged_df = merged_df.drop(columns=['Series_Title'])

# Display the first few rows to verify
print(merged_df.head())

# Save the result to a new CSV file
merged_df.to_csv('merged_oscar_imdb.csv', index=False)

In [1]:
# The missing data was filled in manually and saved as 'best_picture_merged_complete.csv'

import pandas as pd

# Read the manually completed Best Picture list and the full Oscar nominee list
main_df = pd.read_csv('best_picture_merged_complete.csv')
oscar_df = pd.read_csv('the_oscar_nominees.csv')

# 1. Category labels that count as a Director / Picture nomination
director_cats = ['DIRECTING', 'BEST DIRECTOR']
picture_cats = ['BEST PICTURE']

# 2. Pull the (film, ceremony year) columns for each group of categories
nominee_cols = ['film', 'year_ceremony']
in_picture_cat = oscar_df['category'].isin(picture_cats)
in_director_cat = oscar_df['category'].isin(director_cats)

bp_nominees = oscar_df[in_picture_cat][nominee_cols]
dir_nominees = oscar_df[in_director_cat][nominee_cols]

# Sets of (film, year_ceremony) tuples give constant-time membership checks
bp_set = {
    (film, ceremony)
    for film, ceremony in zip(bp_nominees['film'], bp_nominees['year_ceremony'])
}
dir_set = {
    (film, ceremony)
    for film, ceremony in zip(dir_nominees['film'], dir_nominees['year_ceremony'])
}

# 3. Define a function to check if a movie is in both sets
def check_both(row):
    """Return True when the row's film was nominated for Picture and Director.

    The lookup key is the pair (row['film'], row['year_ceremony']); the result
    is True only if that pair is present in both `bp_set` and `dir_set`.
    """
    film_key = (row['film'], row['year_ceremony'])
    return film_key in bp_set and film_key in dir_set

# 4. Evaluate the check row by row and store it as a new column
nominated_both = main_df.apply(check_both, axis=1)
main_df['Nominated_Both_Director_and_Picture'] = nominated_both

# 5. Reorder: newest film year first, then film title A-Z within a year
sort_columns = ['year_film', 'film']
sort_ascending = [False, True]
main_df = main_df.sort_values(by=sort_columns, ascending=sort_ascending)

# Write the result without the index column
main_df.to_csv('best_picture_director_merged.csv', index=False)

In [4]:
import pandas as pd
import re

# 1. Read the Golden Globe awards and the list produced by the previous step
gg_df = pd.read_csv('golden_globe_awards.csv')
# Start from the base merged list
main_df = pd.read_csv('best_picture_director_merged.csv')

# 2. Keep only winning rows from the Picture categories listed below
target_cats = [
    'Best Motion Picture - Drama',
    'Best Motion Picture - Musical or Comedy',
    'Picture - Musical',
    'Picture - Comedy',
    'Picture'
]

in_target_category = gg_df['category'].isin(target_cats)
is_win = gg_df['win'] == True
gg_winners = gg_df[in_target_category & is_win].copy()

# 3. Raw title: use the 'film' column, falling back to 'nominee' where it is missing
fallback_titles = gg_winners['nominee']
gg_winners['raw_title'] = gg_winners['film'].fillna(fallback_titles)

def clean_gg_title(t):
    """Normalize a Golden Globe title into a lowercase, punctuation-free key.

    Steps, in order: convert to str and trim, drop a trailing "(YYYY)" year,
    move a trailing ", The" / ", A" / ", An" article to the front, remove
    every character that is neither a word character nor whitespace, then
    trim again and lowercase.
    """
    title = re.sub(r'\s*\(\d{4}\)$', '', str(t).strip())

    # (trailing form, leading form) for each article; at most one is applied
    article_forms = (
        (', The', 'The '),
        (', A', 'A '),
        (', An', 'An '),
    )
    for trailing, leading in article_forms:
        if title.endswith(trailing):
            title = leading + title[:-len(trailing)]
            break

    without_punctuation = re.sub(r'[^\w\s]', '', title)
    return without_punctuation.strip().lower()

# Normalized title for every Golden Globe winner
gg_winners['clean_title'] = gg_winners['raw_title'].apply(clean_gg_title)

# Lookup set of (clean title, film year as int) pairs
gg_winner_lookup = {
    (clean_title, int(film_year))
    for clean_title, film_year in zip(gg_winners['clean_title'], gg_winners['year_film'])
}

# 4. Clean Main List titles
def clean_main_title(t):
    """Normalize a main-list title into a lowercase, punctuation-free key.

    The value is converted to str and trimmed, cut at the first ' or '
    (only the part before it is kept), stripped of every character that is
    neither a word character nor whitespace, then trimmed and lowercased.
    """
    title = str(t).strip()
    # partition returns the whole string when ' or ' does not occur
    primary_title = title.partition(' or ')[0]
    without_punctuation = re.sub(r'[^\w\s]', '', primary_title)
    return without_punctuation.strip().lower()

# 5. Apply check
def check_gg_win(row):
    """Return True if the row's film appears in the Golden Globe winner lookup.

    The title is normalized with `clean_main_title` and paired with the film
    year; the same year, the previous year and the following year are all
    tried against `gg_winner_lookup`.
    """
    title_key = clean_main_title(row['film'])
    film_year = int(row['year_film'])

    # Exact year first, then one year either side
    candidate_years = (film_year, film_year - 1, film_year + 1)
    return any((title_key, year) in gg_winner_lookup for year in candidate_years)

# Flag each row of the main list with the Golden Globe Picture result
gg_flags = main_df.apply(check_gg_win, axis=1)
main_df['Golden_Globe_Picture_Winner'] = gg_flags

# 6. Sort (newest film year first, then title A-Z) and save without the index
sort_columns = ['year_film', 'film']
sort_ascending = [False, True]
main_df = main_df.sort_values(by=sort_columns, ascending=sort_ascending)
main_df.to_csv('best_picture_director_gg_winner_merged.csv', index=False)

In [None]:
# The Golden Globe data only runs through 2020, so the data for the remaining movies was added manually