In [1]:
import pandas as pd

# Table Assembling

This notebook assembles the scraping results into a single dataframe.

In [2]:
# Reference databases stored under data/downloaded_databases
oscars_db, gg_db = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/downloaded_databases/database_clean.csv',
        './data/downloaded_databases/golden_globe_awards.csv',
    )
)

# Per-award tables stored under data/scraping_results (one CSV per award).
# The order of the paths must match the order of the names on the left.
osc_scrape, gg_drama, gg_comedy, pga, bafta, dga, sag, cannes = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/scraping_results/osc_bp.csv',
        './data/scraping_results/gg_drama.csv',
        './data/scraping_results/gg_comedy.csv',
        './data/scraping_results/pga.csv',
        './data/scraping_results/bafta.csv',
        './data/scraping_results/dgas.csv',
        './data/scraping_results/sag_ensemble.csv',
        './data/scraping_results/cannes.csv',
    )
)

In [3]:
def table_assemble(main_df, to_add_df, show_name):
    """
    Add nomination / win flags for one awards show to the main table.

    Parameters
    ----------
    main_df : pandas.DataFrame
        Table to annotate. Must contain a 'film' column. It is not modified.
    to_add_df : pandas.DataFrame
        Results for one awards show. Must have exactly four columns, which
        are read positionally as (year, film, wiki, winner); a different
        column count raises ValueError. It is not modified.
    show_name : str
        Suffix used to build the two new column names.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows, in the same order, as ``main_df``
        (index reset to 0..n-1) plus two integer columns:
        ``'nom_' + show_name``    -- 1 if the film appears in ``to_add_df``
                                     with a non-missing winner value, else 0
        ``'winner_' + show_name`` -- 1 if any of the film's rows in
                                     ``to_add_df`` has winner == True, else 0
    """
    nom_col = 'nom_' + show_name
    win_col = 'winner_' + show_name

    # Rename on a copy so the caller's frame keeps its own headers.
    awards = to_add_df.copy()
    awards.columns = ['year', 'film', 'wiki', 'winner_add']

    # A row whose winner value is missing is treated as "not nominated",
    # so such rows are dropped before matching.
    awards = awards.loc[awards['winner_add'].notna(), ['film', 'winner_add']]

    # Collapse to one entry per film. Joining row-by-row on 'film' would emit
    # one output row per matching award row, silently duplicating films of
    # main_df whenever the award table lists a title more than once.
    won_by_film = (awards['winner_add'] == True).groupby(awards['film']).any()

    # reset_index returns a new frame (the caller's table is left alone) and
    # gives the result a fresh 0..n-1 index.
    result = main_df.reset_index(drop=True)

    # True / False for films found in the award table, NaN for the rest.
    matched = result['film'].map(won_by_film)
    result[nom_col] = matched.notna().astype('int64')
    result[win_col] = (matched == True).astype('int64')
    return result


In [4]:
# Response column: 1 where `winner` is True, 0 everywhere else
osc_scrape['Oscar_win'] = (osc_scrape['winner'] == True).astype('int64')

In [5]:
# Fold each award's nomination / win flags into the Oscar table.
# The two lists are parallel: scraped_names[k] labels scraped_dfs[k].
scraped_dfs = [gg_drama, gg_comedy, pga, bafta, dga, sag, cannes]
scraped_names = ['gg_drama','gg_comedy','pga', 'bafta', 'dga', 'sag', 'cannes']

for award_df, award_name in zip(scraped_dfs, scraped_names):
    osc_scrape = table_assemble(osc_scrape, award_df, award_name)

osc_scrape

Unnamed: 0,year,film,wiki,winner,Oscar_win,nom_gg_drama,winner_gg_drama,nom_gg_comedy,winner_gg_comedy,nom_pga,winner_pga,nom_bafta,winner_bafta,nom_dga,winner_dga,nom_sag,winner_sag,nom_cannes,winner_cannes
0,1927,Wings (1927 film),/wiki/Wings_(1927_film),True,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1927,The Racket (1928 film),/wiki/The_Racket_(1928_film),False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1927,7th Heaven (1927 film),/wiki/7th_Heaven_(1927_film),False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1928,The Broadway Melody,/wiki/The_Broadway_Melody,True,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1928,Alibi (1929 film),/wiki/Alibi_(1929_film),False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
558,2019,Little Women (2019 film),/wiki/Little_Women_(2019_film),False,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
559,2019,Marriage Story,/wiki/Marriage_Story,False,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0
560,2019,1917 (2019 film),/wiki/1917_(2019_film),False,0,1,1,0,0,1,1,1,0,1,0,0,0,0,0
561,2019,Once Upon a Time in Hollywood,/wiki/Once_Upon_a_Time_in_Hollywood,False,0,0,0,1,1,1,0,1,0,1,0,1,0,1,0


In [6]:
# Earliest year present in each award table; some awards did not exist
# until well after the first Oscar ceremonies.
for award_name, award_df in zip(scraped_names, scraped_dfs):
    print(award_name, award_df['year'].min())

gg_drama 1943
gg_comedy 1951
pga 1989
bafta 1960
dga 1948
sag 1995
cannes 1970


In [7]:
# Persist the assembled table as CSV text. The target path has no ".csv"
# extension, and to_csv's default also writes the index as the first column.
output_path = './data/processed_results/osc_df'
osc_scrape.to_csv(output_path)

# Appendix

In [8]:
def table_assemble_nom(main_df, to_add_df, show_name):
    """
    First-attempt assembler, superseded by the merge-on-film approach.

    Adds two integer columns to ``main_df`` in place and returns that same
    frame:
      * 'nom_' + show_name    -- 1 when the row's 'film' value occurs in
                                 to_add_df['film'], otherwise 0
      * 'winner_' + show_name -- always 0; this version never fills in
                                 the winners
    """
    nominated_flag = 'nom_' + show_name
    won_flag = 'winner_' + show_name

    # Membership test against the award table's film titles
    is_nominated = main_df['film'].isin(to_add_df['film'])

    main_df[nominated_flag] = is_nominated.astype('int64')
    # Winner flag is only initialised here, never populated
    main_df[won_flag] = 0

    return main_df