In [1]:
# This project follows the same structure as Oscar_Winners.ipynb, but it covers all nominees rather than only the winners.

In [2]:
# Import functions from file
from functions import loadPage, extractYears, extractFilmData, createDataFrame

In [3]:
# Page 1 - Best Picture
# Hand the article URL to loadPage (imported from functions) and keep what it returns.
data = loadPage(
    "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture"
)
# Collect every <table> whose class matches "wikitable"
tables = data.findAll(
    "table",
    class_="wikitable",
)

In [4]:
# extractYears returns a dictionary; its 'years' and 'numFilms' entries are
# unpacked into their own variables for the dataframe step later on.
yearData = extractYears("picture", tables, False)
years, numFilms = yearData['years'], yearData['numFilms']

In [5]:
# Pull the film data out of the tables.
# The third argument is passed as False (overriding its default of True) so that
# every nominee is returned rather than only the winners.
data = extractFilmData("picture", tables, False)
# Unpack the three dictionary entries in one step
producers, films, winners = (
    data[key] for key in ('names', 'films', 'winners')
)

In [6]:
# Report the size of each list. Producers and films should have equal lengths.
print("\n".join(
    template % len(values)
    for template, values in (
        ("Years:      %d", years),
        ("Producers:  %d", producers),
        ("Films:      %d", films),
    )
))

Years:      92
Producers:  563
Films:      563


In [7]:
# Build the Best Picture dataframe from the film lists, the year lists and the winner flags
df_picture = createDataFrame(
    "picture",
    dict(films=films, producers=producers),
    dict(years=years, numFilms=numFilms),
    winners,
)

In [8]:
# Preview the first five Best Picture rows
df_picture.head(n=5)

Unnamed: 0,Year,Film,Producers,Winner
0,1928,Wings,Paramount Famous Lasky,Yes
1,1928,The Racket,The Caddo Company,No
2,1928,7th Heaven,Fox,No
3,1929,The Broadway Melody,Metro-Goldwyn-Mayer,Yes
4,1929,Alibi,Feature Productions,No


In [9]:
# Preview the final five Best Picture rows
df_picture.tail(n=5)

Unnamed: 0,Year,Film,Producers,Winner
558,2019,Joker,"Todd Phillips, Bradley Cooper, and Emma Tillin...",No
559,2019,Little Women,Amy Pascal,No
560,2019,Marriage Story,Noah Baumbach and David Heyman,No
561,2019,1917,"Sam Mendes, Pippa Harris, Jayne-Ann Tenggren, ...",No
562,2019,Once Upon a Time in Hollywood,"David Heyman, Shannon McIntosh, and Quentin Ta...",No


In [10]:
# Page 2 - Best Director
# Same two steps as for Best Picture: load the article, then collect its tables.
data = loadPage(
    "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Director"
)
tables = data.findAll(
    "table",
    class_="wikitable sortable",
)

In [11]:
# Years and per-year nominee counts for the Best Director tables
yearData = extractYears("director", tables, False)
years, numFilms = yearData['years'], yearData['numFilms']

In [12]:
# All Best Director nominees (third argument False = do not restrict to winners)
data = extractFilmData("director", tables, False)
directors, films, winners = (
    data[key] for key in ('names', 'films', 'winners')
)

In [13]:
# Report the size of each list. Directors and films differ here because,
# in 1929, 1930 and 1938, the same director was nominated twice.
print("\n".join(
    template % len(values)
    for template, values in (
        ("Years:      %d", years),
        ("Directors:  %d", directors),
        ("Films:      %d", films),
    )
))

Years:      92
Directors:  452
Films:      455


In [14]:
# Fix the mismatch by repeating the preceding director at each affected position.
# The positions are visited from highest to lowest, so an insert never shifts
# the index of a position that is still to be handled.
for slot in (45, 13, 10):
    directors.insert(slot, directors[slot - 1])

In [15]:
# Directors and films should now report the same length
print("\n".join(
    template % len(values)
    for template, values in (
        ("Years:      %d", years),
        ("Directors:  %d", directors),
        ("Films:      %d", films),
    )
))

Years:      92
Directors:  455
Films:      455


In [16]:
# Build the Best Director dataframe
df_directors = createDataFrame(
    "director",
    dict(directors=directors, films=films),
    dict(years=years, numFilms=numFilms),
    winners,
)

In [17]:
# Preview the first five Best Director rows
df_directors.head(n=5)

Unnamed: 0,Year,Director,Film,Winner
0,1928,Frank Borzage (Dramatic Picture),7th Heaven,Yes
1,1928,Herbert Brenon (Dramatic Picture),Sorrell and Son,No
2,1928,King Vidor (Dramatic Picture),The Crowd,No
3,1928,Lewis Milestone (Comedy Picture),Two Arabian Knights,Yes
4,1928,Ted Wilde (Comedy Picture),Speedy,No


In [18]:
# Preview the final five Best Director rows
df_directors.tail(n=5)

Unnamed: 0,Year,Director,Film,Winner
450,2019,Bong Joon-ho,Parasite,Yes
451,2019,Sam Mendes,1917,No
452,2019,Todd Phillips,Joker,No
453,2019,Martin Scorsese,The Irishman,No
454,2019,Quentin Tarantino,Once Upon a Time in Hollywood,No


In [19]:
# Inspect the non-winning rows for 1929, 1930 and 1938 to see the directors nominated twice
df_directors.loc[
    df_directors['Year'].isin([1929, 1930, 1938])
    & df_directors['Winner'].eq("No")
]

Unnamed: 0,Year,Director,Film,Winner
6,1929,Lionel Barrymore,Madame X,No
7,1929,Harry Beaumont,The Broadway Melody,No
8,1929,Irving Cummings,In Old Arizona,No
9,1929,Frank Lloyd,Drag,No
10,1929,Frank Lloyd,Weary River,No
11,1929,Ernst Lubitsch,The Patriot,No
13,1930,Clarence Brown,Anna Christie,No
14,1930,Clarence Brown,Romance,No
15,1930,Robert Z. Leonard,The Divorcee,No
16,1930,Ernst Lubitsch,The Love Parade,No


In [20]:
# Page 3 - Best Actor
# Load the article, then collect its sortable wikitables.
data = loadPage(
    "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Actor"
)
tables = data.findAll(
    "table",
    class_="wikitable sortable",
)

In [21]:
# Years and per-year nominee counts for the Best Actor tables
yearData = extractYears("actor", tables, False)
years, numFilms = yearData['years'], yearData['numFilms']

In [22]:
# All Best Actor nominees (third argument False = do not restrict to winners)
data = extractFilmData("actor", tables, False)
actors, films, winners = (
    data[key] for key in ('names', 'films', 'winners')
)

In [23]:
# Report the size of each list. Actors and films differ because some years have
# an actor nominated for several films and some films have several actors nominated.
print("\n".join(
    template % len(values)
    for template, values in (
        ("Years:   %d", years),
        ("Actors:  %d", actors),
        ("Films:   %d", films),
    )
))

Years:   92
Actors:  452
Films:   451


In [24]:
# Both lists need padding: each insert repeats the entry just before the slot.
# Slots are visited in ascending order, exactly as the indices were chosen.
for slot in (1, 3, 13, 15):
    actors.insert(slot, actors[slot - 1])
for slot in (33, 125, 178, 218, 275):
    films.insert(slot, films[slot - 1])

In [25]:
# Actors and films should now report the same length
print("\n".join(
    template % len(values)
    for template, values in (
        ("Years:   %d", years),
        ("Actors:  %d", actors),
        ("Films:   %d", films),
    )
))

Years:   92
Actors:  456
Films:   456


In [26]:
# The inserted actors add rows to two years, so raise the matching numFilms
# entries to accommodate them (index -> number of extra rows).
for year_index, extra_rows in ((0, 2), (2, 2)):
    numFilms[year_index] = int(numFilms[year_index]) + extra_rows

In [27]:
# Build the Best Actor dataframe
df_actors = createDataFrame(
    "actor",
    dict(films=films, actors=actors),
    dict(years=years, numFilms=numFilms),
    winners,
)

In [28]:
# Preview the first five Best Actor rows
df_actors.head(n=5)

Unnamed: 0,Year,Actor,Film,Winner
0,1928,Emil Jannings,The Last Command,Yes
1,1928,Emil Jannings,The Way of All Flesh,Yes
2,1928,Richard Barthelmess,The Noose,No
3,1928,Richard Barthelmess,The Patent Leather Kid,No
4,1929,Warner Baxter,In Old Arizona,Yes


In [29]:
# Preview the final five Best Actor rows
df_actors.tail(n=5)

Unnamed: 0,Year,Actor,Film,Winner
451,2019,Joaquin Phoenix,Joker,Yes
452,2019,Antonio Banderas,Pain and Glory,No
453,2019,Leonardo DiCaprio,Once Upon a Time in Hollywood,No
454,2019,Adam Driver,Marriage Story,No
455,2019,Jonathan Pryce,The Two Popes,No


In [30]:
# Inspect the non-winning rows for 1928 and 1930 to see the actors nominated twice
df_actors.loc[
    df_actors["Year"].isin([1928, 1930])
    & df_actors["Winner"].eq("No")
]

Unnamed: 0,Year,Actor,Film,Winner
2,1928,Richard Barthelmess,The Noose,No
3,1928,Richard Barthelmess,The Patent Leather Kid,No
10,1930,George Arliss,The Green Goddess,No
11,1930,Wallace Beery,The Big House,No
12,1930,Maurice Chevalier,The Big Pond,No
13,1930,Maurice Chevalier,The Love Parade,No
14,1930,Ronald Colman,Bulldog Drummond,No
15,1930,Ronald Colman,Condemned,No
16,1930,Lawrence Tibbett,The Rogue Song,No


In [31]:
# Check the years whose films were inserted above and confirm that they show
# multiple actor nominations for the same film.
# The filter list gets its own name so that the `years` list returned by
# extractYears (an input to createDataFrame) is not overwritten; otherwise
# re-running the dataframe cell after this one would use the wrong years.
shared_film_years = [1935, 1953, 1964, 1972, 1983]
df_actors.loc[(df_actors["Year"].isin(shared_film_years)) & (df_actors["Winner"] == "No")]

Unnamed: 0,Year,Actor,Film,Winner
32,1935,Clark Gable,Mutiny on the Bounty,No
33,1935,Charles Laughton,Mutiny on the Bounty,No
34,1935,Paul Muni,Black Fury,No
35,1935,Franchot Tone,Mutiny on the Bounty,No
122,1953,Marlon Brando,Julius Caesar,No
123,1953,Richard Burton,The Robe,No
124,1953,Montgomery Clift,From Here to Eternity,No
125,1953,Burt Lancaster,From Here to Eternity,No
177,1964,Richard Burton,Becket,No
178,1964,Peter O'Toole,Becket,No


In [32]:
# Page 4 - Best Actress
# Load the article, then collect its sortable wikitables.
data = loadPage(
    "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Actress"
)
tables = data.findAll(
    "table",
    class_="wikitable sortable",
)

In [33]:
# Years and per-year nominee counts for the Best Actress tables
yearData = extractYears("actress", tables, False)
years, numFilms = yearData['years'], yearData['numFilms']

In [34]:
# All Best Actress nominees (third argument False = do not restrict to winners)
data = extractFilmData("actress", tables, False)
actresses, films, winners = (
    data[key] for key in ('names', 'films', 'winners')
)

In [35]:
# Report the size of each list. The lengths differ for two reasons:
# 1. The same actress received several nominations for different films in one year
# 2. Different actresses received nominations for the same film
print("\n".join(
    template % len(values)
    for template, values in (
        ("Years:       %d", years),
        ("Actressess:  %d", actresses),
        ("Films:       %d", films),
    )
))

Years:       92
Actressess:  456
Films:       457


In [36]:
# Pad both lists so they line up: each (slot, source) pair copies the entry at
# `source` into position `slot`, processed in the order listed.
for slot, source in ((1, 0), (2, 0), (15, 14)):
    actresses.insert(slot, actresses[source])
for slot, source in ((111, 110), (158, 157)):
    films.insert(slot, films[source])

In [37]:
# Actresses and films should now report the same length
print("\n".join(
    template % len(values)
    for template, values in (
        ("Years:       %d", years),
        ("Actressess:  %d", actresses),
        ("Films:       %d", films),
    )
))

Years:       92
Actressess:  459
Films:       459


In [38]:
# The inserted actresses add rows to two years, so raise the matching numFilms
# entries to accommodate them (index -> number of extra rows).
for year_index, extra_rows in ((0, 2), (2, 1)):
    numFilms[year_index] = int(numFilms[year_index]) + extra_rows

In [39]:
# Build the Best Actress dataframe
df_actresses = createDataFrame(
    "actress",
    dict(films=films, actresses=actresses),
    dict(years=years, numFilms=numFilms),
    winners,
)

In [40]:
# Preview the first five Best Actress rows
df_actresses.head(n=5)

Unnamed: 0,Year,Actress,Film,Winner
0,1928,Janet Gaynor,7th Heaven,Yes
1,1928,Janet Gaynor,Street Angel,Yes
2,1928,Janet Gaynor,Sunrise: A Song of Two Humans,Yes
3,1928,Louise Dresser,A Ship Comes In,No
4,1928,Gloria Swanson,Sadie Thompson,No


In [41]:
# Preview the final five Best Actress rows
df_actresses.tail(n=5)

Unnamed: 0,Year,Actress,Film,Winner
454,2019,Renée Zellweger,Judy,Yes
455,2019,Cynthia Erivo,Harriet,No
456,2019,Scarlett Johansson,Marriage Story,No
457,2019,Saoirse Ronan,Little Women,No
458,2019,Charlize Theron,Bombshell,No


In [42]:
# Inspect the non-winning rows for 1930: Greta Garbo appears with two nominations
df_actresses.loc[
    df_actresses["Year"].eq(1930)
    & df_actresses["Winner"].eq("No")
]

Unnamed: 0,Year,Actress,Film,Winner
12,1930,Nancy Carroll,The Devil's Holiday,No
13,1930,Ruth Chatterton,Sarah and Son,No
14,1930,Greta Garbo,Anna Christie,No
15,1930,Greta Garbo,Romance,No
16,1930,Norma Shearer,Their Own Desire,No
17,1930,Gloria Swanson,The Trespasser,No


In [43]:
# Inspect the non-winning rows for 1950 and 1959: several actresses share a film
df_actresses.loc[
    df_actresses["Year"].isin([1950, 1959])
    & df_actresses["Winner"].eq("No")
]

Unnamed: 0,Year,Actress,Film,Winner
110,1950,Anne Baxter,All About Eve,No
111,1950,Bette Davis,All About Eve,No
112,1950,Eleanor Parker,Caged,No
113,1950,Gloria Swanson,Sunset Boulevard,No
155,1959,Doris Day,Pillow Talk,No
156,1959,Audrey Hepburn,The Nun's Story,No
157,1959,Katharine Hepburn,"Suddenly, Last Summer",No
158,1959,Elizabeth Taylor,"Suddenly, Last Summer",No


In [44]:
# Page 5 - Best Supporting Actor
# Load the article, then collect its sortable wikitables.
data = loadPage(
    "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Supporting_Actor"
)
tables = data.findAll(
    "table",
    class_="wikitable sortable",
)

In [45]:
# Years and per-year nominee counts for the Best Supporting Actor tables
yearData = extractYears("supporting actor", tables, False)
years, numFilms = yearData['years'], yearData['numFilms']

In [46]:
# All Best Supporting Actor nominees (third argument False = do not restrict to winners)
data = extractFilmData("supporting actor", tables, False)
actors, films, winners = (
    data[key] for key in ('names', 'films', 'winners')
)

In [47]:
# Report the size of each list.
# Actors and films differ because, in several years, multiple actors were
# nominated for the same film.
print("\n".join(
    template % len(values)
    for template, values in (
        ("Years:   %d", years),
        ("Actors:  %d", actors),
        ("Films:   %d", films),
    )
))

Years:   84
Actors:  420
Films:   408


In [48]:
# Repeat film titles for the nominees that share a film. Each (slot, source)
# pair copies films[source] into position `slot`, processed in the order listed.
for slot, source in (
    (88, 87),
    (92, 91),
    (93, 91),
    (109, 108),
    (117, 116),
    (129, 128),
    (183, 182),
    (184, 182),
    (194, 193),
    (252, 251),
    (278, 277),
    (419, 418),
):
    films.insert(slot, films[source])

In [49]:
# Actors and films should now report the same length
print("\n".join(
    template % len(values)
    for template, values in (
        ("Years:   %d", years),
        ("Actors:  %d", actors),
        ("Films:   %d", films),
    )
))

Years:   84
Actors:  420
Films:   420


In [50]:
# Build the Best Supporting Actor dataframe
df_supActors = createDataFrame(
    "supporting actor",
    dict(films=films, actors=actors),
    dict(years=years, numFilms=numFilms),
    winners,
)

In [51]:
# Preview the first five Best Supporting Actor rows
df_supActors.head(n=5)

Unnamed: 0,Year,Actor,Film,Winner
0,1936,Walter Brennan,Come and Get It,Yes
1,1936,Mischa Auer,My Man Godfrey,No
2,1936,Stuart Erwin,Pigskin Parade,No
3,1936,Basil Rathbone,Romeo and Juliet,No
4,1936,Akim Tamiroff,The General Died at Dawn,No


In [52]:
# Preview the final five Best Supporting Actor rows
df_supActors.tail(n=5)

Unnamed: 0,Year,Actor,Film,Winner
415,2019,Brad Pitt,Once Upon a Time in Hollywood,Yes
416,2019,Tom Hanks,A Beautiful Day in the Neighborhood,No
417,2019,Anthony Hopkins,The Two Popes,No
418,2019,Al Pacino,The Irishman,No
419,2019,Joe Pesci,The Irishman,No


In [53]:
# Check each of the years that received an inserted film and confirm that each
# one has two or more occurrences of the same film.
# The filter list gets its own name so that the `years` list returned by
# extractYears (an input to createDataFrame) is not overwritten; otherwise
# re-running the dataframe cell after this one would use the wrong years.
shared_film_years = [1953, 1954, 1957, 1959, 1961, 1972, 1974, 1986, 1991, 2019]
df_supActors.loc[(df_supActors["Year"].isin(shared_film_years)) & (df_supActors["Winner"] == "No")]

Unnamed: 0,Year,Actor,Film,Winner
86,1953,Eddie Albert,Roman Holiday,No
87,1953,Brandon deWilde,Shane,No
88,1953,Jack Palance,Shane,No
89,1953,Robert Strauss,Stalag 17,No
91,1954,Lee J. Cobb,On the Waterfront,No
92,1954,Karl Malden,On the Waterfront,No
93,1954,Rod Steiger,On the Waterfront,No
94,1954,Tom Tully,The Caine Mutiny,No
106,1957,Vittorio De Sica,A Farewell to Arms,No
107,1957,Sessue Hayakawa,The Bridge on the River Kwai,No


In [54]:
# Page 6 - Best Supporting Actress
# Load the article, then collect its sortable wikitables.
data = loadPage(
    "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Supporting_Actress"
)
tables = data.findAll(
    "table",
    class_="wikitable sortable",
)

In [55]:
# Years and per-year nominee counts for the Best Supporting Actress tables
yearData = extractYears("supporting actress", tables, False)
years, numFilms = yearData['years'], yearData['numFilms']

In [56]:
# All Best Supporting Actress nominees (third argument False = do not restrict to winners)
data = extractFilmData("supporting actress", tables, False)
actresses, films, winners = (
    data[key] for key in ('names', 'films', 'winners')
)

In [57]:
# Report the size of each list.
# Actresses and films differ because, in several years, multiple actresses
# were nominated for the same film.
print("\n".join(
    template % len(values)
    for template, values in (
        ("Years:      %d", years),
        ("Actresses:  %d", actresses),
        ("Films:      %d", films),
    )
))

Years:      84
Actresses:  420
Films:      405


In [58]:
# Repeat film titles for the nominees that share a film. Each (slot, source)
# pair copies films[source] into position `slot`, processed in the order listed.
for slot, source in (
    (28, 27),
    (47, 46),
    (62, 61),
    (68, 67),
    (94, 93),
    (109, 108),
    (118, 117),
    (137, 136),
    (138, 136),
    (148, 147),
    (267, 266),
    (323, 322),
    (327, 326),
    (362, 361),
    (414, 413),
):
    films.insert(slot, films[source])

In [59]:
# Actresses and films should now report the same length
print("\n".join(
    template % len(values)
    for template, values in (
        ("Years:      %d", years),
        ("Actresses:  %d", actresses),
        ("Films:      %d", films),
    )
))

Years:      84
Actresses:  420
Films:      420


In [60]:
# Build the Best Supporting Actress dataframe
df_supActresses = createDataFrame(
    "supporting actress",
    dict(films=films, actresses=actresses),
    dict(years=years, numFilms=numFilms),
    winners,
)

In [61]:
# Preview the first five Best Supporting Actress rows
df_supActresses.head(n=5)

Unnamed: 0,Year,Actress,Film,Winner
0,1936,Gale Sondergaard,Anthony Adverse,Yes
1,1936,Beulah Bondi,The Gorgeous Hussy,No
2,1936,Alice Brady,My Man Godfrey,No
3,1936,Bonita Granville,These Three,No
4,1936,Maria Ouspenskaya,Dodsworth,No


In [62]:
# Preview the final five Best Supporting Actress rows
df_supActresses.tail(n=5)

Unnamed: 0,Year,Actress,Film,Winner
415,2019,Laura Dern,Marriage Story,Yes
416,2019,Kathy Bates,Richard Jewell,No
417,2019,Scarlett Johansson,Jojo Rabbit,No
418,2019,Florence Pugh,Little Women,No
419,2019,Margot Robbie,Bombshell,No


In [63]:
# Check each of the years that received an inserted film and confirm that each
# one has two or more occurrences of the same film.
# The filter list gets its own name so that the `years` list returned by
# extractYears (an input to createDataFrame) is not overwritten; otherwise
# re-running the dataframe cell after this one would use the wrong years.
shared_film_years = [1941, 1945, 1948, 1949, 1954, 1957, 1959, 1963, 1965, 1989, 2000, 2001, 2008, 2018]
df_supActresses.loc[(df_supActresses["Year"].isin(shared_film_years)) & (df_supActresses["Winner"] == "No")]

Unnamed: 0,Year,Actress,Film,Winner
26,1941,Sara Allgood,How Green Was My Valley,No
27,1941,Patricia Collinge,The Little Foxes,No
28,1941,Teresa Wright,The Little Foxes,No
29,1941,Margaret Wycherly,Sergeant York,No
46,1945,Eve Arden,Mildred Pierce,No
47,1945,Ann Blyth,Mildred Pierce,No
48,1945,Angela Lansbury,The Picture of Dorian Gray,No
49,1945,Joan Lorring,The Corn is Green,No
61,1948,Barbara Bel Geddes,I Remember Mama,No
62,1948,Ellen Corby,I Remember Mama,No


In [64]:
# Look at the number of rows in each data frame.
# Every frame holds all nominees (winners and non-winners), so the labels say
# "nominees"; the earlier "winners" wording mislabelled these totals.
# "%-41s" left-justifies each label so the counts line up in one column.
print("\n".join(
    "%-41s%d" % (label, frame.shape[0])
    for label, frame in (
        ("Total best picture nominees:", df_picture),
        ("Total best director nominees:", df_directors),
        ("Total best actor nominees:", df_actors),
        ("Total best actress nominees:", df_actresses),
        ("Total best supporting actor nominees:", df_supActors),
        ("Total best supporting actress nominees:", df_supActresses),
    )
))

Total best picture winners:             563
Total best director winners:            455
Total best actor winners:               456
Total best actress winners:             459
Total best supporting actor winners:    420
Total best supporting actress winners:  420


In [65]:
# Now all the data has been read into data frames, the next step is to connect to MySQL and store it in the database
from urllib.parse import quote_plus     # For escaping special characters in the credentials
from sqlalchemy import create_engine    # For connecting to MySQL
from MySQL_connect import config        # Import connection parameters from file

# Use the parameters to create connection variables
user = config['user']
password = config['password']
host = config['host']
db = config['db']

# Connection object for MySQL.
# The user name and password are URL-escaped because characters such as '@',
# ':' or '/' inside them would otherwise be parsed as URL delimiters and
# produce a wrong host or database. Plain alphanumeric values are unchanged.
engine = create_engine(
    "mysql+mysqldb://" + quote_plus(user) + ":" + quote_plus(password)
    + "@" + host + "/" + db + "?charset=utf8"
)


In [66]:
# Create table nominee_best_picture and add data to it
# DROP TABLE IF EXISTS removes any previous copy in one statement, replacing the
# separate "does it exist?" query followed by a conditional drop.
engine.execute("DROP TABLE IF EXISTS nominee_best_picture")

# Create table by defining its structure and column data types
# Character set utf8 is used to prevent errors caused by non ascii characters
engine.execute("CREATE TABLE nominee_best_picture ( \
               Id INT NOT NULL AUTO_INCREMENT, \
               Year INT, \
               Film CHAR(255) CHARACTER SET UTF8, \
               Producers CHAR(255) CHARACTER SET UTF8, \
               Winner CHAR(5), \
               PRIMARY KEY(Id) )")

# Turn the first four dataframe columns into one plain tuple per row. The
# columns are taken by position via iloc, avoiding integer keys on a
# label-indexed row Series (deprecated in recent pandas).
picture_rows = list(df_picture.iloc[:, :4].itertuples(index=False, name=None))

# Passing the whole list sends the rows as a single executemany call instead of
# one autocommitted statement per row. An empty frame inserts nothing.
if picture_rows:
    engine.execute("INSERT INTO nominee_best_picture (Year, Film, Producers, Winner) \
    VALUES(%s, %s, %s, %s)", picture_rows)

In [67]:
# Create table nominee_best_director and add data to it
# DROP TABLE IF EXISTS removes any previous copy in one statement, replacing the
# separate "does it exist?" query followed by a conditional drop.
engine.execute("DROP TABLE IF EXISTS nominee_best_director")

# Character set utf8 on the text columns keeps non-ascii names intact
engine.execute("CREATE TABLE nominee_best_director ( \
               Id INT NOT NULL AUTO_INCREMENT, \
               Year INT, \
               Director CHAR(255) CHARACTER SET UTF8, \
               Film CHAR(255) CHARACTER SET UTF8, \
               Winner CHAR(5), \
               PRIMARY KEY(Id) )")

# Turn the first four dataframe columns into one plain tuple per row. The
# columns are taken by position via iloc, avoiding integer keys on a
# label-indexed row Series (deprecated in recent pandas).
director_rows = list(df_directors.iloc[:, :4].itertuples(index=False, name=None))

# Passing the whole list sends the rows as a single executemany call instead of
# one autocommitted statement per row. An empty frame inserts nothing.
if director_rows:
    engine.execute("INSERT INTO nominee_best_director (Year, Director, Film, Winner) \
    VALUES(%s, %s, %s, %s)", director_rows)

In [68]:
# Create table nominee_best_actor and add data to it
# DROP TABLE IF EXISTS removes any previous copy in one statement, replacing the
# separate "does it exist?" query followed by a conditional drop.
engine.execute("DROP TABLE IF EXISTS nominee_best_actor")

# Character set utf8 on the text columns keeps non-ascii names intact
engine.execute("CREATE TABLE nominee_best_actor ( \
               Id INT NOT NULL AUTO_INCREMENT, \
               Year INT, \
               Actor CHAR(255) CHARACTER SET UTF8, \
               Film CHAR(255) CHARACTER SET UTF8, \
               Winner CHAR(5), \
               PRIMARY KEY(Id) )")

# Turn the first four dataframe columns into one plain tuple per row. The
# columns are taken by position via iloc, avoiding integer keys on a
# label-indexed row Series (deprecated in recent pandas).
actor_rows = list(df_actors.iloc[:, :4].itertuples(index=False, name=None))

# Passing the whole list sends the rows as a single executemany call instead of
# one autocommitted statement per row. An empty frame inserts nothing.
if actor_rows:
    engine.execute("INSERT INTO nominee_best_actor (Year, Actor, Film, Winner) \
    VALUES(%s, %s, %s, %s)", actor_rows)

In [69]:
# Create table nominee_best_actress and add data to it
# DROP TABLE IF EXISTS removes any previous copy in one statement, replacing the
# separate "does it exist?" query followed by a conditional drop.
engine.execute("DROP TABLE IF EXISTS nominee_best_actress")

# Character set utf8 on the text columns keeps non-ascii names intact
engine.execute("CREATE TABLE nominee_best_actress ( \
               Id INT NOT NULL AUTO_INCREMENT, \
               Year INT, \
               Actress CHAR(255) CHARACTER SET UTF8, \
               Film CHAR(255) CHARACTER SET UTF8, \
               Winner CHAR(5), \
               PRIMARY KEY(Id) )")

# Turn the first four dataframe columns into one plain tuple per row. The
# columns are taken by position via iloc, avoiding integer keys on a
# label-indexed row Series (deprecated in recent pandas).
actress_rows = list(df_actresses.iloc[:, :4].itertuples(index=False, name=None))

# Passing the whole list sends the rows as a single executemany call instead of
# one autocommitted statement per row. An empty frame inserts nothing.
if actress_rows:
    engine.execute("INSERT INTO nominee_best_actress (Year, Actress, Film, Winner) \
    VALUES(%s, %s, %s, %s)", actress_rows)

In [70]:
# Create table nominee_best_supporting_actor and add data to it
# DROP TABLE IF EXISTS removes any previous copy in one statement, replacing the
# separate "does it exist?" query followed by a conditional drop.
engine.execute("DROP TABLE IF EXISTS nominee_best_supporting_actor")

# Character set utf8 on the text columns keeps non-ascii names intact
engine.execute("CREATE TABLE nominee_best_supporting_actor ( \
               Id INT NOT NULL AUTO_INCREMENT, \
               Year INT, \
               Actor CHAR(255) CHARACTER SET UTF8, \
               Film CHAR(255) CHARACTER SET UTF8, \
               Winner CHAR(5), \
               PRIMARY KEY(Id) )")

# Turn the first four dataframe columns into one plain tuple per row. The
# columns are taken by position via iloc, avoiding integer keys on a
# label-indexed row Series (deprecated in recent pandas).
sup_actor_rows = list(df_supActors.iloc[:, :4].itertuples(index=False, name=None))

# Passing the whole list sends the rows as a single executemany call instead of
# one autocommitted statement per row. An empty frame inserts nothing.
if sup_actor_rows:
    engine.execute("INSERT INTO nominee_best_supporting_actor (Year, Actor, Film, Winner) \
    VALUES(%s, %s, %s, %s)", sup_actor_rows)

In [71]:
# Create table nominee_best_supporting_actress and add data to it
# DROP TABLE IF EXISTS removes any previous copy in one statement, replacing the
# separate "does it exist?" query followed by a conditional drop.
engine.execute("DROP TABLE IF EXISTS nominee_best_supporting_actress")

# Character set utf8 on the text columns keeps non-ascii names intact
engine.execute("CREATE TABLE nominee_best_supporting_actress ( \
               Id INT NOT NULL AUTO_INCREMENT, \
               Year INT, \
               Actress CHAR(255) CHARACTER SET UTF8, \
               Film CHAR(255) CHARACTER SET UTF8, \
               Winner CHAR(5), \
               PRIMARY KEY(Id) )")

# Turn the first four dataframe columns into one plain tuple per row. The
# columns are taken by position via iloc, avoiding integer keys on a
# label-indexed row Series (deprecated in recent pandas).
sup_actress_rows = list(df_supActresses.iloc[:, :4].itertuples(index=False, name=None))

# Passing the whole list sends the rows as a single executemany call instead of
# one autocommitted statement per row. An empty frame inserts nothing.
if sup_actress_rows:
    engine.execute("INSERT INTO nominee_best_supporting_actress (Year, Actress, Film, Winner) \
    VALUES(%s, %s, %s, %s)", sup_actress_rows)

In [72]:
# Now lets run some queries on the tables
# First return the number of rows in each table: one COUNT(*) per table, printed
# with a caption padded so the numbers line up.
for table_name, caption in (
    ("nominee_best_picture", "Nominee Best Picture number of rows:            %d"),
    ("nominee_best_director", "Nominee Best Director number of rows:           %d"),
    ("nominee_best_actor", "Nominee Best Actor number of rows:              %d"),
    ("nominee_best_actress", "Nominee Best Actress number of rows:            %d"),
    ("nominee_best_supporting_actor", "Nominee Best Supporting Actor number of rows:   %d"),
    ("nominee_best_supporting_actress", "Nominee Best Supporting Actress number of rows: %d"),
):
    num_res = engine.execute("SELECT COUNT(*) FROM " + table_name)
    print(caption % (num_res.scalar()))


Nominee Best Picture number of rows:            563
Nominee Best Director number of rows:           455
Nominee Best Actor number of rows:              456
Nominee Best Actress number of rows:            459
Nominee Best Supporting Actor number of rows:   420
Nominee Best Supporting Actress number of rows: 420


In [73]:
# For displaying results in a table format
from prettytable import PrettyTable

In [74]:
# Query: Return a list of actors ordered by most nominations and wins
# Explained: We are using two tables - nominee_best_actor and nominee_best_supporting_actor.
# Each subquery counts the number of nominations for each actor as well as the number of times they won.
# The two result sets are combined with UNION ALL and the outer query sums them per actor.
# UNION ALL is required: plain UNION removes duplicate rows, so an actor with the
# same (nominations, wins) pair in both tables would be collapsed to a single row
# and under-counted.
actor_mostNom = engine.execute("\
SELECT actor, SUM(total_wins) AS total_wins, SUM(total_nom) AS total_nom \
FROM( \
    SELECT actor, \
           COUNT(actor) AS total_nom, \
           COUNT(IF(winner = 'Yes', 1, null)) AS total_wins \
    FROM nominee_best_actor \
    GROUP BY actor \
    UNION ALL \
    SELECT actor, \
           COUNT(actor) AS total_nom, \
           COUNT(IF(winner = 'Yes', 1, null)) AS total_wins \
    FROM nominee_best_supporting_actor \
    GROUP BY actor \
) AS res \
GROUP BY actor \
ORDER BY total_nom DESC, total_wins DESC, actor ASC")

# Render the result set as a text table
table = PrettyTable(['Actor', 'Nominations', 'Wins'])
for row in actor_mostNom:
    table.add_row([row['actor'], row['total_nom'], row['total_wins']])
print(table)

+------------------------+-------------+------+
|         Actor          | Nominations | Wins |
+------------------------+-------------+------+
|     Jack Nicholson     |      12     |  3   |
|    Laurence Olivier    |      10     |  1   |
|     Spencer Tracy      |      9      |  2   |
|       Al Pacino        |      9      |  1   |
|      Paul Newman       |      9      |  1   |
|   Denzel Washington    |      8      |  2   |
|      Jack Lemmon       |      8      |  2   |
|     Marlon Brando      |      8      |  2   |
|     Peter O'Toole      |      8      |  0   |
|     Dustin Hoffman     |      7      |  2   |
|     Robert De Niro     |      7      |  2   |
|      Jeff Bridges      |      7      |  1   |
|     Robert Duvall      |      7      |  1   |
|     Richard Burton     |      7      |  0   |
|    Daniel Day-Lewis    |      6      |  3   |
|     Michael Caine      |      6      |  2   |
|       Tom Hanks        |      6      |  2   |
|   Leonardo DiCaprio    |      6      |

In [75]:
# Query: Return a list of actresses ordered by most nominations and wins
# Explained: We are using two tables - nominee_best_actress and nominee_best_supporting_actress.
# Each subquery counts the number of nominations for each actress as well as the number of times they won.
# The two result sets are combined with UNION ALL and the outer query sums them per actress.
# UNION ALL is required: plain UNION removes duplicate rows, so an actress with the
# same (nominations, wins) pair in both tables would be collapsed to a single row
# and under-counted.
# NOTE(review): the variable is still called supActor_mostNom although it holds
# the actress results; the name is kept in case other cells refer to it.
supActor_mostNom = engine.execute("\
SELECT actress, SUM(total_wins) AS total_wins, SUM(total_nom) AS total_nom \
FROM( \
    SELECT actress, \
           COUNT(actress) AS total_nom, \
           COUNT(IF(winner = 'Yes', 1, null)) AS total_wins \
    FROM nominee_best_actress \
    GROUP BY actress \
    UNION ALL \
    SELECT actress, \
           COUNT(actress) AS total_nom, \
           COUNT(IF(winner = 'Yes', 1, null)) AS total_wins \
    FROM nominee_best_supporting_actress \
    GROUP BY actress \
) AS res \
GROUP BY actress \
ORDER BY total_nom DESC, total_wins DESC, actress ASC")

# Render the result set as a text table
table = PrettyTable(['Actress', 'Nominations', 'Wins'])
for row in supActor_mostNom:
    table.add_row([row['actress'], row['total_nom'], row['total_wins']])
print(table)

+-----------------------------+-------------+------+
|           Actress           | Nominations | Wins |
+-----------------------------+-------------+------+
|         Meryl Streep        |      21     |  3   |
|      Katharine Hepburn      |      12     |  4   |
|         Bette Davis         |      11     |  2   |
|        Geraldine Page       |      8      |  1   |
|        Ingrid Bergman       |      7      |  3   |
|        Cate Blanchett       |      7      |  2   |
|          Jane Fonda         |      7      |  2   |
|         Greer Garson        |      7      |  1   |
|          Judi Dench         |      7      |  1   |
|         Kate Winslet        |      7      |  1   |
|         Glenn Close         |      7      |  0   |
|        Jessica Lange        |      6      |  2   |
|         Maggie Smith        |      6      |  2   |
|        Ellen Burstyn        |      6      |  1   |
|        Norma Shearer        |      6      |  1   |
|         Sissy Spacek        |      6      | 

In [76]:
# Query: Return a list of directors ordered by most nominations and wins
# Explained: Because the 1st Oscars had winners in different categories, some names end
# with a bracketed suffix such as "(Dramatic Picture)" that has to be removed.
# - RIGHT(director, 1) tests the last *character*. The previous test used LENGTH(),
#   which returns a byte count, as a character position, so it would miss a name
#   that contains multi-byte characters and ends with ')'.
# - RTRIM drops the space left in front of the removed bracket.
# - The inner GROUP BY is qualified with the table name so it clearly groups on
#   the stored column rather than on the cleaned alias of the same name.
# With the names cleaned, the outer query sums the nominations and wins per director.
director_mostNom = engine.execute("\
SELECT director, SUM(total_wins) AS total_wins, SUM(total_nom) AS total_nom \
FROM ( \
    SELECT IF(RIGHT(director, 1) = ')', \
              RTRIM(SUBSTRING(director, 1, POSITION('(' IN director) -1)), \
              director \
    ) AS director, \
    COUNT(director) AS total_nom, \
    COUNT(IF (winner = 'Yes', 1, null )) AS total_wins \
    FROM nominee_best_director \
    GROUP BY nominee_best_director.director \
) AS res \
GROUP BY director \
ORDER BY total_nom DESC, total_wins DESC, director ASC")

# Render the result set as a text table
table = PrettyTable(['Director', 'Nominations', 'Wins'])
for row in director_mostNom:
    table.add_row([row['director'], row['total_nom'], row['total_wins']])
print(table)

+------------------------------+-------------+------+
|           Director           | Nominations | Wins |
+------------------------------+-------------+------+
|        William Wyler         |      12     |  3   |
|       Martin Scorsese        |      9      |  1   |
|         Billy Wilder         |      8      |  2   |
|          David Lean          |      7      |  2   |
|        Fred Zinnemann        |      7      |  2   |
|       Steven Spielberg       |      7      |  2   |
|         Woody Allen          |      7      |  1   |
|         Frank Capra          |      6      |  3   |
|        Clarence Brown        |      6      |  0   |
|          John Ford           |      5      |  4   |
|          Elia Kazan          |      5      |  2   |
|         Frank Lloyd          |      5      |  2   |
|        George Stevens        |      5      |  2   |
|         George Cukor         |      5      |  1   |
|         John Huston          |      5      |  1   |
|       Alfred Hitchcock    

In [77]:
# Query: Return a list of producers ordered by most nominations and wins
# Explained: This query is difficult because the producers column can have multiple names in it.
# Only the first listed name is counted.
# The names can be separated by either commas or ' and ', so we search for the position of the first comma.
# If a comma is found then the text before it is used.
# If not then we search for the position of ' and ' and use the text before that.
# Both branches subtract 1 from the position so the separator itself is excluded; the
# ' and ' branch previously kept the leading space of the separator, leaving a trailing
# space on the name.
# Finally the extracted name is used to count each occurrence and the number of wins for each.
picture_mostNom = engine.execute("\
SELECT IF(POSITION(',' IN producers) > 0, \
           SUBSTRING(producers, 1, POSITION(',' IN producers) -1), \
           IF(POSITION(' and ' IN producers) > 0, \
               SUBSTRING(producers, 1, POSITION(' and ' IN producers) -1), \
               producers) \
) AS name,\
     COUNT(producers) AS total_nom, \
     COUNT(IF(winner = 'Yes', 1, null)) AS total_wins \
FROM nominee_best_picture \
GROUP BY name \
ORDER BY total_nom DESC, total_wins DESC, name ASC")

# Render the result set as a text table
table = PrettyTable(['Producer', 'Nominations', 'Wins'])
for row in picture_mostNom:
    table.add_row([row['name'], row['total_nom'], row['total_wins']])
print(table)

+-------------------------------------+-------------+------+
|               Producer              | Nominations | Wins |
+-------------------------------------+-------------+------+
|         Metro-Goldwyn-Mayer         |      40     |  5   |
|             Warner Bros.            |      20     |  2   |
|           20th Century-Fox          |      16     |  3   |
|               Columbia              |      13     |  3   |
|              Paramount              |      13     |  2   |
|              RKO Radio              |      11     |  1   |
|      Samuel Goldwyn Productions     |      8      |  1   |
|          Steven Spielberg           |      8      |  1   |
|                 Fox                 |      6      |  1   |
|             Scott Rudin             |      6      |  1   |
|            Stanley Kramer           |      6      |  0   |
|   Selznick International Pictures   |      5      |  2   |
|        Francis Ford Coppola         |      5      |  1   |
|             Sam Spiege