In [1]:
# This project involves using Beautiful Soup to extract information relating to Oscar winners from Wikipedia pages. In total 
# six different pages will be accessed in order to obtain the following:
# 1. Best Picture 
# 2. Best Director
# 3. Best Actor
# 4. Best Actress
# 5. Best Supporting Actor
# 6. Best Supporting Actress
# Once the data has been retrieved from each page, it will be stored in dataframes. I will then connect to MySQL, create tables
# and insert the data there. Finally I will run queries on the database to provide useful insight.


In [2]:
import urllib2;                    # for retrieving the contents of a page
from bs4 import BeautifulSoup;     # for parsing the contents of a page to HTML
import pandas as pd                # for creating dataframes
import re                          # for regular expressions

In [3]:
# function that takes in a URL and returns the contents of the page
def loadPage(url):
    page = urllib2.urlopen(url)
    data = BeautifulSoup(page, "lxml")
    return data

# style of table cells that have Oscar winners in them
winnerStyle = 'background:#FAEB86'

In [4]:
# Page 1- Best Picture
# Call function to read in URL and retrieve data from the page
data = loadPage("https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture")
# from the data get all the tables that have the following class
tables = data.findAll("table", class_= "wikitable")

In [5]:
# There are tables for each decade starting from the 1920's. We want to go through each table and extract the appropriate information
# Create empty lists for each film and nominee
films = []
nominees = []

# There are ten tables so we use range to keep track 
for i in range(0, 11):
    # For each table get each row start with the following style ie. could be 'background:#FAEB86' or 'background:#FAEB86;'
    for row in tables[i].findAll("tr", style=re.compile('^'+winnerStyle)):
        # Get each cell within that row
        cells = row.findAll('td')
        # Get the text from within the first cell, encode it to utf-8 and add it to the films list
        if(len(cells) > 0):        
            film = cells[0].find(text=True).encode('utf-8').strip()
            films.append(film)
            # Get the text from within the second cell, encode it and add it to the list of nominees
        if(len(cells) > 1):      
            nominee = cells[1].findAll(text=True)[:-1]
            nominees.append("".join(nominee).encode('utf-8').strip() )    

In [6]:
# Here we extract the years. Years can be different formats depending on how far back they are ie. 1927/28, 1927/1928, 1928
# Create an empty list for each year
years = []
for i in range(0, 11):  
    # For each table row get each cell with the following style
    for row in tables[i].findAll("td", style="text-align:center"):
            # Get each link within that row
            cells = row.findAll('a')
            # If year is all in one link and in a format like 1927/28 then we want the first two and last two charaters so 
            # that it will be 1928
            if(len(cells[0].find(text=True)) == 7):
                year = cells[0].find(text=True)[0:2] + cells[0].find(text=True)[5:7]
            # If year is in two links and the second link has four numbers ie. 1927/1928 then we want the second link
            elif(len(cells[1].find(text=True)) == 4):       
                if (cells[1].find(text=True).isnumeric()):
                    year = cells[1].find(text=True)
            # If the year is in two links and the second link has two numbers ie. 1927/28 then we want the first two characters
            # of the first link and the last two of the second link
            elif(len(cells[1].find(text=True)) == 2):
                if (cells[1].find(text=True).isnumeric()):
                    year = cells[0].find(text=True)[0:2] + cells[1].find(text=True)
            # Else the year is in a standard format ie. 1928 so get four characters
            else:
                year = cells[0].find(text=True)[0:4]
            # add year to list of years
            years.append(year)

In [7]:
# Check the length of each list. They are all the same
print(len(films))
print(len(nominees))
print(len(years))

90
90
90


In [8]:
# Create a data frame for storing each list
df_picture = pd.DataFrame()
df_picture["Year"] = pd.to_numeric(years, errors='coerce')
df_picture["Film"] = films
df_picture["Nominee"] = nominees
df_picture

Unnamed: 0,Year,Film,Nominee
0,1928,Wings,Paramount Famous Lasky
1,1929,The Broadway Melody,Metro-Goldwyn-Mayer
2,1930,All Quiet on the Western Front,Universal
3,1931,Cimarron,RKO Radio
4,1932,Grand Hotel,Metro-Goldwyn-Mayer
5,1933,Cavalcade,Fox
6,1934,It Happened One Night,Columbia
7,1935,Mutiny on the Bounty,Metro-Goldwyn-Mayer
8,1936,The Great Ziegfeld,Metro-Goldwyn-Mayer
9,1937,The Life of Emile Zola,Warner Bros.


In [9]:
# Page2 - Best Director
data = loadPage("https://en.wikipedia.org/wiki/Academy_Award_for_Best_Director")
tables = data.findAll("table", class_= "wikitable sortable")

In [10]:
# Here we extract each film and its director. All the data is stored in one table.
# Create empty lists for storing each film and director
films =[]
directors = []
# Get each row within the table
for row in tables[0].findAll("tr"):
        cells = row.findAll("td", style=re.compile('^'+winnerStyle))
        # For the first year there was two categories of winners so we want to include that information. It is inside a span tag.
        # FindAll() gets all the text within the cell including child tags. 
        #Finally use join() to merge the text into one string and add it to the list of direcors
        if(len(cells) > 0):
            director = cells[0].findAll(text=True)[:-1]
            directors.append("".join(director).encode('utf-8').strip())  
        # Get the film from the second cell and add it to the list of films
        if(len(cells) > 1):
            film = cells[1].find(text=True).encode('utf-8').strip() 
            films.append(film)

In [11]:
# Here we extract the years.
years = []
for row in tables[0].findAll("th", scope="row"):
    cells = row.findAll('a')     
    if(len(cells[0].find(text=True)) == 7):
        year = cells[0].find(text=True)[0:2] + cells[0].find(text=True)[5:7]
    elif(len(cells[1].find(text=True)) == 4):       
        if (cells[1].find(text=True).isnumeric()):
            year = cells[1].find(text=True)  
    elif(len(cells[1].find(text=True)) == 2):
        if (cells[1].find(text=True).isnumeric()):
            year = cells[0].find(text=True)[0:2] + cells[1].find(text=True)
    else:
        year = cells[0].find(text=True)[0:4]
    years.append(year)

In [12]:
# Check the length of each list. Due to the 1st Oscars having multiple awards they are not the same length
print(len(films))
print(len(directors))
print(len(years))

91
91
90


In [13]:
# To amend this lets insert a year at the position 1 with the same value as the position 0
years.insert(1, years[0])

In [14]:
# Now they are all the same length
print(len(films))
print(len(directors))
print(len(years))

91
91
91


In [15]:
# Create a data frame for storing each list and print it
df_directors = pd.DataFrame()
df_directors["Year"] = pd.to_numeric(years, errors='coerce')
df_directors["Director"] = directors
df_directors["Film"] = films
# Look at the data frame.
df_directors

Unnamed: 0,Year,Director,Film
0,1928,Frank Borzage (Dramatic),7th Heaven
1,1928,Lewis Milestone (Comedy),Two Arabian Knights
2,1929,Frank Lloyd,The Divine Lady
3,1930,Lewis Milestone,All Quiet on the Western Front
4,1931,Norman Taurog,Skippy
5,1932,Frank Borzage,Bad Girl
6,1933,Frank Lloyd,Cavalcade
7,1934,Frank Capra,It Happened One Night
8,1935,John Ford,The Informer
9,1936,Frank Capra,Mr. Deeds Goes to Town


In [16]:
# Page 3- Best Actor
data = loadPage("https://en.wikipedia.org/wiki/Academy_Award_for_Best_Actor")
tables = data.findAll("table", class_= "wikitable sortable")

In [17]:
# Here we extract each actor and film
films =[]
actors = []
for row in tables[0].findAll("tr"):
    cells = row.findAll("td", style=re.compile('^'+winnerStyle))    
    # Index variable to keep track of the cell we are in
    index = 0
    for c in cells:
        # Get all the links within each cell
        links = c.findAll("a")
        for l in links:
            # If link has the attribute 'title' then we want it
            if(l.has_attr('title')):
                # If it is the first cell then get the text and add it to the list of actors
                if(index == 0): 
                    actor = l.find(text=True).encode('utf-8').strip()
                    actors.append(actor)
                # If it is the third cell then get the text and add it to the list of films
                elif(index == 2): 
                    film = l.find(text=True).encode('utf-8').strip()
                    films.append(film)
        # increment index
        index = index+1         
        

In [18]:
# Here we extract the years
years = []
for row in tables[0].findAll("th", scope="row"):
    cells = row.findAll('a')     
    if(len(cells[0].find(text=True)) == 7):
        year = cells[0].find(text=True)[0:2] + cells[0].find(text=True)[5:7]
    elif(len(cells[1].find(text=True)) == 4):       
        if (cells[1].find(text=True).isnumeric()):
            year = cells[1].find(text=True)  
    elif(len(cells[1].find(text=True)) == 2):
        if (cells[1].find(text=True).isnumeric()):
            year = cells[0].find(text=True)[0:2] + cells[1].find(text=True)
    else:
        year = cells[0].find(text=True)[0:4]
    years.append(year)

In [19]:
# Check the length of each list. They are not the same. There are two reasons for this.
# 1. In the 1st Oscars the actor won awards for two films
# 2. In 1932 there was a tie for 1st place
print(len(films))
print(len(actors))
print(len(years))

92
91
90


In [20]:
# To amend this lets insert the actor at position 0 into position 1 and insert the appropriate years at position 1 and 6 of years
actors.insert(1, actors[0])
years.insert(1, years[0])
years.insert(6, years[5])

In [21]:
# Now they are all the same length
print(len(films))
print(len(actors))
print(len(years))

92
92
92


In [22]:
# Create a data frame for storing the lists
df_actors = pd.DataFrame()
df_actors["Year"] = pd.to_numeric(years, errors='coerce')
df_actors["Actor"] = actors
df_actors["Film"] = films
# Print the data frame and look at years 1928 and 1932
df_actors

Unnamed: 0,Year,Actor,Film
0,1928,Emil Jannings,The Last Command
1,1928,Emil Jannings,The Way of All Flesh
2,1929,Warner Baxter,In Old Arizona
3,1930,George Arliss,Disraeli
4,1931,Lionel Barrymore,A Free Soul
5,1932,Wallace Beery,The Champ
6,1932,Fredric March,Dr. Jekyll and Mr. Hyde
7,1933,Charles Laughton,The Private Life of Henry VIII
8,1934,Clark Gable,It Happened One Night
9,1935,Victor McLaglen,The Informer


In [23]:
# Page 4- Best Actress
data = loadPage("https://en.wikipedia.org/wiki/Academy_Award_for_Best_Actress")
tables = data.findAll("table", class_= "wikitable sortable")

In [24]:
# Here we extract each actress and film
films =[]
actresses = []
for row in tables[0].findAll("tr"):
    cells = row.findAll("td", style=re.compile('^'+winnerStyle))   
    index = 0
    for c in cells:
        links = c.findAll("a")
        for l in links:
            if(l.has_attr('title')):
                if(index == 0): 
                    actress = l.find(text=True).encode('utf-8').strip()
                    actresses.append(actress)
                elif(index == 2): 
                    movie = l.find(text=True).encode('utf-8').strip()
                    films.append(movie)
        index = index+1         

In [25]:
# Here we extract each year
years = []
for row in tables[0].findAll("th", scope="row"):
    cells = row.findAll('a')     
    if(len(cells[0].find(text=True)) == 7):
        year = cells[0].find(text=True)[0:2] + cells[0].find(text=True)[5:7]
    elif(len(cells[1].find(text=True)) == 4):       
        if (cells[1].find(text=True).isnumeric()):
            year = cells[1].find(text=True)  
    elif(len(cells[1].find(text=True)) == 2):
        if (cells[1].find(text=True).isnumeric()):
            year = cells[0].find(text=True)[0:2] + cells[1].find(text=True)
    else:
        year = cells[0].find(text=True)[0:4] 
    years.append(year)

In [26]:
# Check the length of each list. They are not the same. There are two reasons for this.
# 1. In the 1st Oscars the actress won awards for three films
# 2. In 1968 there was a tie for 1st place
print(len(actresses))
print(len(films))
print(len(years))

91
93
90


In [27]:
# Lets amend this by inserting the actress at the position 0 to position 1 and 2. Do the same for the years
# Also insert the year at position 40 into position 41
actresses.insert(1, actresses[0])
actresses.insert(2, actresses[1])
years.insert(41, years[40])
years.insert(1, years[0])
years.insert(2, years[1])

In [28]:
# Now they are all the same length
print(len(actresses))
print(len(films))
print(len(years))

93
93
93


In [29]:
# Create a data frame for storing the lists
df_actresses = pd.DataFrame()
df_actresses["Year"] = pd.to_numeric(years, errors='coerce')
df_actresses["Actress"] = actresses
df_actresses["Film"] = films
# Print the dataframe and look at the first three indexes which have the same year and actress but different films
df_actresses

Unnamed: 0,Year,Actress,Film
0,1928,Janet Gaynor,7th Heaven
1,1928,Janet Gaynor,Street Angel
2,1928,Janet Gaynor,Sunrise: A Song of Two Humans
3,1929,Mary Pickford,Coquette
4,1930,Norma Shearer,The Divorcee
5,1931,Marie Dressler,Min and Bill
6,1932,Helen Hayes,The Sin of Madelon Claudet
7,1933,Katharine Hepburn,Morning Glory
8,1934,Claudette Colbert,It Happened One Night
9,1935,Bette Davis,Dangerous


In [30]:
# Now check the year 1968 and see two actresses and films
df_actresses.loc[df_actresses['Year'] == 1968]

Unnamed: 0,Year,Actress,Film
42,1968,Katharine Hepburn,The Lion in Winter
43,1968,Barbra Streisand,Funny Girl


In [31]:
# Page 5 - Best Supporting Actor
data = loadPage("https://en.wikipedia.org/wiki/Academy_Award_for_Best_Supporting_Actor")
tables = data.findAll("table", class_= "wikitable sortable")

In [32]:
# Here we extract each actor and film
films =[]
actors = []
for row in tables[0].findAll("tr"):
    cells = row.findAll("td", style=re.compile('^'+winnerStyle))   
    index = 0
    for c in cells:
        links = c.findAll("a")
        for l in links:
            if(l.has_attr('title')):
                if(index == 0): 
                    actor = l.find(text=True).encode('utf-8').strip()
                    actors.append(actor)
                elif(index == 2): 
                    movie = l.find(text=True).encode('utf-8').strip()
                    films.append(movie)
        index = index+1  

In [33]:
# Here we extract the years. Because these awards started in 1936 all the years are in the correct format
years = []
for row in tables[0].findAll("th", scope="row"):
    cells = row.findAll('a') 
    if(len(cells) > 1):        
        year = cells[0].find(text=True)[0:4] 
        years.append(year)

In [34]:
# Check all lists are the same length, they are
print(len(actors))
print(len(films))
print(len(years))

82
82
82


In [35]:
# Create a data frame for storing the lists
df_supActors = pd.DataFrame()
df_supActors["Year"] = pd.to_numeric(years, errors='coerce')
df_supActors["Actor"] = actors
df_supActors["Film"] = films
# Print data frame
df_supActors

Unnamed: 0,Year,Actor,Film
0,1936,Walter Brennan,Come and Get It
1,1937,Joseph Schildkraut,The Life of Emile Zola
2,1938,Walter Brennan,Kentucky
3,1939,Thomas Mitchell,Stagecoach
4,1940,Walter Brennan,The Westerner
5,1941,Donald Crisp,How Green Was My Valley
6,1942,Van Heflin,Johnny Eager
7,1943,Charles Coburn,The More the Merrier
8,1944,Barry Fitzgerald,Going My Way
9,1945,James Dunn,A Tree Grows in Brooklyn


In [36]:
# Page 6 - Best Supporting Actress
data = loadPage("https://en.wikipedia.org/wiki/Academy_Award_for_Best_Supporting_Actress")
tables = data.findAll("table", class_= "wikitable sortable")

In [37]:
# Here we extract each actress and film
films =[]
actresses = []
for row in tables[0].findAll("tr"):
    cells = row.findAll("td", style=re.compile('^'+winnerStyle))   
    index = 0
    for c in cells:
        links = c.findAll("a")
        for l in links:
            if(l.has_attr('title')):
                if(index == 0): 
                    actress = l.find(text=True).encode('utf-8').strip()
                    actresses.append(actress)
                elif(index == 2): 
                    movie = l.find(text=True).encode('utf-8').strip()
                    films.append(movie)
        index = index+1  

In [38]:
# Here we extract the years. Because these awards started in 1936 all the years are in the correct format
years = []
for row in tables[0].findAll("th", scope="row"):
    cells = row.findAll('a')  
    if(len(cells) > 1):
        year = cells[0].find(text=True)
        years.append(year)

In [39]:
# Check all the lists are the same length, they are
print(len(actresses))
print(len(films))
print(len(years))

82
82
82


In [40]:
# Create a data frame for storing the lists
df_supActresses = pd.DataFrame()
df_supActresses["Year"] = pd.to_numeric(years, errors='coerce')
df_supActresses["Actress"] = actresses
df_supActresses["Film"] = films
# Print data frame
df_supActresses

Unnamed: 0,Year,Actress,Film
0,1936,Gale Sondergaard,Anthony Adverse
1,1937,Alice Brady,In Old Chicago
2,1938,Fay Bainter,Jezebel
3,1939,Hattie McDaniel,Gone with the Wind
4,1940,Jane Darwell,The Grapes of Wrath
5,1941,Mary Astor,The Great Lie
6,1942,Teresa Wright,Mrs. Miniver
7,1943,Katina Paxinou,For Whom the Bell Tolls
8,1944,Ethel Barrymore,None but the Lonely Heart
9,1945,Anne Revere,National Velvet


In [41]:
# Print the number of rows in each data frame
print(df_picture.shape[0])
print(df_directors.shape[0])
print(df_actors.shape[0])
print(df_actresses.shape[0])
print(df_supActors.shape[0])
print(df_supActresses.shape[0])

90
91
92
93
82
82


In [42]:
# Now all the data has been read into data frames, the next step is to connect to MYSQL and store it in the database
from sqlalchemy import create_engine    # for connecting to MySQL
from MySQL_connect import config        # import MySQL_connect.py for connection parameters

# use the parameters from file to create connection variables
user = config['user']
password = config['password']
host = config['host']
db = config['db']

# Connection object for MySQL
engine = create_engine("mysql+mysqldb://"+user+":"+password+"@"+host+"/"+db+"?charset=utf8")

In [43]:
# Insert the data from each data frame into tables. If the table already exists overwrite it
df_picture.to_sql('best_picture', con=engine, if_exists='replace', index_label='id')
df_directors.to_sql('best_director', con=engine, if_exists='replace', index_label='id')
df_actors.to_sql('best_actor', con=engine, if_exists='replace', index_label='id')
df_actresses.to_sql('best_actress', con=engine, if_exists='replace', index_label='id')
df_supActors.to_sql('best_supporting_actor', con=engine, if_exists='replace', index_label='id')
df_supActresses.to_sql('best_supporting_actress', con=engine, if_exists='replace', index_label='id')

In [44]:
# Show that the tables have been created
res = engine.execute("SHOW TABLES")
for x in res:
    print(x)

(u'best_actor',)
(u'best_actress',)
(u'best_director',)
(u'best_picture',)
(u'best_supporting_actor',)
(u'best_supporting_actress',)


In [45]:
# Now lets run some queries on the tables
# First return the number of row in each table
num_res = engine.execute("SELECT COUNT(*) FROM best_picture")
print("Best Picture number of rows")
for x in num_res:
    print(x)
num_res = engine.execute("SELECT COUNT(*) FROM best_director")
print("Best Director number of rows")
for x in num_res:
    print(x)
num_res = engine.execute("SELECT COUNT(*) FROM best_actor")
print("Best Actor number of rows")
for x in num_res:
    print(x)
num_res = engine.execute("SELECT COUNT(*) FROM best_actress")
print("Best Actress number of rows")
for x in num_res:
    print(x)
num_res = engine.execute("SELECT COUNT(*) FROM best_supporting_actor")
print("Best Supporting Actor number of rows")
for x in num_res:
    print(x)
num_res = engine.execute("SELECT COUNT(*) FROM best_supporting_actress")
print("Best Supporting Actress number of rows")
for x in num_res:
    print(x)

Best Picture number of rows
(90L,)
Best Director number of rows
(91L,)
Best Actor number of rows
(92L,)
Best Actress number of rows
(93L,)
Best Supporting Actor number of rows
(82L,)
Best Supporting Actress number of rows
(82L,)


In [46]:
from prettytable import PrettyTable    # allow us to output results of queries in table format

In [47]:
# Query: Return a list of actors who have won an Oscar. In descending order of wins
# Explained: We are using two tables - best_actor and best_supporting_actor. Therefore we use UNION inside a subquery to 
# combine the results of two queries into one set. Each query counts the total for each actor so we sum both totals to get 
# the total number of wins for each actor.
actor_mostWins = engine.execute("SELECT actor, SUM(total_wins) AS total_wins \
FROM (SELECT actor, COUNT(actor) AS total_wins \
FROM best_actor GROUP BY actor \
UNION SELECT actor, COUNT(actor) AS total_wins \
FROM best_supporting_actor GROUP BY actor) AS res \
GROUP BY actor ORDER BY total_wins DESC, actor ASC")

table = PrettyTable(['Actor', 'Wins'])
for x in actor_mostWins:
    table.add_row([x['actor'], x['total_wins']])
print(table)

+------------------------+------+
|         Actor          | Wins |
+------------------------+------+
|    Daniel Day-Lewis    |  3   |
|     Jack Nicholson     |  3   |
|     Walter Brennan     |  3   |
|     Anthony Quinn      |  2   |
|    Christoph Waltz     |  2   |
|     Dustin Hoffman     |  2   |
|     Emil Jannings      |  2   |
|     Fredric March      |  2   |
|      Gary Cooper       |  2   |
|     Jason Robards      |  2   |
|     Marlon Brando      |  2   |
|     Melvyn Douglas     |  2   |
|     Michael Caine      |  2   |
|     Peter Ustinov      |  2   |
|       Sean Penn        |  2   |
|     Spencer Tracy      |  2   |
|       Tom Hanks        |  2   |
|      Adrien Brody      |  1   |
|       Al Pacino        |  1   |
|       Alan Arkin       |  1   |
|     Alec Guinness      |  1   |
|    Anthony Hopkins     |  1   |
|       Art Carney       |  1   |
|    Barry Fitzgerald    |  1   |
|      Ben Johnson       |  1   |
|      Ben Kingsley      |  1   |
|    Benicio d

In [48]:
# Query: Return a list of actresses who have won an Oscar. In descending order of wins
# Explained: We are using two tables - best_actress and best_supporting_actress. Therefore we use UNION inside a subquery to 
# combine the results of two queries into one set. Each query counts the total for each actress so we sum both totals to get 
# the total number of wins for each actress.
actress_mostWins = engine.execute("SELECT actress, SUM(total_wins) AS total_wins \
FROM ( SELECT actress, COUNT(actress) AS total_wins \
FROM best_actress GROUP BY actress \
UNION SELECT actress, COUNT(actress) AS total_wins \
FROM best_supporting_actress GROUP BY actress) AS res \
GROUP BY actress ORDER BY total_wins DESC, actress ASC")

table = PrettyTable(['Actress', 'Wins'])
for x in actress_mostWins:
    table.add_row([x['actress'], x['total_wins']])
print(table)

+----------------------+------+
|       Actress        | Wins |
+----------------------+------+
|  Katharine Hepburn   |  4   |
|    Ingrid Bergman    |  3   |
|     Janet Gaynor     |  3   |
|     Meryl Streep     |  3   |
|     Bette Davis      |  2   |
|     Dianne Wiest     |  2   |
|   Elizabeth Taylor   |  2   |
|  Frances McDormand   |  2   |
|    Glenda Jackson    |  2   |
|     Hilary Swank     |  2   |
|      Jane Fonda      |  2   |
|     Jodie Foster     |  2   |
|     Luise Rainer     |  2   |
| Olivia de Havilland  |  2   |
|     Sally Field      |  2   |
|   Shelley Winters    |  2   |
|     Vivien Leigh     |  2   |
|     Alice Brady      |  1   |
|   Alicia Vikander    |  1   |
|    Allison Janney    |  1   |
|    Angelina Jolie    |  1   |
|   Anjelica Huston    |  1   |
|     Anna Magnani     |  1   |
|     Anna Paquin      |  1   |
|    Anne Bancroft     |  1   |
|     Anne Baxter      |  1   |
|    Anne Hathaway     |  1   |
|     Anne Revere      |  1   |
|    Aud

In [49]:
# Query: Return a list of directors who have won an Oscar. In descending order of wins
# Explained: Because the 1st Osars had winners in different categories we have to use substring to get rid of any brackets at
# the end of any names. With the names correct we can now get the total for each director
director_mostWins = engine.execute("SELECT director, SUM(num_wins) AS total_wins \
FROM (SELECT IF(SUBSTRING(director, LENGTH(director)) = ')', \
SUBSTRING(director, 1, POSITION('(' IN director) -1), director) AS director, \
COUNT( IF(SUBSTRING(director, LENGTH(director)) = ')', \
substring(director, 1, POSITION('(' IN Director) -1), Director)) AS num_wins \
FROM best_director GROUP BY Director ) AS res \
GROUP BY Director ORDER BY total_wins DESC, director ASC");

table = PrettyTable(['Director', 'Wins'])
for x in director_mostWins:
    table.add_row([x['director'], x['total_wins']])
print(table)

+------------------------------+------+
|           Director           | Wins |
+------------------------------+------+
|          John Ford           |  4   |
|         Frank Capra          |  3   |
|        William Wyler         |  3   |
|    Alejandro G. Iñárritu     |  2   |
|           Ang Lee            |  2   |
|         Billy Wilder         |  2   |
|        Clint Eastwood        |  2   |
|          David Lean          |  2   |
|          Elia Kazan          |  2   |
|        Frank Borzage         |  2   |
|         Frank Lloyd          |  2   |
|        Fred Zinnemann        |  2   |
|        George Stevens        |  2   |
|     Joseph L. Mankiewicz     |  2   |
|         Leo McCarey          |  2   |
|       Lewis Milestone        |  2   |
|         Miloš Forman         |  2   |
|         Oliver Stone         |  2   |
|       Steven Spielberg       |  2   |
|        Alfonso Cuarón        |  1   |
|      Anthony Minghella       |  1   |
|        Barry Levinson        |  1   |


  cursor.execute(statement, parameters)


In [50]:
# Query: Return a list of best picture winning directors/companies in descending order of total wins
# Explained: This query is difficult because the nominee column can have multiple names in it. We are interested in the first
# name as that is usually the director. The names can be seperated by either commas or 'and' so we use search for the position 
# of the first comma. If a comma is found then use that as the maximum index for the substring. If not then search for the 
# position of ' and ' and use that as the maximum index. Finally return the substring and use it for the count.
picture_mostWins = engine.execute("SELECT \
IF (POSITION(',' IN nominee) > 0, SUBSTRING(nominee, 1, POSITION(',' IN nominee) -1) , \
IF(POSITION(' and ' IN nominee) > 0, \
SUBSTRING(nominee, 1, POSITION(' and ' IN nominee)), nominee) \
) AS director, COUNT(nominee) AS total_wins \
FROM best_picture GROUP BY director ORDER BY total_wins DESC, director ASC")

table = PrettyTable(['Director/Company', 'Wins'])
for x in picture_mostWins:
    table.add_row([x['director'], x['total_wins']])
print(table)

+---------------------------------+------+
|         Director/Company        | Wins |
+---------------------------------+------+
|       Metro-Goldwyn-Mayer       |  5   |
|         20th Century-Fox        |  3   |
|           Sam Spiegel           |  3   |
|           Arthur Freed          |  2   |
|          Clint Eastwood         |  2   |
|             Columbia            |  2   |
|            Paramount            |  2   |
|           Robert Wise           |  2   |
|           Saul Zaentz           |  2   |
| Selznick International Pictures |  2   |
|           Warner Bros.          |  2   |
|          Adele Romanski         |  1   |
|         Albert S. Ruddy         |  1   |
|      Alejandro G. Iñárritu      |  1   |
|         Arnold Kopelson         |  1   |
|        Barrie M. Osborne        |  1   |
|          Barry Spikings         |  1   |
|           Billy Wilder          |  1   |
|         Blye Pagon Faust        |  1   |
|            Brad Pitt            |  1   |
|          