In [1]:
# This project involves using Beautiful Soup to extract information relating to Oscar winners from Wikipedia pages. In total 
# six different pages will be accessed in order to obtain the following:
# 1. Best Picture 
# 2. Best Director
# 3. Best Actor
# 4. Best Actress
# 5. Best Supporting Actor
# 6. Best Supporting Actress
# Once the data has been retrieved from each page, it will be stored in dataframes. I will then connect to MySQL, create tables
# and insert the data there. Finally I will run queries on the database to provide useful insight.



In [2]:
import urllib2;                    # for retrieving the contents of a page
from bs4 import BeautifulSoup;     # for parsing the contents of a page to HTML
import pandas as pd                # for creating dataframes

In [3]:
# function that takes in a URL and returns the contents of the page
def loadPage(url):
    page = urllib2.urlopen(url)
    data = BeautifulSoup(page, "lxml")
    return data

In [4]:
# Page 1- Best Picture
# Call function to read in URL and retrieve data from the page
data = loadPage("https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture")
# from the data get all the tables that have the following class
tables = data.findAll("table", class_= "wikitable")

In [5]:
# There are tables for each decade starting from the 1920's. We want to go through each table and extract the appropriate information
# Create empty lists for each film and nominee
films = []
nominees = []
# There are ten tables so we use range to keep track 
for i in range(0, 11):
    # For each table get each row with the following style
    for row in tables[i].findAll("tr", style="background:#FAEB86"):
        # Get each cell within that row
        cells = row.findAll('td')
        # Get the text from within the first cell, encode it to utf-8 and add it to the films list
        if(len(cells) > 0):        
            film = cells[0].find(text=True).encode('utf-8').strip()
            films.append(film)
            # Get the text from within the second cell, encode it and add it to the list of nominees
        if(len(cells) > 1):      
            nominee = cells[1].findAll(text=True)[:-1]
            nominees.append("".join(nominee).encode('utf-8').strip() )    

In [6]:
# Here we extract the years. Years can be different formats depending on how far back they are ie. 1927/28, 1927/1928, 1928
# Create an empty list for each year
years = []
for i in range(0, 11):  
    # For each table row get each cell with the following style
    for row in tables[i].findAll("td", style="text-align:center"):
            # Get each link within that row
            cells = row.findAll('a')
            # If year is all in one link and in a format like 1927/28 then we want the first two and last two charaters so 
            # that it will be 1928
            if(len(cells[0].find(text=True)) == 7):
                year = cells[0].find(text=True)[0:2] + cells[0].find(text=True)[5:7]
            # If year is in two links and the second link has four numbers ie. 1927/1928 then we want the second link
            elif(len(cells[1].find(text=True)) == 4):       
                if (cells[1].find(text=True).isnumeric()):
                    year = cells[1].find(text=True)
            # If the year is in two links and the second link has two numbers ie. 1927/28 then we want the first two characters
            # of the first link and the last two of the second link
            elif(len(cells[1].find(text=True)) == 2):
                if (cells[1].find(text=True).isnumeric()):
                    year = cells[0].find(text=True)[0:2] + cells[1].find(text=True)
            # Else the year is in a standard format ie. 1928 so get four characters
            else:
                year = cells[0].find(text=True)[0:4]
            # add year to list of years
            years.append(year)

In [7]:
# Check the length of each list. They are all the same
print(len(films))
print(len(nominees))
print(len(years))

90
90
90


In [8]:
# Create a data frame for storing each list
df_picture = pd.DataFrame()
df_picture["Year"] = pd.to_numeric(years, errors='coerce')
df_picture["Film"] = films
df_picture["Nominee"] = nominees
df_picture

Unnamed: 0,Year,Film,Nominee
0,1928,Wings,Paramount Famous Lasky
1,1929,The Broadway Melody,Metro-Goldwyn-Mayer
2,1930,All Quiet on the Western Front,Universal
3,1931,Cimarron,RKO Radio
4,1932,Grand Hotel,Metro-Goldwyn-Mayer
5,1933,Cavalcade,Fox
6,1934,It Happened One Night,Columbia
7,1935,Mutiny on the Bounty,Metro-Goldwyn-Mayer
8,1936,The Great Ziegfeld,Metro-Goldwyn-Mayer
9,1937,The Life of Emile Zola,Warner Bros.


In [9]:
# Page2 - Best Director
data = loadPage("https://en.wikipedia.org/wiki/Academy_Award_for_Best_Director")
tables = data.findAll("table", class_= "wikitable sortable")

In [10]:
# Here we extract each film and its director. All the data is stored in one table.
# Create empty lists for storing each film and director
films =[]
directors = []
# Get each row within the table
for row in tables[0].findAll("tr"):
        # Get each cell within that row with the following style
        cells = row.findAll("td", style="background:#FAEB86;")
        # For the first year there was two categories of winners so we want to include that information. It is inside a span tag.
        # FindAll() gets all the text within the cell including child tags. 
        #Finally use join() to merge the text into one string and add it to the list of direcors
        if(len(cells) > 0):
            director = cells[0].findAll(text=True)[:-1]
            directors.append("".join(director).encode('utf-8').strip())  
        # Get the film from the second cell and add it to the list of films
        if(len(cells) > 1):
            film = cells[1].find(text=True).encode('utf-8').strip() 
            films.append(film)

In [11]:
# Here we extract the years.
years = []
for row in tables[0].findAll("th", scope="row"):
    cells = row.findAll('a')     
    if(len(cells[0].find(text=True)) == 7):
        year = cells[0].find(text=True)[0:2] + cells[0].find(text=True)[5:7]
    elif(len(cells[1].find(text=True)) == 4):       
        if (cells[1].find(text=True).isnumeric()):
            year = cells[1].find(text=True)  
    elif(len(cells[1].find(text=True)) == 2):
        if (cells[1].find(text=True).isnumeric()):
            year = cells[0].find(text=True)[0:2] + cells[1].find(text=True)
    else:
        year = cells[0].find(text=True)[0:4]
    years.append(year)

In [12]:
# Check the length of each list. Due to the 1st Oscars having multiple awards they are not the same length
print(len(films))
print(len(directors))
print(len(years))

91
91
90


In [13]:
# To amend this lets insert a year at the position 1 with the same value as the position 0
years.insert(1, years[0])

In [14]:
# Now they are all the same length
print(len(films))
print(len(directors))
print(len(years))

91
91
91


In [15]:
# Create a data frame for storing each list and print it
df_directors = pd.DataFrame()
df_directors["Year"] = pd.to_numeric(years, errors='coerce')
df_directors["Director"] = directors
df_directors["Film"] = films
# Look at the data frame.
df_directors

Unnamed: 0,Year,Director,Film
0,1928,Frank Borzage (Dramatic),7th Heaven
1,1928,Lewis Milestone (Comedy),Two Arabian Knights
2,1929,Frank Lloyd,The Divine Lady
3,1930,Lewis Milestone,All Quiet on the Western Front
4,1931,Norman Taurog,Skippy
5,1932,Frank Borzage,Bad Girl
6,1933,Frank Lloyd,Cavalcade
7,1934,Frank Capra,It Happened One Night
8,1935,John Ford,The Informer
9,1936,Frank Capra,Mr. Deeds Goes to Town


In [16]:
# Page 3- Best Actor
data = loadPage("https://en.wikipedia.org/wiki/Academy_Award_for_Best_Actor")
tables = data.findAll("table", class_= "wikitable sortable")

In [17]:
# Here we extract each actor and film
films =[]
actors = []
for row in tables[0].findAll("tr"):
    cells = row.findAll("td", style="background:#FAEB86;")    
    # Index variable to keep track of the cell we are in
    index = 0
    for c in cells:
        # Get all the links within each cell
        links = c.findAll("a")
        for l in links:
            # If link has the attribute 'title' then we want it
            if(l.has_attr('title')):
                # If it is the first cell then get the text and add it to the list of actors
                if(index == 0): 
                    actor = l.find(text=True).encode('utf-8').strip()
                    actors.append(actor)
                # If it is the third cell then get the text and add it to the list of films
                elif(index == 2): 
                    film = l.find(text=True).encode('utf-8').strip()
                    films.append(film)
        # increment index
        index = index+1         
        

In [18]:
# Here we extract the years
years = []
for row in tables[0].findAll("th", scope="row"):
    cells = row.findAll('a')     
    if(len(cells[0].find(text=True)) == 7):
        year = cells[0].find(text=True)[0:2] + cells[0].find(text=True)[5:7]
    elif(len(cells[1].find(text=True)) == 4):       
        if (cells[1].find(text=True).isnumeric()):
            year = cells[1].find(text=True)  
    elif(len(cells[1].find(text=True)) == 2):
        if (cells[1].find(text=True).isnumeric()):
            year = cells[0].find(text=True)[0:2] + cells[1].find(text=True)
    else:
        year = cells[0].find(text=True)[0:4]
    years.append(year)

In [19]:
# Check the length of each list. They are not the same. There are two reasons for this.
# 1. In the 1st Oscars the actor won awards for two films
# 2. In 1932 there was a tie for 1st place
print(len(films))
print(len(actors))
print(len(years))

92
91
90


In [20]:
# To amend this lets insert the actor at position 0 into position 1 and insert the appropriate years at position 1 and 6 of years
actors.insert(1, actors[0])
years.insert(1, years[0])
years.insert(6, years[5])

In [21]:
# Now they are all the same length
print(len(films))
print(len(actors))
print(len(years))

92
92
92


In [22]:
# Create a data frame for storing the lists
df_actors = pd.DataFrame()
df_actors["Year"] = pd.to_numeric(years, errors='coerce')
df_actors["Actor"] = actors
df_actors["Film"] = films
# Print the data frame and look at years 1928 and 1932
df_actors

Unnamed: 0,Year,Actor,Film
0,1928,Emil Jannings,The Last Command
1,1928,Emil Jannings,The Way of All Flesh
2,1929,Warner Baxter,In Old Arizona
3,1930,George Arliss,Disraeli
4,1931,Lionel Barrymore,A Free Soul
5,1932,Wallace Beery,The Champ
6,1932,Fredric March,Dr. Jekyll and Mr. Hyde
7,1933,Charles Laughton,The Private Life of Henry VIII
8,1934,Clark Gable,It Happened One Night
9,1935,Victor McLaglen,The Informer


In [23]:
# Page 4- Best Actress
data = loadPage("https://en.wikipedia.org/wiki/Academy_Award_for_Best_Actress")
tables = data.findAll("table", class_= "wikitable sortable")

In [24]:
# Here we extract each actress and film
films =[]
actresses = []
for row in tables[0].findAll("tr"):
    cells = row.findAll("td", style="background:#FAEB86;")   
    index = 0
    for c in cells:
        links = c.findAll("a")
        for l in links:
            if(l.has_attr('title')):
                if(index == 0): 
                    actress = l.find(text=True).encode('utf-8').strip()
                    actresses.append(actress)
                elif(index == 2): 
                    movie = l.find(text=True).encode('utf-8').strip()
                    films.append(movie)
        index = index+1         

In [25]:
# Here we extract each year
years = []
for row in tables[0].findAll("th", scope="row"):
    cells = row.findAll('a')     
    if(len(cells[0].find(text=True)) == 7):
        year = cells[0].find(text=True)[0:2] + cells[0].find(text=True)[5:7]
    elif(len(cells[1].find(text=True)) == 4):       
        if (cells[1].find(text=True).isnumeric()):
            year = cells[1].find(text=True)  
    elif(len(cells[1].find(text=True)) == 2):
        if (cells[1].find(text=True).isnumeric()):
            year = cells[0].find(text=True)[0:2] + cells[1].find(text=True)
    else:
        year = cells[0].find(text=True)[0:4] 
    years.append(year)

In [26]:
# Check the length of each list. They are not the same. There are two reasons for this.
# 1. In the 1st Oscars the actress won awards for three films
# 2. In 1968 there was a tie for 1st place
print(len(actresses))
print(len(films))
print(len(years))

91
93
90


In [27]:
# Lets amend this by inserting the actress at the position 0 to position 1 and 2. Do the same for the years
# Also insert the year at position 40 into position 41
actresses.insert(1, actresses[0])
actresses.insert(2, actresses[1])
years.insert(41, years[40])
years.insert(1, years[0])
years.insert(2, years[1])

In [28]:
# Now they are all the same length
print(len(actresses))
print(len(films))
print(len(years))

93
93
93


In [29]:
# Create a data frame for storing the lists
df_actresses = pd.DataFrame()
df_actresses["Year"] = pd.to_numeric(years, errors='coerce')
df_actresses["Actress"] = actresses
df_actresses["Film"] = films
# Print the dataframe and look at the first three indexes which have the same year and actress but different films
df_actresses

Unnamed: 0,Year,Actress,Film
0,1928,Janet Gaynor,7th Heaven
1,1928,Janet Gaynor,Street Angel
2,1928,Janet Gaynor,Sunrise: A Song of Two Humans
3,1929,Mary Pickford,Coquette
4,1930,Norma Shearer,The Divorcee
5,1931,Marie Dressler,Min and Bill
6,1932,Helen Hayes,The Sin of Madelon Claudet
7,1933,Katharine Hepburn,Morning Glory
8,1934,Claudette Colbert,It Happened One Night
9,1935,Bette Davis,Dangerous


In [30]:
# Now check the year 1968 and see two actresses and films
df_actresses.loc[df_actresses['Year'] == 1968]

Unnamed: 0,Year,Actress,Film
42,1968,Katharine Hepburn,The Lion in Winter
43,1968,Barbra Streisand,Funny Girl


In [31]:
# Page 5 - Best Supporting Actor
data = loadPage("https://en.wikipedia.org/wiki/Academy_Award_for_Best_Supporting_Actor")
tables = data.findAll("table", class_= "wikitable sortable")

In [32]:
# Here we extract each actor and film
films =[]
actors = []
for row in tables[0].findAll("tr"):
    cells = row.findAll("td", style="background:#FAEB86")   
    index = 0
    for c in cells:
        links = c.findAll("a")
        for l in links:
            if(l.has_attr('title')):
                if(index == 0): 
                    actor = l.find(text=True).encode('utf-8').strip()
                    actors.append(actor)
                elif(index == 2): 
                    movie = l.find(text=True).encode('utf-8').strip()
                    films.append(movie)
        index = index+1  

In [33]:
# Here we extract the years. Because these awards started in 1936 all the years are in the correct format
years = []
for row in tables[0].findAll("th", scope="row"):
    cells = row.findAll('a') 
    if(len(cells) > 1):        
        year = cells[0].find(text=True)[0:4] 
        years.append(year)

In [34]:
# Check all lists are the same length, they are
print(len(actors))
print(len(films))
print(len(years))

82
82
82


In [35]:
# Create a data frame for storing the lists
df_supActors = pd.DataFrame()
df_supActors["Year"] = pd.to_numeric(years, errors='coerce')
df_supActors["Actor"] = actors
df_supActors["Film"] = films
# Print data frame
df_supActors

Unnamed: 0,Year,Actor,Film
0,1936,Walter Brennan,Come and Get It
1,1937,Joseph Schildkraut,The Life of Emile Zola
2,1938,Walter Brennan,Kentucky
3,1939,Thomas Mitchell,Stagecoach
4,1940,Walter Brennan,The Westerner
5,1941,Donald Crisp,How Green Was My Valley
6,1942,Van Heflin,Johnny Eager
7,1943,Charles Coburn,The More the Merrier
8,1944,Barry Fitzgerald,Going My Way
9,1945,James Dunn,A Tree Grows in Brooklyn


In [36]:
# Page 6 - Best Supporting Actress
data = loadPage("https://en.wikipedia.org/wiki/Academy_Award_for_Best_Supporting_Actress")
tables = data.findAll("table", class_= "wikitable sortable")

In [37]:
# Here we extract each actress and film
films =[]
actresses = []
for row in tables[0].findAll("tr"):
    cells = row.findAll("td", style="background:#FAEB86;")   
    index = 0
    for c in cells:
        links = c.findAll("a")
        for l in links:
            if(l.has_attr('title')):
                if(index == 0): 
                    actress = l.find(text=True).encode('utf-8').strip()
                    actresses.append(actress)
                elif(index == 2): 
                    movie = l.find(text=True).encode('utf-8').strip()
                    films.append(movie)
        index = index+1  

In [38]:
# Here we extract the years. Because these awards started in 1936 all the years are in the correct format
years = []
for row in tables[0].findAll("th", scope="row"):
    cells = row.findAll('a')  
    if(len(cells) > 1):
        year = cells[0].find(text=True)
        years.append(year)

In [39]:
# Check all the lists are the same length, they are
print(len(actresses))
print(len(films))
print(len(years))

82
82
82


In [40]:
# Create a data frame for storing the lists
df_supActresses = pd.DataFrame()
df_supActresses["Year"] = pd.to_numeric(years, errors='coerce')
df_supActresses["Actress"] = actresses
df_supActresses["Film"] = films
# Print data frame
df_supActresses

Unnamed: 0,Year,Actress,Film
0,1936,Gale Sondergaard,Anthony Adverse
1,1937,Alice Brady,In Old Chicago
2,1938,Fay Bainter,Jezebel
3,1939,Hattie McDaniel,Gone with the Wind
4,1940,Jane Darwell,The Grapes of Wrath
5,1941,Mary Astor,The Great Lie
6,1942,Teresa Wright,Mrs. Miniver
7,1943,Katina Paxinou,For Whom the Bell Tolls
8,1944,Ethel Barrymore,None but the Lonely Heart
9,1945,Anne Revere,National Velvet


In [41]:
# Print the number of rows in each data frame
print(df_picture.shape[0])
print(df_directors.shape[0])
print(df_actors.shape[0])
print(df_actresses.shape[0])
print(df_supActors.shape[0])
print(df_supActresses.shape[0])

90
91
92
93
82
82


In [42]:
# Now all the data has been read into data frames, the next step is to connect to MYSQL and store it in the database
from sqlalchemy import create_engine    # for connecting to MySQL

# The following variables are used to connect to MySQL. Set your own values for user, password and db
user = ''
password = ''
host = 'localhost'
db = ''

# Connection object for MySQL
engine = create_engine("mysql+mysqldb://"+user+":"+password+"@"+host+"/"+db+"?charset=utf8")

In [43]:
# Insert the data from each data frame into tables. If the table already exists overwrite it
df_picture.to_sql('best_picture', con=engine, if_exists='replace', index_label='id')
df_directors.to_sql('best_director', con=engine, if_exists='replace', index_label='id')
df_actors.to_sql('best_actor', con=engine, if_exists='replace', index_label='id')
df_actresses.to_sql('best_actress', con=engine, if_exists='replace', index_label='id')
df_supActors.to_sql('best_supporting_actor', con=engine, if_exists='replace', index_label='id')
df_supActresses.to_sql('best_supporting_actress', con=engine, if_exists='replace', index_label='id')

In [44]:
# Show that the tables havebeen created
res = engine.execute("SHOW TABLES")
for x in res:
    print x

(u'best_actor',)
(u'best_actress',)
(u'best_director',)
(u'best_picture',)
(u'best_supporting_actor',)
(u'best_supporting_actress',)
(u'customers',)
(u'products',)
(u'users',)


In [45]:
# Now lets run some queries on the tables
# First return the number of row in each table
num_res = engine.execute("SELECT COUNT(*) FROM best_picture")
print("Best Picture number of rows")
for x in num_res:
    print x
num_res = engine.execute("SELECT COUNT(*) FROM best_director")
print("Best Director number of rows")
for x in num_res:
    print x
num_res = engine.execute("SELECT COUNT(*) FROM best_actor")
print("Best Actor number of rows")
for x in num_res:
    print x
num_res = engine.execute("SELECT COUNT(*) FROM best_actress")
print("Best Actress number of rows")
for x in num_res:
    print x
num_res = engine.execute("SELECT COUNT(*) FROM best_supporting_actor")
print("Best Supporting Actor number of rows")
for x in num_res:
    print x
num_res = engine.execute("SELECT COUNT(*) FROM best_supporting_actress")
print("Best Supporting Actress number of rows")
for x in num_res:
    print x

Best Picture number of rows
(90L,)
Best Director number of rows
(91L,)
Best Actor number of rows
(92L,)
Best Actress number of rows
(93L,)
Best Supporting Actor number of rows
(82L,)
Best Supporting Actress number of rows
(82L,)


In [46]:
# Return all the rows from each table
table_pictures = engine.execute("SELECT * FROM best_picture")
for x in table_pictures:
    print x

(0L, 1928L, u'Wings', u'Paramount Famous Lasky')
(1L, 1929L, u'The Broadway Melody', u'Metro-Goldwyn-Mayer')
(2L, 1930L, u'All Quiet on the Western Front', u'Universal')
(3L, 1931L, u'Cimarron', u'RKO Radio')
(4L, 1932L, u'Grand Hotel', u'Metro-Goldwyn-Mayer')
(5L, 1933L, u'Cavalcade', u'Fox')
(6L, 1934L, u'It Happened One Night', u'Columbia')
(7L, 1935L, u'Mutiny on the Bounty', u'Metro-Goldwyn-Mayer')
(8L, 1936L, u'The Great Ziegfeld', u'Metro-Goldwyn-Mayer')
(9L, 1937L, u'The Life of Emile Zola', u'Warner Bros.')
(10L, 1938L, u"You Can't Take It with You", u'Columbia')
(11L, 1939L, u'Gone with the Wind', u'Selznick International Pictures')
(12L, 1940L, u'Rebecca', u'Selznick International Pictures')
(13L, 1941L, u'How Green Was My Valley', u'20th Century-Fox')
(14L, 1942L, u'Mrs. Miniver', u'Metro-Goldwyn-Mayer')
(15L, 1943L, u'Casablanca', u'Warner Bros.')
(16L, 1944L, u'Going My Way', u'Paramount')
(17L, 1945L, u'The Lost Weekend', u'Paramount')
(18L, 1946L, u'The Best Years of Ou

In [47]:
table_directors = engine.execute("SELECT * FROM best_director")
for x in table_directors:
    print x

(0L, 1928L, u'Frank Borzage (Dramatic)', u'7th Heaven')
(1L, 1928L, u'Lewis Milestone (Comedy)', u'Two Arabian Knights')
(2L, 1929L, u'Frank Lloyd', u'The Divine Lady')
(3L, 1930L, u'Lewis Milestone', u'All Quiet on the Western Front')
(4L, 1931L, u'Norman Taurog', u'Skippy')
(5L, 1932L, u'Frank Borzage', u'Bad Girl')
(6L, 1933L, u'Frank Lloyd', u'Cavalcade')
(7L, 1934L, u'Frank Capra', u'It Happened One Night')
(8L, 1935L, u'John Ford', u'The Informer')
(9L, 1936L, u'Frank Capra', u'Mr. Deeds Goes to Town')
(10L, 1937L, u'Leo McCarey', u'The Awful Truth')
(11L, 1938L, u'Frank Capra', u"You Can't Take It with You")
(12L, 1939L, u'Victor Fleming', u'Gone with the Wind')
(13L, 1940L, u'John Ford', u'The Grapes of Wrath')
(14L, 1941L, u'John Ford', u'How Green Was My Valley')
(15L, 1942L, u'William Wyler', u'Mrs. Miniver')
(16L, 1943L, u'Michael Curtiz', u'Casablanca')
(17L, 1944L, u'Leo McCarey', u'Going My Way')
(18L, 1945L, u'Billy Wilder', u'The Lost Weekend')
(19L, 1946L, u'William W

In [48]:
table_actors = engine.execute("SELECT * FROM best_actor")
for x in table_actors:
    print x

(0L, 1928L, u'Emil Jannings', u'The Last Command')
(1L, 1928L, u'Emil Jannings', u'The Way of All Flesh')
(2L, 1929L, u'Warner Baxter', u'In Old Arizona')
(3L, 1930L, u'George Arliss', u'Disraeli')
(4L, 1931L, u'Lionel Barrymore', u'A Free Soul')
(5L, 1932L, u'Wallace Beery', u'The Champ')
(6L, 1932L, u'Fredric March', u'Dr. Jekyll and Mr. Hyde')
(7L, 1933L, u'Charles Laughton', u'The Private Life of Henry VIII')
(8L, 1934L, u'Clark Gable', u'It Happened One Night')
(9L, 1935L, u'Victor McLaglen', u'The Informer')
(10L, 1936L, u'Paul Muni', u'The Story of Louis Pasteur')
(11L, 1937L, u'Spencer Tracy', u'Captains Courageous')
(12L, 1938L, u'Spencer Tracy', u'Boys Town')
(13L, 1939L, u'Robert Donat', u'Goodbye, Mr. Chips')
(14L, 1940L, u'James Stewart', u'The Philadelphia Story')
(15L, 1941L, u'Gary Cooper', u'Sergeant York')
(16L, 1942L, u'James Cagney', u'Yankee Doodle Dandy')
(17L, 1943L, u'Paul Lukas', u'Watch on the Rhine')
(18L, 1944L, u'Bing Crosby', u'Going My Way')
(19L, 1945L, 

In [49]:
table_actresses = engine.execute("SELECT * FROM best_actress")
for x in table_actresses:
    print x

(0L, 1928L, u'Janet Gaynor', u'7th Heaven')
(1L, 1928L, u'Janet Gaynor', u'Street Angel')
(2L, 1928L, u'Janet Gaynor', u'Sunrise: A Song of Two Humans')
(3L, 1929L, u'Mary Pickford', u'Coquette')
(4L, 1930L, u'Norma Shearer', u'The Divorcee')
(5L, 1931L, u'Marie Dressler', u'Min and Bill')
(6L, 1932L, u'Helen Hayes', u'The Sin of Madelon Claudet')
(7L, 1933L, u'Katharine Hepburn', u'Morning Glory')
(8L, 1934L, u'Claudette Colbert', u'It Happened One Night')
(9L, 1935L, u'Bette Davis', u'Dangerous')
(10L, 1936L, u'Luise Rainer', u'The Great Ziegfeld')
(11L, 1937L, u'Luise Rainer', u'The Good Earth')
(12L, 1938L, u'Bette Davis', u'Jezebel')
(13L, 1939L, u'Vivien Leigh', u'Gone with the Wind')
(14L, 1940L, u'Ginger Rogers', u'Kitty Foyle')
(15L, 1941L, u'Joan Fontaine', u'Suspicion')
(16L, 1942L, u'Greer Garson', u'Mrs. Miniver')
(17L, 1943L, u'Jennifer Jones', u'The Song of Bernadette')
(18L, 1944L, u'Ingrid Bergman', u'Gaslight')
(19L, 1945L, u'Joan Crawford', u'Mildred Pierce')
(20L, 1

In [50]:
table_supActors = engine.execute("SELECT * FROM best_supporting_actor")
for x in table_supActors:
    print x

(0L, 1936L, u'Walter Brennan', u'Come and Get It')
(1L, 1937L, u'Joseph Schildkraut', u'The Life of Emile Zola')
(2L, 1938L, u'Walter Brennan', u'Kentucky')
(3L, 1939L, u'Thomas Mitchell', u'Stagecoach')
(4L, 1940L, u'Walter Brennan', u'The Westerner')
(5L, 1941L, u'Donald Crisp', u'How Green Was My Valley')
(6L, 1942L, u'Van Heflin', u'Johnny Eager')
(7L, 1943L, u'Charles Coburn', u'The More the Merrier')
(8L, 1944L, u'Barry Fitzgerald', u'Going My Way')
(9L, 1945L, u'James Dunn', u'A Tree Grows in Brooklyn')
(10L, 1946L, u'Harold Russell', u'The Best Years of Our Lives')
(11L, 1947L, u'Edmund Gwenn', u'Miracle on 34th Street')
(12L, 1948L, u'Walter Huston', u'The Treasure of the Sierra Madre')
(13L, 1949L, u'Dean Jagger', u"Twelve O'Clock High")
(14L, 1950L, u'George Sanders', u'All About Eve')
(15L, 1951L, u'Karl Malden', u'A Streetcar Named Desire')
(16L, 1952L, u'Anthony Quinn', u'Viva Zapata!')
(17L, 1953L, u'Frank Sinatra', u'From Here to Eternity')
(18L, 1954L, u"Edmond O'Brien

In [51]:
table_supActresses = engine.execute("SELECT * FROM best_supporting_actress")
for x in table_supActresses:
    print x

(0L, 1936L, u'Gale Sondergaard', u'Anthony Adverse')
(1L, 1937L, u'Alice Brady', u'In Old Chicago')
(2L, 1938L, u'Fay Bainter', u'Jezebel')
(3L, 1939L, u'Hattie McDaniel', u'Gone with the Wind')
(4L, 1940L, u'Jane Darwell', u'The Grapes of Wrath')
(5L, 1941L, u'Mary Astor', u'The Great Lie')
(6L, 1942L, u'Teresa Wright', u'Mrs. Miniver')
(7L, 1943L, u'Katina Paxinou', u'For Whom the Bell Tolls')
(8L, 1944L, u'Ethel Barrymore', u'None but the Lonely Heart')
(9L, 1945L, u'Anne Revere', u'National Velvet')
(10L, 1946L, u'Anne Baxter', u"The Razor's Edge")
(11L, 1947L, u'Celeste Holm', u"Gentleman's Agreement")
(12L, 1948L, u'Claire Trevor', u'Key Largo')
(13L, 1949L, u'Mercedes McCambridge', u"All the King's Men")
(14L, 1950L, u'Josephine Hull', u'Harvey')
(15L, 1951L, u'Kim Hunter', u'A Streetcar Named Desire')
(16L, 1952L, u'Gloria Grahame', u'The Bad and the Beautiful')
(17L, 1953L, u'Donna Reed', u'From Here to Eternity')
(18L, 1954L, u'Eva Marie Saint', u'On the Waterfront')
(19L, 19

In [52]:
# Return a list of actors who have won more than 1 Oscar. In descending order
actor_mostWins = engine.execute("SELECT actor, COUNT(actor) AS total FROM best_actor GROUP BY actor HAVING total > 1 ORDER BY total DESC")
for x in actor_mostWins:
    print x

(u'Daniel Day-Lewis', 3L)
(u'Emil Jannings', 2L)
(u'Spencer Tracy', 2L)
(u'Fredric March', 2L)
(u'Gary Cooper', 2L)
(u'Marlon Brando', 2L)
(u'Dustin Hoffman', 2L)
(u'Tom Hanks', 2L)
(u'Jack Nicholson', 2L)
(u'Sean Penn', 2L)


In [53]:
# Return a list of actresses who have won more than 1 Oscar. In descending order
actress_mostWins = engine.execute("SELECT actress, COUNT(actress) AS total FROM best_actress GROUP BY actress HAVING total > 1 ORDER BY total DESC")
for x in actress_mostWins:
    print x

(u'Katharine Hepburn', 4L)
(u'Janet Gaynor', 3L)
(u'Luise Rainer', 2L)
(u'Bette Davis', 2L)
(u'Olivia de Havilland', 2L)
(u'Vivien Leigh', 2L)
(u'Ingrid Bergman', 2L)
(u'Elizabeth Taylor', 2L)
(u'Glenda Jackson', 2L)
(u'Jane Fonda', 2L)
(u'Sally Field', 2L)
(u'Jodie Foster', 2L)
(u'Hilary Swank', 2L)
(u'Meryl Streep', 2L)
(u'Frances McDormand', 2L)


In [54]:
# Return a list of directors who have won more than 1 Oscar. In descending order
director_mostWins = engine.execute("SELECT director, COUNT(director) AS total FROM best_director GROUP BY director HAVING total > 1 ORDER BY total DESC")
for x in director_mostWins:
    print x

(u'John Ford', 4L)
(u'Frank Capra', 3L)
(u'William Wyler', 3L)
(u'Frank Lloyd', 2L)
(u'Leo McCarey', 2L)
(u'Joseph L. Mankiewicz', 2L)
(u'Elia Kazan', 2L)
(u'George Stevens', 2L)
(u'Billy Wilder', 2L)
(u'David Lean', 2L)
(u'Fred Zinnemann', 2L)
(u'Milo\u0161 Forman', 2L)
(u'Oliver Stone', 2L)
(u'Steven Spielberg', 2L)
(u'Clint Eastwood', 2L)
(u'Ang Lee', 2L)
(u'Alejandro G. I\xf1\xe1rritu', 2L)


In [55]:
# Return a list of supporting actors who have won more than 1 Oscar. In descending order
supActor_mostWins = engine.execute("SELECT actor, COUNT(actor) AS total FROM best_supporting_actor GROUP BY actor HAVING total > 1 ORDER BY total DESC")
for x in supActor_mostWins:
    print x

(u'Walter Brennan', 3L)
(u'Anthony Quinn', 2L)
(u'Peter Ustinov', 2L)
(u'Jason Robards', 2L)
(u'Melvyn Douglas', 2L)
(u'Michael Caine', 2L)
(u'Christoph Waltz', 2L)


In [56]:
# Return a list of supporting actresses who have won more than 1 Oscar. In descending order
actress_mostWins = engine.execute("SELECT actress, COUNT(actress) AS total FROM best_supporting_actress GROUP BY actress HAVING total > 1 ORDER BY total DESC")
for x in actress_mostWins:
    print x

(u'Shelley Winters', 2L)
(u'Dianne Wiest', 2L)


In [57]:
# Return a list of best picture winning directors/companies in descending order of total wins
# This query is more complex because the nominee column can have multiple names in it. We are interested in the first name as
# that is usually the director. The names can be seperated by commas or 'and' so we use search for the position of the first comma.
# If a comma is found then use that as the maximum index for the substring. If not then search for the position of ' and ' and
# use that as the maximum index. Return the substring
picture_mostWins = engine.execute("SELECT \
IF (POSITION(',' IN nominee) > 0 , SUBSTRING(nominee, 1, POSITION(',' IN nominee) -1) , \
IF(POSITION(' and ' IN nominee) > 0, SUBSTRING(nominee, 1, POSITION(' and ' IN nominee)), nominee) \
) AS nominee, COUNT(nominee) AS total \
FROM best_picture GROUP BY nominee ORDER BY total DESC")
for x in picture_mostWins:
    print x

(u'Metro-Goldwyn-Mayer', 5L)
(u'20th Century-Fox', 3L)
(u'Sam Spiegel', 3L)
(u'Columbia', 2L)
(u'Selznick International Pictures', 2L)
(u'Warner Bros.', 2L)
(u'Paramount', 2L)
(u'Arthur Freed', 2L)
(u'Robert Wise', 2L)
(u'Saul Zaentz', 2L)
(u'Paramount Famous Lasky', 1L)
(u'Universal', 1L)
(u'RKO Radio', 1L)
(u'Fox', 1L)
(u'Samuel Goldwyn Productions', 1L)
(u'J. Arthur Rank-Two Cities Films', 1L)
(u'Robert Rossen Productions', 1L)
(u'Cecil B. DeMille', 1L)
(u'Buddy Adler', 1L)
(u'Harold Hecht', 1L)
(u'Michael Todd', 1L)
(u'Sam Zimbalist', 1L)
(u'Billy Wilder', 1L)
(u'Tony Richardson', 1L)
(u'Jack L. Warner', 1L)
(u'Fred Zinnemann', 1L)
(u'Walter Mirisch', 1L)
(u'John Woolf', 1L)
(u'Jerome Hellman', 1L)
(u'Frank McCarthy', 1L)
(u"Philip D'Antoni", 1L)
(u'Albert S. Ruddy', 1L)
(u'Tony Bill', 1L)
(u'Francis Ford Coppola', 1L)
(u'Michael Douglas ', 1L)
(u'Irwin Winkler ', 1L)
(u'Charles H. Joffe', 1L)
(u'Barry Spikings', 1L)
(u'Stanley R. Jaffe', 1L)
(u'Ronald L. Schwary', 1L)
(u'David Put

  cursor.execute(statement, parameters)
