In [1]:
import pandas as pd
import json
import requests
from pandas.io.json import json_normalize
from bs4 import BeautifulSoup

The following cell is the code that scrapes Wikipedia for the movies that were nominated and won the Oscar for Best Picture. From this, we created the DataFrame, bestpicture that contains a movie's title, year, producers, and whether or not it won the award.

In [2]:
resp = requests.get("https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture")
soup = BeautifulSoup(resp.content, "html.parser")
tables = soup.find_all("table", {"class": "wikitable"})
rowsList = []
year = ""
for j in range(0, 10):
    table = tables[j]
    tbody = table.find("tbody")
    rows = tbody.find_all("tr")
    for i in range(1, len(rows)):
        cells = rows[i].find_all("td")
        if (len(cells) == 1):
            year = cells[0].text.split("(")[0].split("/")[0]
        if (len(cells) == 2):
            title = cells[0].find("a").text
            winner = False
            if rows[i].has_attr("style"):
                if rows[i]["style"].split(":")[1] == "#FAEB86":
                    winner = True
            producers = cells[1].text.replace("\n", "").replace("and ", "").split(",")

            rowsList.append({
                "Title": title,
                "producers":producers,
                "winner":winner,
                "year": year
            })
bestpicture = pd.DataFrame(rowsList)
bestpicture.head()    

Unnamed: 0,Title,producers,winner,year
0,Wings,[Paramount Famous Lasky],True,1927
1,The Racket,[The Caddo Company],False,1927
2,7th Heaven,[Fox],False,1927
3,The Broadway Melody,[Metro-Goldwyn-Mayer],True,1928
4,Alibi,[Feature Productions],False,1928


These next cells scrape Wikipedia for information about award shows that happen prior to the Oscars. The DataFrames that result include the title, year, and a column named after the award show, and whether or not the movie won.

In [3]:
resp = requests.get("https://en.wikipedia.org/wiki/BAFTA_Award_for_Best_Film")
soup = BeautifulSoup(resp.content, "html.parser")
tables = soup.find_all("table", {"class": "wikitable"})
rowsList = []
year = ""
for table in tables:
    tbody = table.find("tbody")
    rows = tbody.find_all("tr")
    for i in range(1, len(rows)):
        cells = rows[i].find_all("td")
        if (len(cells) == 1):
            year = cells[0].text.split("(")[0].split("/")[0]
        else:
            if (len(cells) == 5):
                title = cells[1].find("a").text
                winner = True
            if (len(cells) == 4):
                title = cells[0].find("a").text
                winner = False
                
            rowsList.append({
                "Title": title,
                "BAFTA":winner,
                "year": year
            })
bafta = pd.DataFrame(rowsList)
bafta.head()

Unnamed: 0,BAFTA,Title,year
0,True,The Best Years of Our Lives,1947
1,True,Hamlet,1948
2,False,Crossfire,1948
3,False,The Fallen Idol,1948
4,False,Monsieur Vincent,1948


In [4]:
resp = requests.get("https://en.wikipedia.org/wiki/National_Board_of_Review_Award_for_Best_Film")
soup = BeautifulSoup(resp.content, "html.parser")
tables = soup.find_all("table", {"class": "wikitable"})
rowsList = []
year = ""
for table in tables:
    tbody = table.find("tbody")
    rows = tbody.find_all("tr")
    for i in range(1, len(rows)):
        cells = rows[i].find_all("td")
        title = cells[1].text.replace(" ‡", "").replace(" §", "").replace(" †", "")
        year = cells[0].text
                
        rowsList.append({
            "Title": title,
            "NBRA": True,
            "year": year
        })
nbra = pd.DataFrame(rowsList)

In [5]:
resp = requests.get("https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Motion_Picture_%E2%80%93_Drama")
soup = BeautifulSoup(resp.content, "html.parser")
tables = soup.find_all("table", {"class": "wikitable"})
rowsList = []
year = ""
for table in tables:
    tbody = table.find("tbody")
    rows = tbody.find_all("tr")
    for i in range(1, len(rows)):
        cells = rows[i].find_all("td")
        if (len(cells) == 4):
            year = cells[0].text
            title = cells[1].find("a").text
            winner = True
        if (len(cells) == 3):
            title = cells[0].find("a").text
            winner = False
                
        rowsList.append({
            "Title": title,
            "GOLDENGLOBE":winner,
            "year": year
        })
goldenglobe = pd.DataFrame(rowsList)

In [6]:
resp = requests.get("https://en.wikipedia.org/wiki/Satellite_Award_for_Best_Film#Drama_%281996%E2%80%932009,_2018%29")
soup = BeautifulSoup(resp.content, "html.parser")
tables = soup.find_all("table", {"class": "wikitable"})
rowsList = []
year = ""
for table in tables:
    tbody = table.find("tbody")
    rows = tbody.find_all("tr")
    for i in range(1, len(rows)):
        cells = rows[i].find_all("td")
        if (len(cells) == 1):
            year = cells[0].text.split("\n")[0]
        else:
            if (len(cells) == 3):
                year = cells[0].text.split("\n")[0]
                title = cells[1].find("a").text
                winner = True
            if (len(cells) == 2):
                title = cells[0].find("a").text
                winner = False

            rowsList.append({
                "Title": title,
                "SATELLITE": winner,
                "year": year
            })
satellite = pd.DataFrame(rowsList)


In [7]:
resp = requests.get("https://en.wikipedia.org/wiki/Directors_Guild_of_America_Award_for_Outstanding_Directing_%E2%80%93_Feature_Film")
soup = BeautifulSoup(resp.content, "html.parser")
tables = soup.find_all("table", {"class": "wikitable"})
rowsList = []
year = ""
for table in tables:
    tbody = table.find("tbody")
    rows = tbody.find_all("tr")
    for i in range(1, len(rows)):
        cells = rows[i].find_all("td")
        if (len(cells) == 4):
            year = cells[0].text.split("(")[0]
            title = cells[2].find("a").text
            winner = True
        if (len(cells) == 2):
            title = cells[1].find("a").text
            winner = False

        rowsList.append({
            "Title": title,
            "DGA": winner,
            "year": year
        })
dga = pd.DataFrame(rowsList)

The following cells read in the CSVs movies.csv and tags.csv. These files contain information about movies and the tags that users gave them. We merged these 2 DataFrames on its movieId to get a DataFrame containing the movieId, title, and user-given tag.

In [8]:
movies = pd.read_csv("./movies.csv")
tags = pd.read_csv("./tags.csv")
movie_tags = movies.merge(tags, on=["movieId"])
movie_tags["Title"] = movie_tags.title.str.split(" ")
movie_tags["Title"] = movie_tags.Title.str[:-1]
movie_tags["Title"] = movie_tags["Title"].str.join(' ')
movie_tags = movie_tags.drop(["genres", "userId", "timestamp", "title"], axis=1)
movie_tags.head()

Unnamed: 0,movieId,tag,Title
0,1,pixar,Toy Story
1,1,pixar,Toy Story
2,1,fun,Toy Story
3,2,fantasy,Jumanji
4,2,magic board game,Jumanji


Since there are multiple rows for each movie, we had to join the rows and separate the tags by commas.

In [9]:
movie_tags_joined = pd.DataFrame(movie_tags.groupby(movie_tags['Title']).tag.apply(lambda a: ", ".join(a)))
movie_tags_joined = movie_tags_joined.reset_index()

We are using an API that queries data from IMDB. With a for loop, we are going through the movie titles we have in the DataFrams, bestpicture, to create the request urls. We now have a new DataFrame, imdb, that contains information about each movie.

In [10]:
dfs = []
for name in bestpicture["Title"]:
    name = name.replace(" ", "+")
    resp = requests.get("http://www.omdbapi.com/?t=" + name + "&plot=full&apikey=8de699a6")
    movie_json = resp.json()
    movie_df = json_normalize(movie_json)
    dfs.append(movie_df)

In [11]:
imdb = pd.concat(dfs, ignore_index=True, sort=True)

For all the DataFrames we collected, we merged them and created the DataFrame, oscar_movies. We also dropped some unnecessary columns. We exported this DataFrame into a CSV to use in other files.

In [12]:
oscar_movies = pd.concat([bestpicture, imdb.drop(["Title"], axis=1)], axis=1)
oscar_movies = oscar_movies.drop(["DVD", "Error", "Poster", "Type", "Website", 
                                  "totalSeasons", "imdbID", "Awards", "Actors",
                                 "Writer", "Released", "Response", "Metascore", 
                                  "Language"], axis=1)

oscar_movies = oscar_movies.merge(bafta.drop(["year"], axis=1), on=["Title"], how="left")
oscar_movies = oscar_movies.merge(nbra.drop(["year"], axis=1), on=["Title"], how="left")
oscar_movies = oscar_movies.merge(goldenglobe.drop(["year"], axis=1), on=["Title"], how="left")
oscar_movies = oscar_movies.merge(satellite.drop(["year"], axis=1), on=["Title"], how="left")
oscar_movies = oscar_movies.merge(dga.drop(["year"], axis=1), on=["Title"], how="left")
oscar_movies = oscar_movies.merge(movie_tags_joined, on=["Title"], how="left")
oscar_movies.BAFTA.fillna(False, inplace=True)
oscar_movies.NBRA.fillna(False, inplace=True)
oscar_movies.GOLDENGLOBE.fillna(False, inplace=True)
oscar_movies.SATELLITE.fillna(False, inplace=True)
oscar_movies.DGA.fillna(False, inplace=True)
oscar_movies.tag.fillna("none", inplace=True)
oscar_movies.to_csv(path_or_buf="./oscar_movies.csv", index=False)
oscar_movies.tail()

Unnamed: 0,Title,producers,winner,year,BoxOffice,Country,Director,Genre,Plot,Production,...,Runtime,Year,imdbRating,imdbVotes,BAFTA,NBRA,GOLDENGLOBE,SATELLITE,DGA,tag
553,The Favourite,"[Ceci Dempsey, Ed Guiney, Lee Magiday Yorgos...",False,2018,,"Ireland, UK, USA",Yorgos Lanthimos,"Biography, Comedy, Drama, History","In early 18th century England, a frail Queen A...","Fox Searchlight Pictures, Film4 and Waypoint",...,119 min,2018,7.8,64262,False,False,False,False,False,none
554,Roma,[Gabriela Rodríguez Alfonso Cuarón],False,2018,,"Mexico, USA",Alfonso Cuarón,Drama,A year in the life of a middle-class family's ...,,...,135 min,2018,7.9,89226,True,False,False,False,True,none
555,A Star Is Born,"[Bill Gerber, Bradley Cooper Lynette Howell T...",False,2018,,USA,Bradley Cooper,"Drama, Music, Romance","Jackson Maine (Cooper), a country music star o...",Warner Bros. Pictures,...,136 min,2018,7.9,186913,False,False,False,False,False,none
556,A Star Is Born,"[Bill Gerber, Bradley Cooper Lynette Howell T...",False,2018,,USA,Bradley Cooper,"Drama, Music, Romance","Jackson Maine (Cooper), a country music star o...",Warner Bros. Pictures,...,136 min,2018,7.9,186913,False,False,False,False,False,none
557,Vice,"[Dede Gardner, Jeremy Kleiner, Adam McKay Ke...",False,2018,,USA,Adam McKay,"Biography, Comedy, Drama","The story of Dick Cheney, an unassuming bureau...",Annapurna Pictures,...,132 min,2018,7.2,41254,False,False,False,False,False,none


In [13]:
url = "https://www.esquire.com/entertainment/movies/g24561951/best-movies-of-2019/"
resp = requests.get(url)
soup = BeautifulSoup(resp.content, "html.parser")  
soup   
titles = soup.find_all("span", {"class": "listicle-slide-hed-text"})
movies_2019 = pd.DataFrame()
for title in titles:
    name = title.text.replace(" ", "+")
    resp = requests.get("http://www.omdbapi.com/?t=" + name + "&plot=full&apikey=8de699a6")
    movie_json = resp.json()
    movie_df = json_normalize(movie_json)
    movies_2019 = movies_2019.append(movie_df)
    
movies_2019 = movies_2019.drop(["DVD", "Poster", "Type", "Website", 
                                  "totalSeasons", "imdbID", "Awards", "Actors",
                                 "Writer", "Released", "Response", "Metascore",
                                  "Language"], axis=1)
movies_2019 = movies_2019[movies_2019['Error'] != "Movie not found!"]
movies_2019.to_csv(path_or_buf="./movies_2019.csv", index=False)
movies_2019.tail()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Unnamed: 0,BoxOffice,Country,Director,Error,Genre,Plot,Production,Rated,Ratings,Runtime,Title,Year,imdbRating,imdbVotes
0,"$169,137",India,Shirish Kunder,,"Comedy, Family, Sci-Fi",In 1947 when the maps of India and Pakistan we...,UTV Communications,Not Rated,"[{'Source': 'Internet Movie Database', 'Value'...",104 min,Joker,2012,2.6,4491.0
0,,USA,John Crowley,,Drama,A boy in New York is taken in by a wealthy Upp...,Warner Bros. Pictures,,[],,The Goldfinch,2019,,
0,,USA,Marielle Heller,,"Biography, Drama",Two-time Oscar®-winner Tom Hanks portrays Mist...,Sony Pictures,,[],,A Beautiful Day in the Neighborhood,2019,,
0,,UK,John Guillermin,,"Crime, Drama, Mystery","Based on the Agatha Christie novel, our favour...",EMI Films Ltd.,PG,"[{'Source': 'Internet Movie Database', 'Value'...",140 min,Death on the Nile,1978,7.3,24401.0
0,,USA,J.J. Abrams,,"Action, Adventure, Fantasy, Sci-Fi",The final chapter of the saga from a galaxy fa...,Lucasfilm,,[],,Star Wars: Episode IX,2019,,
