In [1]:
import pandas as pd
import matplotlib as plt
import json
import requests
from pprint import pprint

<h2>Scraping Base Data (Top 100 Grossing Movies)

In [61]:
# Scrape the base data from the-numbers.com (for Top 100 Grossing Movies)
response = requests.get("https://www.the-numbers.com/box-office-records/domestic/all-movies/cumulative/all-time")
base_df_1 = pd.read_html(response.text)[0]
base_df_2 = pd.read_html(response.text)[1]
base_df = base_df_1.append(base_df_2).reset_index(drop=True)

# Fix cut off titles
base_df = base_df.replace(base_df.iloc[74]['Movie'], 'Pirates of the Caribbean: The Curse of the Black Pearl')
base_df = base_df.replace(base_df.iloc[87]['Movie'], 'The Chronicles of Narnia: The Lion, the Witch and the Wardrobe')

base_df = base_df.rename(columns = {"Movie":"title", "Released": "year"})

base_df.head()

Unnamed: 0,Rank,year,title,DomesticBox Office,InternationalBox Office,WorldwideBox Office
0,1,2015,Star Wars Ep. VII: The Force Awakens,"$936,662,225","$1,116,648,995","$2,053,311,220"
1,2,2009,Avatar,"$760,507,625","$2,015,837,654","$2,776,345,279"
2,3,2018,Black Panther,"$700,059,566","$648,300,000","$1,348,359,566"
3,4,2018,Avengers: Infinity War,"$678,815,482","$1,369,988,242","$2,048,803,724"
4,5,1997,Titanic,"$659,363,944","$1,548,844,451","$2,208,208,395"


In [45]:
#Extra API key: cd87cd45
url = 'http://www.omdbapi.com/?apikey=8cc26962&r=json'

'http://www.omdbapi.com/?apikey=8cc26962&r=json'

<h2>Scraping IMDB Data from OMDB API

In [87]:
imdb_dicts=[]

for index, row in base_df.iterrows():

    title = row['title']
    year = row['year']
    
    title = title.replace('â', '\'')
    
    # Change title formatting to IMDB formatting for special cases
    if 'Star Wars' in title:
        title = title.replace('Ep.', 'Episode').replace(':', ' -')
        if 'Episode VII ' in title:
            title = 'Star Wars: Episode VII'
    if 'Harry Potter' in title:
        if 'II' in title:
            title = title.replace('II', '2')
        elif 'I' in title:
            title = title.replace('I', '1')
        
    try: 
        # Prepare URL
        query_url = f"{url}&t={title}&y={year}"
        
        # Make the call to the API
        omdb_dict = requests.get(query_url).json()
        
        # Append data to the list of dictionaries
        imdb_dicts.append({"title": row['title'],
                          "year": omdb_dict['Year'],
                          "genre": omdb_dict['Genre'],
                          "director": omdb_dict['Director'],
                          "main_cast": omdb_dict['Actors'],
                          "writer": omdb_dict['Writer'],
                          "language": omdb_dict['Language'],
                          "country": omdb_dict['Country'],
                          "awards": omdb_dict['Awards'],
                          "imdbRating": omdb_dict['imdbRating'],
                          "imdbVotes": omdb_dict['imdbVotes'],
                          "box_office": omdb_dict['BoxOffice'],
                          "studio": omdb_dict['Production']})
    except:
        print(title, year)
    
imdb_df = pd.DataFrame(imdb_dicts)
imdb_df = imdb_df[["title", "year", "genre", "director", "main_cast", "writer", \
                    "language", "country", "awards", "imdbRating", "imdbVotes", \
                    "box_office", "studio"]]
imdb_df.head()

Unnamed: 0,title,year,genre,director,main_cast,writer,language,country,awards,imdbRating,imdbVotes,box_office,studio
0,Star Wars Ep. VII: The Force Awakens,2015,Short,,Freddie Kuguru,,English,USA,,,,,
1,Avatar,2009,"Action, Adventure, Fantasy, Sci-Fi",James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",James Cameron,"English, Spanish","UK, USA",Won 3 Oscars. Another 85 wins & 128 nominations.,7.8,1013715.0,"$749,700,000",20th Century Fox
2,Black Panther,2018,"Action, Adventure, Sci-Fi",Ryan Coogler,"Chadwick Boseman, Michael B. Jordan, Lupita Ny...","Ryan Coogler, Joe Robert Cole, Stan Lee (based...","Swahili, Nama, English, Xhosa, Korean","USA, South Africa, South Korea, Australia",14 nominations.,7.4,430289.0,"$501,105,037",Marvel Studios
3,Avengers: Infinity War,2018,"Action, Adventure, Fantasy, Sci-Fi","Anthony Russo, Joe Russo","Robert Downey Jr., Chris Hemsworth, Mark Ruffa...","Christopher Markus (screenplay by), Stephen Mc...",English,USA,,8.5,550743.0,"$664,987,816",Walt Disney Pictures
4,Titanic,1997,"Drama, Romance",James Cameron,"Leonardo DiCaprio, Kate Winslet, Billy Zane, K...",James Cameron,"English, Swedish",USA,Won 11 Oscars. Another 111 wins & 77 nominations.,7.8,926635.0,,Paramount Pictures


In [88]:
new_base_df = base_df.copy() # delete later on merge

# Merge with Base Table
imdb_df['year'] = pd.to_numeric(imdb_df['year'], errors='ignore')
new_base_df2 = pd.merge(new_base_df, imdb_df, how='right', on=['title','year'])
new_base_df2

Unnamed: 0,Rank,year,title,DomesticBox Office,InternationalBox Office,WorldwideBox Office,genre,director,main_cast,writer,language,country,awards,imdbRating,imdbVotes,box_office,studio
0,1,2015,Star Wars Ep. VII: The Force Awakens,"$936,662,225","$1,116,648,995","$2,053,311,220",Short,,Freddie Kuguru,,English,USA,,,,,
1,2,2009,Avatar,"$760,507,625","$2,015,837,654","$2,776,345,279","Action, Adventure, Fantasy, Sci-Fi",James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",James Cameron,"English, Spanish","UK, USA",Won 3 Oscars. Another 85 wins & 128 nominations.,7.8,1013715,"$749,700,000",20th Century Fox
2,3,2018,Black Panther,"$700,059,566","$648,300,000","$1,348,359,566","Action, Adventure, Sci-Fi",Ryan Coogler,"Chadwick Boseman, Michael B. Jordan, Lupita Ny...","Ryan Coogler, Joe Robert Cole, Stan Lee (based...","Swahili, Nama, English, Xhosa, Korean","USA, South Africa, South Korea, Australia",14 nominations.,7.4,430289,"$501,105,037",Marvel Studios
3,4,2018,Avengers: Infinity War,"$678,815,482","$1,369,988,242","$2,048,803,724","Action, Adventure, Fantasy, Sci-Fi","Anthony Russo, Joe Russo","Robert Downey Jr., Chris Hemsworth, Mark Ruffa...","Christopher Markus (screenplay by), Stephen Mc...",English,USA,,8.5,550743,"$664,987,816",Walt Disney Pictures
4,5,1997,Titanic,"$659,363,944","$1,548,844,451","$2,208,208,395","Drama, Romance",James Cameron,"Leonardo DiCaprio, Kate Winslet, Billy Zane, K...",James Cameron,"English, Swedish",USA,Won 11 Oscars. Another 111 wins & 77 nominations.,7.8,926635,,Paramount Pictures
5,6,2015,Jurassic World,"$652,270,625","$996,584,239","$1,648,854,864","Action, Adventure, Sci-Fi",Colin Trevorrow,"Chris Pratt, Bryce Dallas Howard, Irrfan Khan,...","Rick Jaffa (screenplay by), Amanda Silver (scr...",English,USA,14 wins & 57 nominations.,7.0,525152,"$528,757,749",Universal Pictures
6,7,2012,The Avengers,"$623,279,547","$894,656,350","$1,517,935,897","Action, Adventure, Sci-Fi",Joss Whedon,"Robert Downey Jr., Chris Evans, Mark Ruffalo, ...","Joss Whedon (screenplay), Zak Penn (story), Jo...","English, Russian, Hindi",USA,Nominated for 1 Oscar. Another 38 wins & 79 no...,8.1,1136922,"$623,279,547",Walt Disney Pictures
7,8,2017,Star Wars Ep. VIII: The Last Jedi,"$620,181,382","$696,583,402","$1,316,764,784","Action, Adventure, Fantasy, Sci-Fi",Rian Johnson,"Mark Hamill, Carrie Fisher, Adam Driver, Daisy...","Rian Johnson, George Lucas (based on character...",English,USA,Nominated for 4 Oscars. Another 12 wins & 71 n...,7.2,438214,"$619,117,636",Walt Disney Pictures
8,9,2018,Incredibles 2,"$608,581,744","$634,000,000","$1,242,581,744","Animation, Action, Adventure, Comedy, Family, ...",Brad Bird,"Craig T. Nelson, Holly Hunter, Sarah Vowell, H...",Brad Bird,English,USA,,7.8,159661,,Disney/Pixar
9,10,2008,The Dark Knight,"$533,720,947","$468,275,260","$1,001,996,207","Action, Crime, Drama, Thriller",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...","Jonathan Nolan (screenplay), Christopher Nolan...","English, Mandarin","USA, UK",Won 2 Oscars. Another 152 wins & 155 nominations.,9.0,2005606,"$533,316,061",Warner Bros. Pictures/Legendary


<h2>Insert Into MongoDB

In [56]:
import pymongo
conn = "mongodb://127.0.0.1:27017"
client = pymongo.MongoClient(conn)
db = client["movies"]
db.imdb.insert_many(imdb_dicts)
imdb_data = db.imdb.find()
for data in imdb_data:
    pprint(data)

{'_id': ObjectId('5c42a38eb520d90c43bccf0e'),
 'awards': 'N/A',
 'box_office': 'N/A',
 'country': 'USA',
 'director': 'N/A',
 'genre': 'Short',
 'imdbRating': 'N/A',
 'imdbVotes': 'N/A',
 'language': 'English',
 'main_cast': 'Freddie Kuguru',
 'studio': 'N/A',
 'title': 'Star Wars: Episode VII - Toys',
 'writer': 'N/A',
 'year': '2015'}
{'_id': ObjectId('5c42a38eb520d90c43bccf0f'),
 'awards': 'Won 3 Oscars. Another 85 wins & 128 nominations.',
 'box_office': '$749,700,000',
 'country': 'UK, USA',
 'director': 'James Cameron',
 'genre': 'Action, Adventure, Fantasy, Sci-Fi',
 'imdbRating': '7.8',
 'imdbVotes': '1,013,715',
 'language': 'English, Spanish',
 'main_cast': 'Sam Worthington, Zoe Saldana, Sigourney Weaver, Stephen Lang',
 'studio': '20th Century Fox',
 'title': 'Avatar',
 'writer': 'James Cameron',
 'year': '2009'}
{'_id': ObjectId('5c42a38eb520d90c43bccf10'),
 'awards': '14 nominations.',
 'box_office': '$501,105,037',
 'country': 'USA, South Africa, South Korea, Australia',


 'language': 'English',
 'main_cast': 'Daniel Radcliffe, Michael Gambon, Dave Legeno, Elarica Johnson',
 'studio': 'Warner Bros. Pictures',
 'title': 'Harry Potter and the Half-Blood Prince',
 'writer': 'Steve Kloves (screenplay), J.K. Rowling (novel)',
 'year': '2009'}
{'_id': ObjectId('5c42a38eb520d90c43bccf5c'),
 'awards': '22 wins & 34 nominations.',
 'box_office': '$300,523,113',
 'country': 'USA',
 'director': 'David Slade',
 'genre': 'Adventure, Drama, Fantasy, Romance',
 'imdbRating': '5.0',
 'imdbVotes': '207,159',
 'language': 'English',
 'main_cast': 'Xavier Samuel, Kristen Stewart, Robert Pattinson, Billy Burke',
 'studio': 'Summit Entertainment',
 'title': 'The Twilight Saga: Eclipse',
 'writer': 'Melissa Rosenberg (screenplay), Stephenie Meyer (novel)',
 'year': '2010'}
{'_id': ObjectId('5c42a38eb520d90c43bccf5d'),
 'awards': '20 wins & 24 nominations.',
 'box_office': '$296,593,070',
 'country': 'USA',
 'director': 'Chris Weitz',
 'genre': 'Adventure, Drama, Fantasy, Rom