In [1]:
from bs4 import BeautifulSoup
import requests
import time

In [2]:
# function to replace multiple different substrings with one
# particular substring in a given string
def replace_multiple(string, to_replace_list, replacement):
    """Return ``string`` with each entry of ``to_replace_list`` replaced.

    The substitutions are applied one after another, in list order, so a
    later target may match text produced by an earlier substitution.

    Parameters
    ----------
    string : str
        Text to clean up.
    to_replace_list : iterable of str
        Substrings to replace.
    replacement : str
        Text substituted for every occurrence of every target.

    Returns
    -------
    str
        The text after all substitutions.
    """
    result = string
    for target in to_replace_list:
        result = result.replace(target, replacement)
    return result

In [None]:
# get a list of links to movies on box office mojo
# takes in a list of years you want movies from
def retrieve_movie_links(years):
    """Collect links to movie pages from Box Office Mojo's yearly charts.

    For every year in ``years`` the chart pages are requested one at a time
    (page 1, 2, ...) until a page yields no movie links. One second is
    slept after every request.

    Parameters
    ----------
    years : iterable
        Years to request charts for; each value is formatted into the URL.

    Returns
    -------
    list
        Every ``href`` containing ``'movies'`` found inside table cells,
        de-duplicated per page and kept in the order it was first seen on
        that page.
    """
    movie_links = []  # all movie links collected so far
    for year in years:
        print("in for loop, year = ", year)
        page = 1
        # Always fetch page 1, then keep paging until a page has no movie links.
        while True:
            print("in while loop, page = ", page)
            html_page = requests.get('https://www.boxofficemojo.com/yearly/chart/?page={}&view=releasedate&view2=domestic&yr={}&p=.htm'.format(page, year))

            # Pass the page contents to Beautiful Soup for parsing
            soup = BeautifulSoup(html_page.content, 'html.parser')

            # links found on the current page only
            links = []
            for row in soup.find_all('tr'):
                for cell in row.findChildren('td'):
                    for anchor in cell.findChildren('a'):
                        href = anchor.get('href')
                        # str() guards against anchors without an href (None)
                        if 'movies' in str(href):
                            links.append(href)

            # The same link can be reached more than once on a page; drop the
            # repeats but keep first-seen order (a set would scramble it).
            movie_links.extend(dict.fromkeys(links))

            # increase page count
            page += 1
            print("page is ", page, "links:", not links)
            time.sleep(1)

            # an empty page means we ran past the last chart page for this year
            if not links:
                break
    return movie_links

# loop through the list (which will have hrefs, I'm assuming)
# for each link, get the title, genre,

# Gather the movie links for the 2017-2019 yearly charts (this issues the
# HTTP requests), then show the resulting list as the cell output.
stuff = retrieve_movie_links([2017, 2018, 2019])
stuff

In [None]:
# scrapes each page on box office mojo for the title and budget,
# given a list of links
def scrape_each_movie_page(movie_links):
    """Scrape title and budget from each Box Office Mojo movie page.

    Parameters
    ----------
    movie_links : list
        Site-relative links; each is appended to
        ``https://www.boxofficemojo.com`` to build the request URL.

    Returns
    -------
    list of dict
        One ``{'title': ..., 'budget': ...}`` dict per page that could be
        parsed. Pages with too few ``<b>`` elements are reported and skipped.
    """
    movies = []
    total = len(movie_links)
    for position, movie in enumerate(movie_links):

        # show progress every 10 links; enumerate gives the true position
        # even when the same link appears more than once
        if position % 10 == 0:
            print("progress:", position, "/", total)

        # gets movie page
        html_page = requests.get('https://www.boxofficemojo.com{}'.format(movie))

        # Pass the page contents to beautiful soup for parsing
        soup = BeautifulSoup(html_page.content, 'html.parser')

        # find all the elements with the tag 'b', which is where all the good info is
        findings = list(soup.find_all('b'))

        # holds our info (title & budget) for the movie we're currently looking at
        movie_dict = {}

        try:
            movie_dict['title'] = replace_multiple(str(findings[1]), ['<b>', '</b>', '<br/>'], '')

            # NOTE(review): presumably the 9th <b> holds the MPAA rating on
            # pages that list one, pushing the budget to the 10th; otherwise
            # the 9th <b> is the budget itself. The marker test is a loose
            # substring heuristic -- confirm against the live page layout.
            candidate = str(findings[8])
            looks_like_rating = any(
                marker in candidate for marker in ('PG', 'R', 'G', 'M', 'Unrated')
            )
            budget_cell = findings[9] if looks_like_rating else findings[8]
            movie_dict['budget'] = replace_multiple(str(budget_cell), ['<b>', '</b>'], '')
            movies.append(movie_dict)
        except IndexError:
            # page had fewer <b> elements than expected; skip it
            print("Index error with", movie)

        # sleep btwn each movie so as to not get blocked by BOM
        time.sleep(.5)

    return movies

# Scrape title/budget for every link gathered above (one request per link).
mojo_info = scrape_each_movie_page(stuff)

In [None]:
# import json
# # takes the info from box office mojo we got and 
# # puts it into a json file 
# with open('mojo_data.json', 'w') as fp:
#     json.dump(mojo_info, fp)

In [39]:
import re

# give function a string of money like "$.5 million" and will return 500000
def money_string_to_int(money_string):
    """Convert a money string such as ``"$4.5 million"`` to whole dollars.

    Commas are removed, the first number in the string (which may have a
    decimal part, e.g. ``"4.5"`` or ``".5"``) is read, and it is scaled by
    the trailing word ``million`` or ``billion`` when present.

    Parameters
    ----------
    money_string : str
        Text such as ``"$25 million"``, ``"$.5 million"``, ``"$500,000"``
        or ``"N/A"``.

    Returns
    -------
    int or str
        The amount as an integer, or, when the string contains no digits
        (e.g. ``"N/A"``), the comma-stripped input string unchanged.
    """
    # no commas pls (delete all commas)
    money_string = money_string.replace(',', '')

    # first number in the string, allowing a decimal part ("25", "4.5", ".5")
    match = re.search(r'\d*\.?\d+', money_string)

    # if the string entered isn't an amount of money (ie "abc" or "n/a"),
    # just give it back to them
    if match is None:
        return money_string

    # the last word says how to scale the number; anything other than
    # million/billion means the amount is already written out in full
    words = money_string.split()
    scale_word = words[-1].lower() if words else ''
    multiplier = {'million': 10**6, 'billion': 10**9}.get(scale_word, 1)

    # round() absorbs float noise such as 1.1 * 10**6 == 1100000.0000000002
    return int(round(float(match.group()) * multiplier))

In [18]:
# function for making titles with years in them searchable on OMDB
def find_year_in_title(title):
    """Split a 201x year out of a title so the title is searchable on OMDB.

    Parameters
    ----------
    title : str
        Movie title, possibly containing a year such as ``"(2017)"``.

    Returns
    -------
    tuple or str
        ``(year, cleaned_title)`` when a year 2010-2019 appears anywhere in
        the title, where ``cleaned_title`` has every exact ``"(<year>)"``
        occurrence removed (other text, including surrounding spaces, is
        left as is). When no such year is present, the title string itself
        is returned.
    """
    # raw string: '\d' in a plain literal is an invalid escape sequence
    found = re.search(r'(201\d)', title)

    # no year: give the title back untouched
    if found is None:
        return title

    year = found.group(1)
    return (year, title.replace("(" + year + ")", ""))

In [19]:
# get movie titles for OMDB call
import json 
# Load the scraped Box Office Mojo records. The context manager closes the
# file handle as soon as the JSON has been parsed.
with open('mojo_data.json') as f:
    data = json.load(f)

# Titles only, in file order, for the OMDB lookups.
movie_titles = []
for movie in data:
    movie_titles.append(movie['title'])

data

[{'title': "Tyler Perry's Boo 2! A Madea Halloween", 'budget': '$25 million'},
 {'title': "Daddy's Home 2", 'budget': '$69 million'},
 {'title': 'Everything, Everything', 'budget': '$10 million'},
 {'title': 'Monster Trucks', 'budget': 'N/A'},
 {'title': 'How to be a Latin Lover', 'budget': 'N/A'},
 {'title': 'The Nut Job 2:Nutty by Nature', 'budget': '$40 million'},
 {'title': 'American Assassin', 'budget': '$33 million'},
 {'title': 'Annabelle:Creation', 'budget': '$15 million'},
 {'title': 'Star Wars:The Last Jedi', 'budget': '$317 million'},
 {'title': 'Snatched', 'budget': '$42 million'},
 {'title': 'Baby Driver', 'budget': '$34 million'},
 {'title': 'Resident Evil:The Final Chapter', 'budget': '$40 million'},
 {'title': 'Get Out', 'budget': '$4.5 million'},
 {'title': 'Ferdinand', 'budget': '$111 million'},
 {'title': 'Kong:Skull Island', 'budget': '$185 million'},
 {'title': 'Pitch Perfect 3', 'budget': '$45 million'},
 {'title': 'Rough Night', 'budget': '$20 million'},
 {'title

In [81]:
# DOES NOT WORK YET!!!!!!!!!!!!!
# makes an API call to OMDB with movie titles.
def OMDB_call(titles):
    """Look up each title on the OMDB API and collect the JSON responses.

    Parameters
    ----------
    titles : iterable of str
        Movie titles, sent one per request as the ``t`` query parameter.

    Returns
    -------
    list
        The decoded JSON body of every response, in the order of ``titles``.
    """
    # NOTE(review): the API key is hard-coded in the URL below; consider
    # loading it from configuration or an environment variable instead.
    all_titles = []
    for title in titles:
        # pass the title as a plain string (it used to be wrapped in a set,
        # which only worked because requests iterates parameter values)
        parameters = {'t': title}
        response = requests.get('http://www.omdbapi.com/?apikey=a80b8c36', params=parameters)
        data = response.json()
        all_titles.append(data)
        # pause between requests to go easy on the API
        time.sleep(.5)
    return all_titles


In [84]:
# Try the OMDB lookup with a single title; the returned list is the cell output.
OMDB_call(['Head full of honey'])

[{'Title': 'Head Full of Honey',
  'Year': '2014',
  'Rated': 'N/A',
  'Released': '25 Dec 2014',
  'Runtime': '139 min',
  'Genre': 'Comedy, Drama, Family',
  'Director': 'Til Schweiger, Lars Gmehling(co-director)',
  'Writer': 'Hilly Martinek (screenplay), Til Schweiger (screenplay)',
  'Actors': 'Pasquale Aleardi, Marco Boriero, Alexa Brunner, Belinda Busch',
  'Plot': "Before Tilda's parents can put her beloved grandfather in an old people's home due to his progressing Alzheimer disease, she takes him on one last adventure that subliminally threatens to tear her family apart.",
  'Language': 'German, Italian',
  'Country': 'Germany',
  'Awards': '6 wins.',
  'Poster': 'https://m.media-amazon.com/images/M/MV5BMjQwMzQ1MDE4OV5BMl5BanBnXkFtZTgwNjg0ODMyNjE@._V1_SX300.jpg',
  'Ratings': [{'Source': 'Internet Movie Database', 'Value': '6.6/10'}],
  'Metascore': 'N/A',
  'imdbRating': '6.6',
  'imdbVotes': '5,484',
  'imdbID': 'tt3488462',
  'Type': 'movie',
  'DVD': 'N/A',
  'BoxOffice': 

In [75]:
# update the budget column in the rows that're in the db right now
def update_budget(db_name, budget_list_with_titles):
    """Write scraped budgets into ``main_movie_table``.

    Relies on the module-level ``cursor`` and ``cnx`` objects being open.

    Parameters
    ----------
    db_name : str
        Database to select. It is formatted directly into a ``USE``
        statement, so it must come from a trusted source.
    budget_list_with_titles : iterable of dict
        Records with ``'title'`` and ``'budget'`` keys; ``'budget'`` is a
        money string understood by ``money_string_to_int``.

    Returns
    -------
    None
    """
    # select the database once rather than once per row
    cursor.execute("USE {}".format(db_name))

    for movie in budget_list_with_titles:
        # reformat budget to be an integer
        budget = money_string_to_int(movie['budget'])

        # money_string_to_int hands non-amounts ("N/A", "Unknown", ...) back
        # as strings; there is nothing to store for those
        if isinstance(budget, str):
            continue

        # now we can update the budget in the table
        # NOTE(review): matches on the scraped title as-is -- confirm that
        # bom_title in the table uses the same formatting.
        cursor.execute("""UPDATE main_movie_table
                           SET budget = %s
                           WHERE bom_title = %s
                           """, (budget, movie['title']))

    # commit all updates together
    cnx.commit()
    return

In [76]:
# connect to AWS
import mysql.connector
import config

# Open the MySQL connection using the settings from the config module.
cnx = mysql.connector.connect(
        host = config.host,
        user = config.user,
        passwd = config.password)

# try/finally so the cursor and connection are released even if the
# update fails part-way through
try:
    # Establish cursor
    cursor = cnx.cursor()
    try:
        update_budget("Mod_1_Project", data)
    finally:
        cursor.close()
finally:
    cnx.close()

Tyler Perry's Boo Two! A Madea Halloween
Daddy's Home Two
Everything, Everything
Monster Trucks
How to be a Latin Lover
The Nut Job Two: Nutty by Nature
American Assassin
Annabelle: Creation
Star Wars: The Last Jedi
Snatched
Baby Driver
Resident Evil: The Final Chapter
Get Out
Ferdinand
Kong: Skull Island
Pitch Perfect Three
Rough Night
Transformers: The Last Knight
Fist Fight
The Dark Tower
American Made
Underworld: Blood Wars
Darkest Hour
The Foreigner
Home Again
A Dog's Purpose
The Hitman's Bodyguard
Justice League
Life 
Rings
The Fate of the Furious
The Mountain Between Us
Murder on the Orient Express 
King Arthur: Legend of the Sword
Smurfs: The Lost Village
Geostorm
Dunkirk
Pirates of the Caribbean: Dead Men Tell No Tales
Valerian and the City of a Thousand Planets
Going in Style 
Power Rangers 
Alien: Covenant
All the Money in the World
A Bad Moms Christmas
Ghost in the Shell 
Wonder Woman
Victoria and Abdul
John Wick: Chapter Two
Downsizing
Molly's Game
Phantom Thread
The Shack

The Settlers 
Just One Drop
Dina
Munna Michael
My Journey Through French Cinema
Napping Princess
Tragedy Girls
Karl Marx City
Like Crazy 
Some Like It Hot (Qing Shung)
Tickling Giants
Chapter &amp; Verse
God of War
Saving Banksy
Worlds Apart
XX 
Manolo: The Boy Who Made Shoes for Lizards
Fabricated City
So B. It
Aida's Secrets
The Teacher
The Paris Opera
Il Boom (2017 re-release)
Maurice (2017 reissue)
Alive and Kicking
Harold and Lillian: A Hollywood Love Story
Super Dark Times
Le Trou (2017 re-release)
78/52: Hitchcock's Shower Scene
A Woman's Life
Heart Blackened
Panique (2017 re-release)
Bolshoi Ballet: Hero of our Time
Keep Quiet
The Death of Louis XIV
I am the Blues
Woodshock
Earth: One Amazing Day
Signature Move
Bluebeard
Beautiful Accident
Spettacolo
Hermia &amp; Helena
The Mayor
Quest 
Sidemen: Long Road to Glory
Extraordinary Mission
The Nile Hilton Incident
One Week and a Day
Unrest
Keep Watching
Bill Nye: Science Guy
Person To Person
Growing Up Smith
Afterimage
Pop Aye
Alon

1991
Superfly
Tully 
Chappaquiddick
Fahrenheit 11/9
Show Dogs
Leave No Trace
The Possession of Hannah Grace
Pandas
Can You Ever Forgive Me?
Sorry to Bother You
God Bless the Broken Road
Boy Erased
Mortal Engines
You Were Never Really Here
Midnight Sun
Once Upon a Deadpool
Hearts Beat Loud
Operation Finale
Bad Times At The El Royale
Kin
Gringo
Sgt. Stubby: An American Hero
Beirut
Hell Fest
Unfriended: Dark Web
Welcome to Marwen
Action Point
Traffik
Hunter Killer
2.0
Indivisible
Sanju
The Trump Prophecy
Pokemon the Movie: The Power of Us
Kedarnath
Baaghi Two
Lean on Pete
My Neighbor Totoro - Studio Ghibli Fest Two018
MET Opera: Aida
Never Look Away
Capernaum
A Private War
Pad Man
DCI Two018: Big, Loud &amp; Live 15
Finding Your Feet
Ponyo (10th Anniversary) - Studio Ghibli Fest Two018
The Miseducation of Cameron Post
MET Opera: La Fanciulla Del West
Dragon Ball Z: Broly - The Legendary Super Saiyan (2018 re-release)
The Front Runner
Along with the Gods: The Last 49 Days
Puzzle
The Dark C

Blood Fest
Chef Flynn
Just a Breath Away
Monrovia, Indiana
Detective K: Secret of the Living Dead
Ismael's Ghosts
The House That Jack Built
How to Talk to Girls at Parties
Memoir of War
Where Hands Touch
Take Point
Mary Shelley
The Captain
Goldstone
Running for Grace
Where is Kyra?
Unstoppable 
Bodied
Yuri!!! on ICE Binge
Woman Walks Ahead
Golden Slumber
In Search of Greatness
Happy Phirr Bhag Jayegi 
Parchi
The Fog (2018 re-release)
Custody
NT Live: Antony &amp; Cleopatra
Champion 
The Gardener
After Auschwitz
NT Live: Cat on a Hot Tin Roof
Bandstand: The Broadway Musical 
Andrei Rublev (re-release)
Have a Nice Day
Making the Five Heartbeats
Lu Over the Wall
NT Live: Julius Caesar
Cynthia
The Boxcar Children: Surprise Island
Helicopter Eela
Tehran Taboo
Godard Mon Amour
HYMN - Sarah Brightman In Concert
En el Septimo Dia (On the Seventh Day)
Hale County This Morning, This Evening
Never Heard
Airpocalypse
Outside In
Viper Club
The Last Suit
Westwood: Punk, Icon, Activist
Beuys
Bolshoi 

Shaft  
Tyler Perry's A Madea Family Funeral
Kalank
Avengers: Endgame
Biggest Little Farm
The Kid Who Would be King
How to Train Your Dragon: The Hidden World
BTS World Tour: Love Yourself in Seoul
Serenity 
The Wizard of Oz 80th Anniversary
The Art of Self-Defense
Us
Late Night
Kinky Boots The Musical 
CatVideoFest Two019
Errol Spence vs. Mikey Garcia
Saving Private Ryan (Fathom Event)
Canal Street
A Brother's Love
Whisper of the Heart (2019 re-release)
Fate/Stay Night: Heaven's Feel - II. Lost Butterfly
Luka Chuppi
Between Me and My Mind
The Chaperone 
Dirty Dancing (Fathom Event)
The Muppet Movie: 40th Anniversary
Bethany Hamilton: Unstoppable
Transit
Cruel Intentions (20th Anniversary)
The Hummingbird Project
Diane
NOAH The Musical
The Cold Blue
Meeting Gorbachev
Perfect Strangers
The Public
I Want to Eat Your Pancreas
Climax
TCM: Steel Magnolias Three0th Anniversary
Asterix: The Secret of the Magic Potion
Bolshoi Ballet: Sleeping Beauty 
Khalid: Free Spirit
The White Storm II
JONI

In [37]:
# Display the loaded Box Office Mojo records.
data

[{'title': "Tyler Perry's Boo 2! A Madea Halloween", 'budget': '$25 million'},
 {'title': "Daddy's Home 2", 'budget': '$69 million'},
 {'title': 'Everything, Everything', 'budget': '$10 million'},
 {'title': 'Monster Trucks', 'budget': 'N/A'},
 {'title': 'How to be a Latin Lover', 'budget': 'N/A'},
 {'title': 'The Nut Job 2:Nutty by Nature', 'budget': '$40 million'},
 {'title': 'American Assassin', 'budget': '$33 million'},
 {'title': 'Annabelle:Creation', 'budget': '$15 million'},
 {'title': 'Star Wars:The Last Jedi', 'budget': '$317 million'},
 {'title': 'Snatched', 'budget': '$42 million'},
 {'title': 'Baby Driver', 'budget': '$34 million'},
 {'title': 'Resident Evil:The Final Chapter', 'budget': '$40 million'},
 {'title': 'Get Out', 'budget': '$4.5 million'},
 {'title': 'Ferdinand', 'budget': '$111 million'},
 {'title': 'Kong:Skull Island', 'budget': '$185 million'},
 {'title': 'Pitch Perfect 3', 'budget': '$45 million'},
 {'title': 'Rough Night', 'budget': '$20 million'},
 {'title