In [1]:
import csv
import random
import re
import time, os
import warnings

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

# Lift pandas' display truncation so previews show every row and column.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

# Shared generator; the scraper reads `ua.random` to get a User-Agent string per request.
ua = UserAgent()


In [2]:
# Load the movie listing to enrich. The scraper defined below reads its
# 'title', 'movie_id' and 'movie_imdb_link' columns.
movies = pd.read_csv('./movies_2021_2017.csv')

In [3]:
# (rows, columns) of the freshly loaded listing.
movies.shape

(4452, 12)

In [4]:
# Peek at the last rows; the 'Unnamed: 0' column simply repeats the row index.
movies.tail()

Unnamed: 0.1,Unnamed: 0,title,movie_id,movie_imdb_link,certificate,runtime_in_mins,genre,imdb_rating,number_of_votes,metascore,top_director,release_year
4447,4447,WHAT IF? A (Fan-Made) 'Life is Strange' Story,tt7732486,https://www.imdb.com/title/tt7732486/?ref_=adv...,[],127,"Drama, Mystery",5.1,204,[],See full summary,2017
4448,4448,Created Equal,tt5873100,https://www.imdb.com/title/tt5873100/?ref_=adv...,TV-PG,91,"Drama, Thriller",5.5,204,[],Bill Duke,2017
4449,4449,Kate Can't Swim,tt5752904,https://www.imdb.com/title/tt5752904/?ref_=adv...,[],90,Drama,5.5,201,[],Josh Helman,2017
4450,4450,Blue Hollywood,tt5588118,https://www.imdb.com/title/tt5588118/?ref_=adv...,[],82,"Comedy, Drama, Romance",7.3,200,[],See full summary,2017
4451,4451,A Closer Walk with Thee,tt4943620,https://www.imdb.com/title/tt4943620/?ref_=adv...,Unrated,89,"Drama, Horror",3.8,200,[],John C. Clark,2017


In [5]:
# Remove the redundant 'Unnamed: 0' column (it repeats the row index).
# errors='ignore' keeps the cell re-runnable: a second execution no longer
# raises KeyError because the column is already gone.
movies = movies.drop(columns=['Unnamed: 0'], errors='ignore')
movies.head()

Unnamed: 0,title,movie_id,movie_imdb_link,certificate,runtime_in_mins,genre,imdb_rating,number_of_votes,metascore,top_director,release_year
0,Spider-Man: No Way Home,tt10872600,https://www.imdb.com/title/tt10872600/?ref_=ad...,PG-13,148,"Action, Adventure, Fantasy",8.3,739086,71,Jon Watts,2021
1,Dune,tt1160419,https://www.imdb.com/title/tt1160419/?ref_=adv...,PG-13,155,"Action, Adventure, Drama",8.0,621069,74,Denis Villeneuve,2021
2,Don't Look Up,tt11286314,https://www.imdb.com/title/tt11286314/?ref_=ad...,R,138,"Comedy, Drama, Sci-Fi",7.2,529981,49,Adam McKay,2021
3,No Time to Die,tt2382320,https://www.imdb.com/title/tt2382320/?ref_=adv...,PG-13,163,"Action, Adventure, Thriller",7.3,393930,68,Cary Joji Fukunaga,2021
4,Zack Snyder's Justice League,tt12361974,https://www.imdb.com/title/tt12361974/?ref_=ad...,R,242,"Action, Adventure, Fantasy",8.0,393504,54,Zack Snyder,2021


In [2]:
# Column order of the frame returned by scrape_movie_details.
_DETAIL_COLUMNS = [
    'title', 'movie_id', 'movie_imdb_link', 'top_writer', 'top_star_1', 'top_star_2', 'top_star_3',
    'num_user_reviews', 'num_critic_reviews', 'release_date', 'release_month', 'country_of_origin',
    'top_production_company', 'budget_in_usd', 'opening_weekend_us_can_in_usd', 'gross_us_can_in_usd',
    'gross_worldwide_in_usd',
]

# CSS prefixes of the two lists the scraper reads on a title page.
# NOTE(review): these hashed-looking class names are presumably generated by the
# site and liable to change; if every field suddenly comes back missing, check here first.
_CREDITS_LIST = 'div.sc-fa02f843-2.dwQKsL > div > div > ul'
_DETAILS_LIST = 'div.sc-f65f65be-0.ktSkVi > ul'


def _select_text(soup, selector):
    """Return the text of the first element matching `selector`, or None when nothing matches."""
    matches = soup.select(selector)
    return matches[0].get_text() if matches else None


def _strip_money(text):
    """Remove '$' and ',' from a scraped amount; None (field not found) passes through."""
    if text is None:
        return None
    return text.replace('$', "").replace(',', '')


def _or_missing(value):
    """Map None to the missing-value marker used by this notebook (a fresh empty list)."""
    return [] if value is None else value


def _fetch_html(url, timeout):
    """Download `url` with a random User-Agent; warn and return '' on any request failure."""
    user_agent = {'User-agent': ua.random}  # generates a random user agent.
    try:
        response = requests.get(url, headers=user_agent, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        # One bad page must not throw away hours of already scraped rows.
        warnings.warn(f"Could not fetch {url!r}: {exc}")
        return ""
    return response.text


def _parse_movie_page(soup):
    """Extract the scraped fields from a parsed title page, in _DETAIL_COLUMNS order (after the 3 id columns)."""
    # Top writer and the first three listed stars.
    top_writer = _select_text(soup, f'{_CREDITS_LIST} > li:nth-child(2) > div > ul > li:nth-child(1) > a')
    top_stars = [
        _select_text(soup, f'{_CREDITS_LIST} > li:nth-child(3) > div > ul > li:nth-child({position}) > a')
        for position in (1, 2, 3)
    ]

    # Review counts are kept as displayed text (e.g. '6.1K'); 0 when not found.
    try:
        num_user_reviews = soup.find_all(class_="label")[0].findPrevious().text
    except (IndexError, AttributeError):
        num_user_reviews = 0

    try:
        num_critic_reviews = soup.find(text=re.compile('Critic reviews')).findPrevious().findPrevious().text
    except AttributeError:
        num_critic_reviews = 0

    # Release date, with the trailing "(country)" part removed; the month is its first word.
    release_text = _select_text(soup, f'{_DETAILS_LIST} > li:nth-child(1) > div > ul > li > a')
    if release_text is None:
        release_date, release_month = [], []
    else:
        release_date = release_text.split('(')[0].strip()
        release_month = release_date.split(' ')[0].strip()

    country_of_origin = _select_text(soup, f'{_DETAILS_LIST} > li:nth-child(2) > div > ul > li > a')
    top_production_company = _select_text(
        soup, f'{_DETAILS_LIST} > li:nth-child(7) > div > ul > li:nth-child(1) > a')

    # Budget: production cost only; the "(estimated)" suffix is cut off before cleaning.
    budget_text = _select_text(soup, f'{_DETAILS_LIST} > li:nth-child(1) > div > ul > li > span')
    budget_in_usd = None if budget_text is None else _strip_money(budget_text.split('(')[0].strip())

    # Opening weekend (US & Canada), gross US & Canada, gross worldwide.
    opening_weekend_us_can_in_usd = _strip_money(
        _select_text(soup, f'{_DETAILS_LIST} > li:nth-child(3) > div > ul > li:nth-child(1) > span'))
    gross_us_can_in_usd = _strip_money(
        _select_text(soup, f'{_DETAILS_LIST} > li:nth-child(2) > div > ul > li > span'))
    gross_worldwide_in_usd = _strip_money(
        _select_text(soup, f'{_DETAILS_LIST} > li:nth-child(4) > div > ul > li > span'))

    return [
        _or_missing(top_writer),
        *(_or_missing(star) for star in top_stars),
        num_user_reviews,
        num_critic_reviews,
        release_date,
        release_month,
        _or_missing(country_of_origin),
        _or_missing(top_production_company),
        _or_missing(budget_in_usd),
        _or_missing(opening_weekend_us_can_in_usd),
        _or_missing(gross_us_can_in_usd),
        _or_missing(gross_worldwide_in_usd),
    ]


def scrape_movie_details(df: pd.DataFrame, timeout: float = 30, output_csv=None) -> pd.DataFrame:
    """Scrape per-title details for every row of `df` from its IMDb page.

    Each page is requested once with a random User-Agent, followed by a random
    pause of 0.5-2.5 seconds, so the run time grows linearly with ``len(df)``.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns 'title', 'movie_id' and 'movie_imdb_link'.
        Any index is accepted and is carried over to the result unchanged.
    timeout : float, default 30
        Seconds to wait for each HTTP request before giving up on that page.
    output_csv : path-like, optional
        When given, the result is also written there with ``DataFrame.to_csv``.

    Returns
    -------
    DataFrame
        One row per input row, object dtype, with the 17 columns listed in
        ``_DETAIL_COLUMNS``. A field that could not be located is ``[]``
        (``0`` for the two review counts). An empty `df` yields an empty
        frame that still has all 17 columns.

    Notes
    -----
    A failed request (timeout, connection error, non-2xx status) emits a
    warning and produces a row whose scraped fields are all missing markers;
    it does not abort the run.

    NOTE(review): the box-office fields are located purely by list position,
    and previously scraped output contains aspect ratios such as '2.39 : 1' in
    'gross_worldwide_in_usd' for titles without that figure. Presumably the
    selector also matches another list on the page - validate these columns
    before using them numerically.
    """
    records = []

    # Iterate over the values directly: no label/position mix-up whatever the index is.
    for title, movie_id, movie_imdb_link in zip(df['title'], df['movie_id'], df['movie_imdb_link']):
        soup = BeautifulSoup(_fetch_html(movie_imdb_link, timeout), "html.parser")
        records.append([title, movie_id, movie_imdb_link] + _parse_movie_page(soup))

        time.sleep(.5 + 2 * random.random())  # random time between requests.

    # dtype=object keeps the mixed str / [] / 0 cells exactly as collected.
    movie_details_dataframe = pd.DataFrame(records, columns=_DETAIL_COLUMNS, index=df.index, dtype=object)

    if output_csv is not None:
        movie_details_dataframe.to_csv(output_csv)

    return movie_details_dataframe


In [7]:
# Scrape the detail page of every listed title: one HTTP request per row plus a
# random pause, so expect this cell to run for a long time on the full listing.
movie_details = scrape_movie_details(movies)


In [8]:
# One row per scraped title and 17 detail columns are expected.
movie_details.shape

(4452, 17)

In [10]:
# Preview the scraped details.
# NOTE(review): in the output below rows 2 and 4 carry an aspect ratio
# ('2.39 : 1', '1.33 : 1') in gross_worldwide_in_usd, and '[]' marks fields the
# scraper could not find - clean these columns before numeric use.
movie_details.head(10)

Unnamed: 0,title,movie_id,movie_imdb_link,top_writer,top_star_1,top_star_2,top_star_3,num_user_reviews,num_critic_reviews,release_date,release_month,country_of_origin,top_production_company,budget_in_usd,opening_weekend_us_can_in_usd,gross_us_can_in_usd,gross_worldwide_in_usd
0,Spider-Man: No Way Home,tt10872600,https://www.imdb.com/title/tt10872600/?ref_=ad...,Chris McKenna,Tom Holland,Zendaya,Benedict Cumberbatch,6.1K,412,"December 17, 2021",December,United States,Columbia Pictures,200000000,260138569,814115070,1916278650
1,Dune,tt1160419,https://www.imdb.com/title/tt1160419/?ref_=adv...,Jon Spaihts,Timothée Chalamet,Rebecca Ferguson,Zendaya,5.6K,509,"October 22, 2021",October,Canada,Warner Bros.,165000000,41011174,108327830,401847900
2,Don't Look Up,tt11286314,https://www.imdb.com/title/tt11286314/?ref_=ad...,Adam McKay,Leonardo DiCaprio,Jennifer Lawrence,Meryl Streep,4.6K,309,"December 24, 2021",December,United States,Hyperobject Industries,75000000,[],791863,2.39 : 1
3,No Time to Die,tt2382320,https://www.imdb.com/title/tt2382320/?ref_=adv...,Neal Purvis,Daniel Craig,Ana de Armas,Rami Malek,4.1K,449,"October 8, 2021",October,United Kingdom,Metro-Goldwyn-Mayer (MGM),250000000,55225007,160891007,774153007
4,Zack Snyder's Justice League,tt12361974,https://www.imdb.com/title/tt12361974/?ref_=ad...,Jerry Siegel,Henry Cavill,Ben Affleck,Gal Gadot,8K,308,"March 18, 2021",March,United States,Atlas Entertainment,300000000,[],[],1.33 : 1
5,Shang-Chi and the Legend of the Ten Rings,tt9376612,https://www.imdb.com/title/tt9376612/?ref_=adv...,Dave Callaham,Simu Liu,Awkwafina,Tony Chiu-Wai Leung,2.4K,338,"September 3, 2021",September,United States,Walt Disney Pictures,200000000,75388688,224543292,432243292
6,Black Widow,tt3480822,https://www.imdb.com/title/tt3480822/?ref_=adv...,Eric Pearson,Scarlett Johansson,Florence Pugh,David Harbour,3.1K,390,"July 9, 2021",July,United States,Marvel Studios,200000000,80366312,183651655,379751655
7,Free Guy,tt6264654,https://www.imdb.com/title/tt6264654/?ref_=adv...,Matt Lieberman,Ryan Reynolds,Jodie Comer,Taika Waititi,2.1K,287,"August 13, 2021",August,United States,20th Century Studios,120000000,28365416,121626598,331526598
8,The Suicide Squad,tt6334354,https://www.imdb.com/title/tt6334354/?ref_=adv...,James Gunn,Margot Robbie,Idris Elba,John Cena,3.4K,378,"August 5, 2021",August,United States,Warner Bros.,185000000,26205415,55817425,168657565
9,Eternals,tt9032400,https://www.imdb.com/title/tt9032400/?ref_=adv...,Chloé Zhao,Gemma Chan,Richard Madden,Angelina Jolie,3.6K,348,"November 5, 2021",November,United States,Marvel Studios,200000000,71297219,164870234,402064899


# Repeat the load, clean-up and scrape steps for the 2015–2016 listing

In [13]:
# Load the second listing (presumably the 2015-2016 releases, going by the file name).
movies_2016_2015 = pd.read_csv('./movies_2016_2015.csv')

In [14]:
# Remove the 'Unnamed: 0' column (presumably a saved row index, as in the first
# listing). errors='ignore' keeps the cell re-runnable: a second execution no
# longer raises KeyError because the column is already gone.
movies_2016_2015 = movies_2016_2015.drop(columns=['Unnamed: 0'], errors='ignore')
movies_2016_2015.head()

Unnamed: 0,title,movie_id,movie_imdb_link,certificate,runtime_in_mins,genre,imdb_rating,number_of_votes,metascore,top_director,release_year
0,Star Wars: Episode VII - The Force Awakens,tt2488496,https://www.imdb.com/title/tt2488496/?ref_=adv...,PG-13,138,"Action, Adventure, Sci-Fi",7.8,926911,80,J.J. Abrams,2015
1,Avengers: Age of Ultron,tt2395427,https://www.imdb.com/title/tt2395427/?ref_=adv...,PG-13,141,"Action, Adventure, Sci-Fi",7.3,858022,66,Joss Whedon,2015
2,The Martian,tt3659388,https://www.imdb.com/title/tt3659388/?ref_=adv...,PG-13,144,"Adventure, Drama, Sci-Fi",8.0,848340,80,Ridley Scott,2015
3,The Revenant,tt1663202,https://www.imdb.com/title/tt1663202/?ref_=adv...,R,156,"Action, Adventure, Drama",8.0,798584,76,Alejandro G. Iñárritu,2015
4,Inside Out,tt2096673,https://www.imdb.com/title/tt2096673/?ref_=adv...,PG,95,"Animation, Adventure, Comedy",8.2,708522,94,Pete Docter,2015


In [15]:
# Scrape the detail page of every title in this listing (one HTTP request per
# row with a random pause in between, so this cell is slow).
movie_details_2016_2015 = scrape_movie_details(movies_2016_2015)

In [16]:
# Spot-check the first scraped rows.
movie_details_2016_2015.head(3)

Unnamed: 0,title,movie_id,movie_imdb_link,top_writer,top_star_1,top_star_2,top_star_3,num_user_reviews,num_critic_reviews,release_date,release_month,country_of_origin,top_production_company,budget_in_usd,opening_weekend_us_can_in_usd,gross_us_can_in_usd,gross_worldwide_in_usd
0,Star Wars: Episode VII - The Force Awakens,tt2488496,https://www.imdb.com/title/tt2488496/?ref_=adv...,Lawrence Kasdan,Daisy Ridley,John Boyega,Oscar Isaac,5K,913,"December 18, 2015",December,United States,Lucasfilm,245000000,247966675,936662225,2069521700
1,Avengers: Age of Ultron,tt2395427,https://www.imdb.com/title/tt2395427/?ref_=adv...,Joss Whedon,Robert Downey Jr.,Chris Evans,Mark Ruffalo,1.4K,704,"May 1, 2015",May,United States,Marvel Studios,250000000,191271109,459005868,1402809540
2,The Martian,tt3659388,https://www.imdb.com/title/tt3659388/?ref_=adv...,Drew Goddard,Matt Damon,Jessica Chastain,Kristen Wiig,1.4K,642,"October 2, 2015",October,United States,Twentieth Century Fox,108000000,54308575,228433663,630620818


# Validation data (year 2022)

In [9]:
# Load the 2022 listing, which this notebook uses as validation data.
movies_2022 = pd.read_csv('./movies_2022.csv')

In [10]:
# Preview the raw frame; in this file the titles arrive in a column named
# 'Unnamed: 0' rather than 'title'.
movies_2022.head()

Unnamed: 0.1,Unnamed: 0,movie_id,movie_imdb_link,certificate,runtime_in_mins,genre,imdb_rating,number_of_votes,metascore,top_director,release_year
0,The Batman,tt1877830,https://www.imdb.com/title/tt1877830/?ref_=adv...,PG-13,176,"Action, Crime, Drama",7.9,613958.0,72,Matt Reeves,2022
1,Top Gun: Maverick,tt1745960,https://www.imdb.com/title/tt1745960/?ref_=adv...,PG-13,130,"Action, Drama",8.4,413022.0,78,Joseph Kosinski,2022
2,Doctor Strange in the Multiverse of Madness,tt9419884,https://www.imdb.com/title/tt9419884/?ref_=adv...,PG-13,126,"Action, Adventure, Fantasy",6.9,398982.0,60,Sam Raimi,2022
3,Thor: Love and Thunder,tt10648342,https://www.imdb.com/title/tt10648342/?ref_=ad...,PG-13,118,"Action, Adventure, Comedy",6.4,296865.0,57,Taika Waititi,2022
4,Everything Everywhere All at Once,tt6710474,https://www.imdb.com/title/tt6710474/?ref_=adv...,R,139,"Action, Adventure, Comedy",8.1,228555.0,81,Dan Kwan,2022


In [11]:
# Renumber the rows from 0 and give the title column (read in as
# 'Unnamed: 0') the name the scraper looks up.
movies_2022 = (
    movies_2022
    .reset_index(drop=True)
    .rename(columns={'Unnamed: 0': 'title'})
)
movies_2022.head()

Unnamed: 0,title,movie_id,movie_imdb_link,certificate,runtime_in_mins,genre,imdb_rating,number_of_votes,metascore,top_director,release_year
0,The Batman,tt1877830,https://www.imdb.com/title/tt1877830/?ref_=adv...,PG-13,176,"Action, Crime, Drama",7.9,613958.0,72,Matt Reeves,2022
1,Top Gun: Maverick,tt1745960,https://www.imdb.com/title/tt1745960/?ref_=adv...,PG-13,130,"Action, Drama",8.4,413022.0,78,Joseph Kosinski,2022
2,Doctor Strange in the Multiverse of Madness,tt9419884,https://www.imdb.com/title/tt9419884/?ref_=adv...,PG-13,126,"Action, Adventure, Fantasy",6.9,398982.0,60,Sam Raimi,2022
3,Thor: Love and Thunder,tt10648342,https://www.imdb.com/title/tt10648342/?ref_=ad...,PG-13,118,"Action, Adventure, Comedy",6.4,296865.0,57,Taika Waititi,2022
4,Everything Everywhere All at Once,tt6710474,https://www.imdb.com/title/tt6710474/?ref_=adv...,R,139,"Action, Adventure, Comedy",8.1,228555.0,81,Dan Kwan,2022


In [12]:
# Scrape the detail page of every 2022 title (one HTTP request per row with a
# random pause in between).
movie_details_2022 = scrape_movie_details(movies_2022)

In [13]:
# Spot-check the first scraped rows of the validation set.
movie_details_2022.head(3)

Unnamed: 0,title,movie_id,movie_imdb_link,top_writer,top_star_1,top_star_2,top_star_3,num_user_reviews,num_critic_reviews,release_date,release_month,country_of_origin,top_production_company,budget_in_usd,opening_weekend_us_can_in_usd,gross_us_can_in_usd,gross_worldwide_in_usd
0,The Batman,tt1877830,https://www.imdb.com/title/tt1877830/?ref_=adv...,Matt Reeves,Robert Pattinson,Zoë Kravitz,Jeffrey Wright,7.7K,520,"March 4, 2022",March,United States,Warner Bros.,200000000,134008624,369345583,770836163
1,Top Gun: Maverick,tt1745960,https://www.imdb.com/title/tt1745960/?ref_=adv...,Jim Cash,Tom Cruise,Jennifer Connelly,Miles Teller,3.7K,386,"May 27, 2022",May,United States,Paramount Pictures,170000000,126707459,716657763,1485757763
2,Doctor Strange in the Multiverse of Madness,tt9419884,https://www.imdb.com/title/tt9419884/?ref_=adv...,Michael Waldron,Benedict Cumberbatch,Elizabeth Olsen,Chiwetel Ejiofor,3.9K,399,"May 6, 2022",May,United States,Marvel Studios,200000000,187420998,411331607,955775804


In [14]:
# One row per 2022 title and 17 detail columns are expected.
movie_details_2022.shape

(477, 17)