In [2]:
import pandas as pd
import numpy as np
import csv

from bs4 import BeautifulSoup
import requests
import time, os 
import random
from fake_useragent import UserAgent

ua = UserAgent()


In [2]:
def movie_lister(year):
    
    year= str(year)
    movies_dict = {}  #initializing an empty dictionary to fill with information
    base_url = 'https://www.imdb.com/title/'
    url = 'https://www.imdb.com/search/title/?title_type=feature&release_date='+ year +'-01-01,'+ year +'-12-31&countries=us&languages=en&sort=num_votes,desc&count=250&start=&ref_=adv_nxt'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    
    # total number of movies released in that year
    items_yearly = soup.select('div.desc > span:nth-child(1)')[0].get_text().split('f')[1].strip().split(' ')[0].replace(',', '')
    items_yearly=int(items_yearly)
    
    iterations = range(1,items_yearly, 250)
    
    for i in iterations: #goes through each page (250 movies per page)
        start_at= str(i)
        imdb_query = 'https://www.imdb.com/search/title/?title_type=feature&release_date='+ year +'-01-01,'+ year +'-12-31&countries=us&languages=en&sort=num_votes,desc&count=250&start=' + start_at + '&ref_=adv_nxt'
        user_agent = {'User-agent': ua.random} # generates a random user agent.
        page = requests.get(imdb_query,headers = user_agent).text
        soup = BeautifulSoup(page, "html.parser")
        
        votes_of_first_movie_on_a_page = soup.select("div.lister-item-content > p.sort-num_votes-visible > span:nth-child(2)")[0].get_text().replace(",",'')
        votes_of_first_movie_on_a_page = int(votes_of_first_movie_on_a_page)
        
        if votes_of_first_movie_on_a_page < 200:
            break
        else:        
            for i in range(0, 250): #goes through each movie on the page
                movie_name = soup.select("div.lister-item-content > h3 > a")[i].get_text()
                movie_id = soup.select("div.lister-item-content")[i].select("a")[0]['href'].split("/")[2]
                movie_imdb_link = base_url + movie_id + '/?ref_=adv_li_tt' # link to each movies imdb page

                try:
                    certificate= soup.find_all(class_="lister-item-content")[i].find(class_='certificate').text
                except: 
                    certificate = []
     
                try:
                    runtime = soup.find_all(class_="lister-item-content")[i].find(class_='runtime').text.strip().split(" ")[0]
                except:
                    runtime = []
     
                try:
                    genre = soup.select("div.lister-item-content > p:nth-child(2) > span.genre")[i].get_text().replace('\n', '').strip() #genre
                except:
                    genre = []
        
                try: 
                    imdb_rating = soup.select("div.lister-item-content > div > div.inline-block.ratings-imdb-rating > strong")[i].get_text()
                except:
                    imdb_rating = []
        
                try:
                    votes = soup.select("div.lister-item-content > p.sort-num_votes-visible > span:nth-child(2)")[i].get_text().replace(",",'')
                except:
                    votes = []

                try:
                    metascore = soup.find_all(class_="lister-item-content")[i].find(class_='inline-block ratings-metascore').find('span').contents[0].strip()
                except:
                    metascore = []
            
                try:
                    top_director = soup.select("div.lister-item-content")[i].select('a')[12].get_text()
                except:
                    top_director = []
            
                movies_dict[movie_name] = [movie_id]+[movie_imdb_link]+[certificate]+[runtime]+[genre]+[imdb_rating]+[votes]+[metascore]+[top_director]
                
                with open(f'movies_dict_{year}.csv', 'w', newline='') as f: # save the dict in case of an error
                    writer = csv.DictWriter(f, fieldnames=movies_dict.keys())
                    writer.writeheader()
                    writer.writerow(movies_dict)
            
    
            time.sleep(.5+2*random.random()) # random time between requests.

    # define headers
    headers_movies = ['movie_id', 'movie_imdb_link', 'certificate', 'runtime_in_mins',
                          'genre','imdb_rating','number_of_votes','metascore', 'top_director']
    
    # turn into a dataframe    
    movies_dataframe = pd.DataFrame(movies_dict).T
    movies_dataframe.columns = headers_movies
    
    # add release_year column
    movies_dataframe['release_year']= year
    
    # remove any movie under 200 votes
    movies_dataframe['number_of_votes'] = pd.to_numeric(movies_dataframe['number_of_votes'])
    movies_dataframe = movies_dataframe[movies_dataframe['number_of_votes'] >= 200]
    
    #save the dataframe
    movies_dataframe.to_csv(f'movies_{year}.csv')
 
    return movies_dataframe



In [3]:
movies_2022 = movie_lister(2022)
movies_2021 = movie_lister(2021)
movies_2020 = movie_lister(2020)
movies_2019 = movie_lister(2019)


In [4]:
movies_2022.tail() 

Unnamed: 0,movie_id,movie_imdb_link,certificate,runtime_in_mins,genre,imdb_rating,number_of_votes,metascore,top_director,release_year
Call for Peace,tt19511880,https://www.imdb.com/title/tt19511880/?ref_=ad...,[],[],Reality-TV,9.1,204.0,[],Diane Paloma Eskenazi,2022
Do Not Disturb,tt10696116,https://www.imdb.com/title/tt10696116/?ref_=ad...,[],92,Horror,7.9,204.0,[],John Ainslie,2022
Matilda the Musical,tt3447590,https://www.imdb.com/title/tt3447590/?ref_=adv...,PG,117,"Comedy, Drama, Family",6.1,201.0,67,Matthew Warchus,2022
The Haunting of the Murder House,tt20861742,https://www.imdb.com/title/tt20861742/?ref_=ad...,[],78,Horror,4.5,200.0,[],Brendan Rudnicki,2022
Project Legion,tt12885770,https://www.imdb.com/title/tt12885770/?ref_=ad...,R,90,"Horror, Sci-Fi, Thriller",2.2,200.0,[],Lance Kawas,2022


In [None]:
# I am realizing that possibly many fields of the movies from 2022 will be empty as they are relatively new. 
# Awards, nominations, and oscars will be misleading. Therefore I decide excluding the year 2022 
# from building the model. I will use 2022 for validation.



In [15]:
#Scraping movies from earlier years

movies_2018 = movie_lister(2018)
movies_2017 = movie_lister(2017)


In [16]:
movies = pd.concat([movies_2021,movies_2020,movies_2019,movies_2018, movies_2017],axis = 0) 

movies = movies.reset_index()
movies = movies.rename(columns = {'index':'title'})


In [17]:
movies.head()

Unnamed: 0,title,movie_id,movie_imdb_link,certificate,runtime_in_mins,genre,imdb_rating,number_of_votes,metascore,top_director,release_year
0,Spider-Man: No Way Home,tt10872600,https://www.imdb.com/title/tt10872600/?ref_=ad...,PG-13,148,"Action, Adventure, Fantasy",8.3,739086,71,Jon Watts,2021
1,Dune,tt1160419,https://www.imdb.com/title/tt1160419/?ref_=adv...,PG-13,155,"Action, Adventure, Drama",8.0,621069,74,Denis Villeneuve,2021
2,Don't Look Up,tt11286314,https://www.imdb.com/title/tt11286314/?ref_=ad...,R,138,"Comedy, Drama, Sci-Fi",7.2,529981,49,Adam McKay,2021
3,No Time to Die,tt2382320,https://www.imdb.com/title/tt2382320/?ref_=adv...,PG-13,163,"Action, Adventure, Thriller",7.3,393930,68,Cary Joji Fukunaga,2021
4,Zack Snyder's Justice League,tt12361974,https://www.imdb.com/title/tt12361974/?ref_=ad...,R,242,"Action, Adventure, Fantasy",8.0,393504,54,Zack Snyder,2021


In [18]:
movies.shape

(4452, 11)

In [19]:
movies.to_csv('movies_2021_2017.csv')

In [13]:
# from csv import DictReader
# with open("movies_2022.csv", 'r') as f:
     
#     dict_reader = DictReader(f)
     
#     list_of_dict = list(dict_reader)
   
#     print(list_of_dict)

In [None]:
#Scraping movies from earlier years

In [22]:
movies_2016 = movie_lister(2016)
movies_2015 = movie_lister(2015)

In [27]:
#movies_2016.head()

In [28]:
movies_2016_2015 = pd.concat([movies_2015,movies_2016],axis = 0) 

movies_2016_2015 = movies_2016_2015.reset_index()
movies_2016_2015 = movies_2016_2015.rename(columns = {'index':'title'})
movies_2016_2015.shape


(1888, 11)

In [30]:
movies_2016_2015.tail(3)


Unnamed: 0,title,movie_id,movie_imdb_link,certificate,runtime_in_mins,genre,imdb_rating,number_of_votes,metascore,top_director,release_year
1885,The Haunting at Woodland Hills,tt1523575,https://www.imdb.com/title/tt1523575/?ref_=adv...,[],84,"Drama, Horror, Thriller",3.4,201,[],Jeremy Casper,2016
1886,Volumes of Blood: Horror Stories,tt6211920,https://www.imdb.com/title/tt6211920/?ref_=adv...,[],118,Horror,3.9,200,[],See full summary,2016
1887,Game of Aces,tt3687118,https://www.imdb.com/title/tt3687118/?ref_=adv...,R,97,"Action, Adventure, War",4.2,200,[],Damien Lay,2016


In [31]:
movies_2016_2015.to_csv('movies_2016_2015.csv')