# Web Scraping Module With Beautiful Soup

## Import the Packages 

In [22]:
import pandas as pd ## To convert into data frame
import numpy as np
from time import sleep
import requests ## To generate the request to download HTML data 
from bs4  import BeautifulSoup as bsoup ## Beautiful Soup to parse the HTML files 

___________________________________________________________________________________________________________________________

#### Requesting the Pages and Extracting the Data

In [27]:
# Accumulators for the scraped features. Every movie appends exactly one entry
# to each list, so the lists stay index-aligned and can be combined column-wise.
title = []  # The title of the movie
years = []  # Year of release
time = []  # Duration in minutes (NaN when the listing shows no runtime)
imdb_ratings = []  # IMDb ratings
metascores = []  # Metascores (NaN when the listing shows no metascore)
votes = []  # Number of votes, kept as the text shown on the page
us_gross = []  # Gross collection, kept as text ('-' when missing)

# Values for the "start" query parameter of each result page: 1 and 51.
page = np.arange(1, 52, 50)

for page_number, i in enumerate(page, start=1):
    # The '&' before "ref_" matters: without it the query string contains
    # "start=1ref_=adv_prv", i.e. the start value is no longer a plain number.
    response = requests.get(
        "https://www.imdb.com/search/title/?groups=top_1000&start=" + str(i) + "&ref_=adv_prv",
        timeout=10,
    )
    # Fail loudly on an HTTP error instead of parsing an error page into zero rows.
    response.raise_for_status()
    container = response.text
    container_bsoup = bsoup(container, 'lxml')

    # One <div class="lister-item mode-advanced"> per movie on the page.
    rev = container_bsoup.find_all('div', {'class': 'lister-item mode-advanced'})

    # Looping to extract features
    for r in rev:
        # Title
        name = r.h3.a.text
        title.append(name)

        # Year: strip the parentheses and any "I" disambiguation marker, e.g. "(I) (2019)".
        year_text = r.h3.find('span', {'class': 'lister-item-year text-muted unbold'}).text
        year = int(year_text.replace("(", "").replace(")", "").replace("I", "").strip())
        years.append(year)

        # Duration: find() returns None when the tag is absent, so guard before
        # reading .text and record NaN to keep the lists aligned.
        runtime_tag = r.p.find('span', {'class': 'runtime'})
        leng = int(runtime_tag.text.replace("min", "").strip()) if runtime_tag is not None else np.nan
        time.append(leng)

        # IMDB Rating
        rating = float(r.strong.text)
        imdb_ratings.append(rating)

        # Metascore: same None guard as for the runtime; a listing without a
        # metascore block gets NaN instead of aborting the whole scrape.
        metascore_tag = r.find('div', {'class': 'inline-block ratings-metascore'})
        metascore = int(metascore_tag.span.text.rstrip()) if metascore_tag is not None else np.nan
        metascores.append(metascore)

        # Votes and gross share the same tag and name attribute, so collect
        # all of them and tell them apart by position.
        nv = r.find_all('span', attrs={'name': 'nv'})

        # Vote
        vote = nv[0].text
        votes.append(vote)

        # Gross Collection ('-' marks a missing gross value)
        grosses = nv[1].text if len(nv) > 1 else '-'
        us_gross.append(grosses)

    # Pause between requests only; there is nothing left to wait for after
    # the final page.
    if page_number < len(page):
        sleep(50)

____________________________________________________________________________________________________________________________

#### Creating DataFrame

In [28]:
# Assemble the scraped lists into a single table, one column per feature.
# Keyword order fixes the column order of the resulting DataFrame.
imdb_movies = pd.DataFrame(
    dict(
        movie=title,
        year=years,
        timeMin=time,
        imdb=imdb_ratings,
        metascore=metascores,
        votes=votes,
        us_grossMillions=us_gross,
    )
)

___________________________________________________________________________________________________________________________

#### Data Inspection

In [29]:
# Show the assembled DataFrame; as the last expression in the cell it is
# rendered as the cell's output.
imdb_movies

Unnamed: 0,movie,year,timeMin,imdb,metascore,votes,us_grossMillions
0,Knives Out,2019,130,7.9,82,340943,$165.36M
1,Once Upon a Time... in Hollywood,2019,161,7.7,83,487755,$142.50M
2,The Gentlemen,2019,113,7.9,51,157213,-
3,Gisaengchung,2019,132,8.6,96,441832,$53.37M
4,Ford v. Ferrari,2019,152,8.1,81,231742,$117.62M
...,...,...,...,...,...,...,...
95,Spider-Man: Into the Spider-Verse,2018,117,8.4,87,326337,$190.24M
96,Thor: Ragnarok,2017,130,7.9,74,557491,$315.06M
97,Guardians of the Galaxy,2014,121,8.0,76,1017334,$333.18M
98,The Dark Knight Rises,2012,164,8.4,78,1470202,$448.14M


### We have a list of 100 movies from 2 pages, as each page lists 50 movies.
- Only two pages were extracted because of time constraints.
- To extract more pages, the delay passed to `sleep` can be randomized between requests, e.g. `sleep(np.random.randint(20, 40))`.

___________________________________________________________________________________________________________________________

# THANK YOU