# Web Scraping Module With Beautiful Soup

## Import the Packages 

In [2]:
import re ## To pull numeric values out of scraped text

import pandas as pd ## To convert into data frame
import requests ## To generate the request to download HTML data 
from bs4  import BeautifulSoup as bsoup ## Beautiful Soup to parse the HTML files 

___________________________________________________________________________________________________________________________

#### Saving the URL and Downloading the Page

In [3]:
url='https://www.imdb.com/search/title/?groups=top_1000&ref_=adv_prv'

# Download the listing page. The timeout keeps the cell from hanging indefinitely
# on a stalled connection, and raise_for_status() turns an HTTP error response
# (4xx/5xx) into an exception here instead of letting an error page be parsed.
response=requests.get(url,timeout=30)
response.raise_for_status()
container=response.text

# Parse the downloaded HTML with the lxml parser so elements can be looked up by tag/class
container_bsoup=bsoup(container,'lxml')

In [4]:
# Print only the beginning of the prettified page: enough to confirm what was
# downloaded without writing the entire HTML document into the notebook output.
PREVIEW_CHARS=2000
print(container_bsoup.prettify()[:PREVIEW_CHARS])

<!DOCTYPE html>
<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="app-id=342792525, app-argument=imdb:///?src=mdot" name="apple-itunes-app"/>
  <script type="text/javascript">
   var IMDbTimer={starttime: new Date().getTime(),pt:'java'};
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <title>
   IMDb "Top 1000"
(Sorted by Popularity Ascending) - IMDb
  </title>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   if (typeof uex == 'function') {
      uex("ld", "Loa

___________________________________________________________________________________________________________________________

#### Extracting Various Features Related to the Movies

In [5]:
title = [] # The title of the movie 
years = [] # Year of release
time = [] # duration in minutes
imdb_ratings = [] # ratings
metascores = [] # metascores
votes = [] # Number of votes (text exactly as shown on the page)
us_gross = [] # Gross collection (text exactly as shown on the page, '-' when absent)


def _first_int(tag):
    """Return the first run of digits found in a tag's text as an int.

    Parameters
    ----------
    tag : bs4 element or None
        Result of a ``find`` lookup; ``None`` when the element was not found.

    Returns
    -------
    int or None
        The parsed number, or ``None`` when the tag is missing or its text
        contains no digits.
    """
    if tag is None:
        return None
    match = re.search(r'\d+', tag.text)
    return int(match.group()) if match else None


rev=container_bsoup.find_all('div',{'class':'lister-item mode-advanced'}) # One element per movie entry

# Looping to extract features.
# Each iteration appends exactly one value to every list (None or '-' when a
# field is absent) so all columns keep the same length.
for r in rev:
    #Title
    title.append(r.h3.a.text)

    # Year: take the digits from the label instead of stripping specific
    # characters, so extra markers around the year do not break int().
    years.append(_first_int(r.h3.find('span',{'class':'lister-item-year text-muted unbold'})))

    #Duration: leading number of a label such as "130 min"
    time.append(_first_int(r.p.find('span',{'class':'runtime'})))

    #IMDB Rating
    rating_tag=r.strong
    imdb_ratings.append(float(rating_tag.text) if rating_tag is not None else None)

    #Metascore: find() returns None when the entry has no metascore box, so
    # guard before reaching for its <span>.
    metascore_box=r.find('div',{'class':'inline-block ratings-metascore'})
    metascores.append(_first_int(metascore_box.span) if metascore_box is not None else None)

    #Votes and gross share the same tag name and attribute, so collect them together
    nv = r.find_all('span', attrs={'name': 'nv'})

    #Vote
    votes.append(nv[0].text if nv else None)

    #Gross Collection
    ## The second 'nv' span is missing when no gross value is listed
    us_gross.append(nv[1].text if len(nv) > 1 else '-')

____________________________________________________________________________________________________________________________

#### Creating DataFrame

In [11]:
# Assemble the scraped lists into a single table: one column per feature,
# one row per movie entry.
column_values = {
    'movie': title,
    'year': years,
    'timeMin': time,
    'imdb': imdb_ratings,
    'metascore': metascores,
    'votes': votes,
    'us_grossMillions': us_gross,
}
imdb_movies = pd.DataFrame(column_values)

___________________________________________________________________________________________________________________________

#### Data Inspection

In [13]:
# Preview the first five rows of the scraped table
imdb_movies.head(n=5)

Unnamed: 0,movie,year,timeMin,imdb,metascore,votes,us_grossMillions
0,Knives Out,2019,130,7.9,82,340510,$165.36M
1,Once Upon a Time... in Hollywood,2019,161,7.7,83,487522,$142.50M
2,The Gentlemen,2019,113,7.9,51,156925,-
3,Gisaengchung,2019,132,8.6,96,441501,$53.37M
4,Ford v. Ferrari,2019,152,8.1,81,231446,$117.62M


___________________________________________________________________________________________________________________________

# THANK YOU