# IMDB Scraper

### Importing the libraries

In [14]:
import pandas as pd
import numpy as np
import requests
from requests import get
from bs4 import BeautifulSoup
from time import sleep
from random import randint

### Scraping the IMDB webpages

In [1]:
# Ask for English-language content from the URLs we are requesting.
headers = {"Accept-Language": "en-US,en;q=0.5"}

# Start offsets of the result pages: 1, 251, 501, ... (250 results per page).
pages = np.arange(1, 10001, 250)

# One entry per successfully fetched page; each entry is the list of
# movie <div> containers found on that page.
movie_divs = []

for page in pages:

    url = 'https://www.imdb.com/search/title/?title_type=feature,tv_series&count=250&start=' + str(page) + '&ref_=adv_nxt'

    try:
        # A timeout keeps a stalled connection from hanging the whole run,
        # and raise_for_status() turns HTTP error responses into exceptions
        # instead of silently parsing an error page.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        # Best effort: report the failed page and carry on with the rest.
        print('Skipping results starting at ' + str(page) + ': ' + str(err))
    else:
        # Parse the response body with the built-in HTML parser.
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect every div container with the class "lister-item mode-advanced".
        movie_div = soup.find_all('div', class_='lister-item mode-advanced')

        # Keep this page's containers together as one entry.
        movie_divs.append(movie_div)

    # Throttle the loop: pause for a random 2-10 seconds between requests,
    # whether or not the request succeeded.
    sleep(randint(2, 10))

### Extracting the required information

In [11]:
# Parallel lists: index i of each list describes the same movie container.
titles = []

descriptions = []

img_links = []

for movie_div in movie_divs:

    for container in movie_div:

        # Scraping the movie name; a container without a title link cannot
        # be identified, so it is skipped entirely.
        heading = container.h3
        title_link = heading.a if heading is not None else None
        if title_link is None:
            continue
        name = title_link.text
        titles.append(name)

        # Scraping the movie image link; None is stored when the image tag
        # or its "loadlate" attribute is missing so the lists stay aligned.
        img_tag = container.find('img', class_="loadlate")
        img_link = img_tag.get('loadlate') if img_tag is not None else None
        img_links.append(img_link)

        # Scraping the description: the last "text-muted" paragraph in the
        # container, with leading whitespace removed. None when absent.
        muted_paragraphs = container.find_all("p", class_="text-muted")
        description = muted_paragraphs[-1].text.lstrip() if muted_paragraphs else None
        descriptions.append(description)

### Structuring & exporting the data

In [12]:
# Assemble the scraped lists into one table, one column per list.
movies = pd.DataFrame(
    dict(Movie=titles, ImgLink=img_links, Description=descriptions)
)

# Write the table to an Excel workbook in the working directory.
movies.to_excel('IMDB_Dataset.xlsx')