# IMDb Top 250 Movies Scraping

## Import required libraries

In [13]:
import requests
from bs4 import BeautifulSoup  
import re
import csv

## Open the CSV file and write the header row

In [4]:
# newline='' is required by the csv module: the writer emits its own '\r\n'
# row terminator, and without it text-mode newline translation turns that
# into '\r\r\n' (blank rows) on Windows.
# The handle is intentionally left open here; the rows are written by the
# scraping loop further down and the file is closed in the last cell.
csv_file = open('imdbtop250.csv', 'w', encoding='UTF8', newline='')
csv_writer = csv.writer(csv_file)
# Header row; the column order must match the row written for each movie.
csv_writer.writerow(['rank', 'title', 'year', 'director', 'duration(minutes)', 'genre', 'rating', 'gross', 'oscars', 'awards', 'nominations'])

89

## Fetch the Top 250 chart page

In [5]:
url = 'https://www.imdb.com/chart/top'
# requests has no default timeout; without one an unresponsive server would
# block this cell forever.
response = requests.get(url, timeout=30)
# Raise requests.HTTPError on a 4xx/5xx reply instead of parsing an error page.
response.raise_for_status()

In [6]:
# Parse the downloaded chart page with Python's built-in HTML parser.
soup = BeautifulSoup(markup=response.text, features="html.parser")

## Function to get info from individual movie links

In [7]:
def _parse_runtime_minutes(runtime):
    """Convert a runtime label such as '2h 22m', '2h' or '45m' to minutes."""
    hours = re.search(r'(\d+)\s*h', runtime)
    mins = re.search(r'(\d+)\s*m', runtime)
    if hours or mins:
        total = 0
        if hours:
            total += int(hours.group(1)) * 60
        if mins:
            total += int(mins.group(1))
        return total

    # No unit letters in the label: keep the previous positional reading
    # (two numbers -> hours and minutes; a lone single digit -> hours).
    parts = re.findall(r'\d+', runtime)
    if len(parts) > 1:
        return int(parts[0]) * 60 + int(parts[1])
    if len(parts[0]) < 2:
        return int(parts[0]) * 60
    return int(parts[0])


def more_info(url, timeout=30):
    """Scrape the detail page of a single movie.

    The CSS class names used below are taken verbatim from the page markup
    this scraper was written against; if a lookup no longer matches, the
    function fails with AttributeError/IndexError rather than returning
    partial data (only the gross figure is optional).

    Args:
        url: URL of the movie's detail page.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A tuple ``(genres, director, minutes, gross, oscars, awards,
        nominations)`` where

        * ``genres`` is a list of genre chip texts,
        * ``director`` is the text of the first inline-list item in the
          credits block,
        * ``minutes`` is the runtime as an int,
        * ``gross`` is an int, or None when no numeric value is found,
        * ``oscars`` is the number of Oscars won as an int (0 if none),
        * ``awards`` and ``nominations`` are the last two numbers of the
          awards summary as strings, or None when the summary contains
          fewer than two numbers.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    detail = requests.get(url, timeout=timeout)
    detail.raise_for_status()  # same policy as the chart-page request
    soup = BeautifulSoup(detail.text, "html.parser")

    # Two page layouts are handled: in the first one a single container
    # holds both the genre chips and the credits; in the other the credits
    # live in a different container and the chips in the chip scroller.
    style = soup.find('div', class_='sc-910a7330-4 kcpPzf')
    if style is not None:
        gd = style
        genre_box = style
    else:
        gd = soup.find('div', class_='sc-fa02f843-0 fjLeDR')
        genre_box = soup.find('div', class_='ipc-chip-list__scroller')

    # Genre
    genres = [chip.text for chip in genre_box.find_all('span', class_='ipc-chip__text')]

    # Gross: last inline item of the first matching metadata list, with the
    # currency sign and thousands separators removed.
    try:
        gross_list = soup.find_all('ul', class_='ipc-metadata-list ipc-metadata-list--dividers-none ipc-metadata-list--compact sc-6d4f3f8c-0 ejRbxb ipc-metadata-list--base')[0]
        gross_ = gross_list.find_all('li', class_='ipc-inline-list__item')[-1].text
        gross = int(gross_.replace('$', '').replace(',', ''))
    except (IndexError, ValueError):
        # Section missing, or its last item is not a plain amount.
        gross = None

    # Director
    director = gd.find('li', class_='ipc-inline-list__item').text

    # Runtime in minutes (third item of the title metadata line)
    runtime = soup.find('div', class_='sc-94726ce4-3 eSKKHi').ul.find_all('li', class_='ipc-inline-list__item')[2].text
    minutes = _parse_runtime_minutes(runtime)

    # Awards summary
    award = soup.find('div', class_='sc-fcdc3619-0 YgLMu base').ul.text
    num = re.findall(r'\d+', award)

    # Use the whole number, not just its first digit (11 Oscars must not
    # become 1), and only when the banner is about Oscars rather than some
    # other prize.
    # NOTE(review): assumes the banner is worded "Won N Oscar(s)" - confirm
    # against the live page.
    won = re.match(r'Won\s+(\d+)\s+Oscar', award)
    oscars = int(won.group(1)) if won else 0

    # The last two numbers of the summary are taken as wins and nominations.
    if len(num) >= 2:
        awards, nominations = num[-2], num[-1]
    else:
        awards = nominations = None

    return genres, director, minutes, gross, oscars, awards, nominations

## Main page scraping

In [8]:
from urllib.parse import urljoin

# The hrefs on the chart start with '/', so plain concatenation with the
# base produced 'https://www.imdb.com//title/...'; urljoin resolves the
# link against the site root correctly.
base = 'https://www.imdb.com/'

movies = soup.find('tbody', class_='lister-list').find_all('tr')
for movie in movies:
    # Rank, title, year and the detail link all live in the same cell.
    title_cell = movie.find('td', class_='titleColumn')
    rank = title_cell.get_text(strip=True).split('.')[0]  # text before the first '.'
    name = title_cell.a.text
    year = title_cell.span.text.replace('(', '').replace(')', '')
    rating = movie.find('td', class_='ratingColumn imdbRating').strong.text

    href = title_cell.a.get('href')
    url = urljoin(base, href)

    # One extra HTTP request per movie for the fields missing from the chart.
    genres, director, minutes, gross, oscars, awards, nominations = more_info(url)

    csv_writer.writerow([rank, name, year, director, minutes, genres, rating, gross, oscars, awards, nominations])

## Close csv file

In [9]:
# Closing flushes any buffered rows to imdbtop250.csv and releases the handle.
csv_file.close()