# Web Scraping
https://www.imdb.com/search/title/?count=100&groups=top_1000&sort=user_rating

In [None]:
# pip install selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
import pandas as pd
import numpy as np
import time


def get_movies(num_movies, path):
    
    #Initializing the webdriver
    options = Options()
    options.add_argument("start-maximized")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    driver = webdriver.Chrome(chrome_options=options, executable_path=path)

    driver.set_window_size(1120, 1000)

    #Open Specified URL with keyword you want to search for
    URL = "https://www.imdb.com/search/title/?count=100&groups=top_1000&sort=user_rating"
    driver.get(URL)
    
    movies = []
    ignored_exceptions=(NoSuchElementException,StaleElementReferenceException,)
    

    while len(movies) < num_movies:
        
        try:
            #Wait until the web page load
            element = WebDriverWait(driver, 10, ignored_exceptions=ignored_exceptions).until(
                    EC.presence_of_element_located((By.ID, "main"))
                    )
        
            movie_button = element.find_elements_by_xpath('.//div[@class="lister-item mode-advanced"]')

            for movie in movie_button:

                print("Progress: {}".format("" + str(len(movies)) + "/" + str(num_movies)))
                if len(movies) >= num_movies:
                    break

                try:
                    title = movie.find_element_by_xpath('.//h3[@class="lister-item-header"]').text
                except NoSuchElementException:
                    title = np.NaN
                
                try: 
                    certificate = movie.find_element_by_xpath('.//span[@class="certificate"]').text
                except NoSuchElementException:
                    certificate = np.NaN
                    
                try:    
                    duration = movie.find_element_by_xpath('.//span[@class="runtime"]').text
                except NoSuchElementException:
                    duration = np.NAN
                    
                try:
                    genre = movie.find_element_by_xpath('.//span[@class="genre"]').text
                except NoSuchElementException:
                    genre = np.NaN
                    
                try:
                    rate = movie.find_element_by_xpath('.//strong').text 
                except NoSuchElementException:
                    rate = np.NaN
                
                try: 
                    metascore = movie.find_element_by_xpath('.//span[@class="metascore  favorable"]').text
                except NoSuchElementException:
                    metascore = np.NaN
                    
                try:
                    descr = movie.find_element_by_xpath('.//p[@class="text-muted"]').text
                except NoSuchElementException:
                    descr = np.NaN
                
                try:
                    cast = movie.find_element_by_xpath('.//p[@class=""]').text
                except NoSuchElementException:
                    cast = np.NaN
                
                try:
                    info = movie.find_element_by_xpath('.//p[@class="sort-num_votes-visible"]').text
                except NoSuchElementException:
                    info = np.NaN
                
                movies.append({
                    "Title": title,
                    "Certificate": certificate,
                    "Duration": duration,
                    "Genre": genre,
                    "Rate": rate,
                    "Metascore": metascore,
                    "Description": descr,
                    "Cast": cast,
                    "Info": info
                    })
            try:
                if len(movies) < num_movies:
                    element.find_element_by_xpath('.//a[@class="lister-page-next next-page"]').click()
                    time.sleep(20)
                
            except NoSuchElementException:
                print("Scraping terminated before reaching target number of movies. Needed {}, got {}.".format(num_movies, len(movies)))
                driver.quit()
                break   
            
        finally:
            if len(movies) >= num_movies:
                driver.quit()

    return pd.DataFrame(data=movies)

PATH = "C:\Program Files (x86)\chromedriver.exe"
df = get_movies(1000, PATH)
df.to_csv("IMDB top 1000.csv")

# Data Cleaning

In [326]:
df = pd.read_csv("IMDB top 1000.csv" ,index_col=[0])
df.head()

Unnamed: 0,Title,Certificate,Duration,Genre,Rate,Metascore,Description,Cast,Info
0,1. Esaretin Bedeli (1994),13+,142 min,Drama,93,80.0,Two imprisoned men bond over a number of years...,"Director: Frank Darabont | Stars: Tim Robbins,...",Votes: 2.377.420 | Gross: $28.34M
1,2. Baba (1972),16,175 min,"Crime, Drama",92,100.0,An organized crime dynasty's aging patriarch t...,Director: Francis Ford Coppola | Stars: Marlon...,Votes: 1.646.772 | Gross: $134.97M
2,3. Kara Şövalye (2008),16,152 min,"Action, Crime, Drama",90,84.0,When the menace known as the Joker wreaks havo...,Director: Christopher Nolan | Stars: Christian...,Votes: 2.341.191 | Gross: $534.86M
3,4. Baba 2 (1974),12,202 min,"Crime, Drama",90,90.0,The early life and career of Vito Corleone in ...,Director: Francis Ford Coppola | Stars: Al Pac...,Votes: 1.146.636 | Gross: $57.30M
4,5. 12 Öfkeli Adam (1957),14,96 min,"Crime, Drama",90,96.0,A jury holdout attempts to prevent a miscarria...,"Director: Sidney Lumet | Stars: Henry Fonda, L...",Votes: 701.867 | Gross: $4.36M


In [327]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Title        1000 non-null   object 
 1   Certificate  891 non-null    object 
 2   Duration     1000 non-null   object 
 3   Genre        1000 non-null   object 
 4   Rate         1000 non-null   object 
 5   Metascore    712 non-null    float64
 6   Description  1000 non-null   object 
 7   Cast         1000 non-null   object 
 8   Info         1000 non-null   object 
dtypes: float64(1), object(8)
memory usage: 78.1+ KB


In [328]:
df.Duration = df.Duration.str.extract("""(\d*) min""")[0]
df.Duration = df.Duration.astype(int)
df.Rate = df.Rate.apply(lambda x: '.'.join(x.split(','))).astype('float64')

In [329]:
df = df.drop(['Certificate','Metascore'], axis=1)
df.head(2)

Unnamed: 0,Title,Duration,Genre,Rate,Description,Cast,Info
0,1. Esaretin Bedeli (1994),142,Drama,9.3,Two imprisoned men bond over a number of years...,"Director: Frank Darabont | Stars: Tim Robbins,...",Votes: 2.377.420 | Gross: $28.34M
1,2. Baba (1972),175,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,Director: Francis Ford Coppola | Stars: Marlon...,Votes: 1.646.772 | Gross: $134.97M


In [330]:
df['Year'] = df.Title.str.extract('((\d\d\d\d))')[0].astype(int)
df.head(2)

Unnamed: 0,Title,Duration,Genre,Rate,Description,Cast,Info,Year
0,1. Esaretin Bedeli (1994),142,Drama,9.3,Two imprisoned men bond over a number of years...,"Director: Frank Darabont | Stars: Tim Robbins,...",Votes: 2.377.420 | Gross: $28.34M,1994
1,2. Baba (1972),175,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,Director: Francis Ford Coppola | Stars: Marlon...,Votes: 1.646.772 | Gross: $134.97M,1972


In [331]:
df['Title'] = df.Title.str.extract("""\d\. ([a-zA-Z0-9ıİöÖçÇşŞğĞüÜâôÔ·,?!-_': ]*) \(\d{4}\)""")[0]
df.head(2)

Unnamed: 0,Title,Duration,Genre,Rate,Description,Cast,Info,Year
0,Esaretin Bedeli,142,Drama,9.3,Two imprisoned men bond over a number of years...,"Director: Frank Darabont | Stars: Tim Robbins,...",Votes: 2.377.420 | Gross: $28.34M,1994
1,Baba,175,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,Director: Francis Ford Coppola | Stars: Marlon...,Votes: 1.646.772 | Gross: $134.97M,1972


In [332]:
df['Directors'] = df.Cast.str.extract("""Director\w*: (([a-zA-ZıİöÖçÇşŞğĞüÜâáôÔéÉ_\,.\-' ]*)) |""")[0]
df.head(2)

Unnamed: 0,Title,Duration,Genre,Rate,Description,Cast,Info,Year,Directors
0,Esaretin Bedeli,142,Drama,9.3,Two imprisoned men bond over a number of years...,"Director: Frank Darabont | Stars: Tim Robbins,...",Votes: 2.377.420 | Gross: $28.34M,1994,Frank Darabont
1,Baba,175,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,Director: Francis Ford Coppola | Stars: Marlon...,Votes: 1.646.772 | Gross: $134.97M,1972,Francis Ford Coppola


In [333]:
df['Stars'] = df.Cast.str.extract("""Stars: (([a-zA-ZıİöÖçÇşŞğĞüÜâáôÔéÉ_\,.\-' ]*))""")[0]
df = df.drop('Cast', axis=1)
df.head(2)

Unnamed: 0,Title,Duration,Genre,Rate,Description,Info,Year,Directors,Stars
0,Esaretin Bedeli,142,Drama,9.3,Two imprisoned men bond over a number of years...,Votes: 2.377.420 | Gross: $28.34M,1994,Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi..."
1,Baba,175,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,Votes: 1.646.772 | Gross: $134.97M,1972,Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Diane Ke..."


In [334]:
df['Votes'] = df.Info.str.extract("""Votes: ([0-9. ]*)""")[0].apply(lambda x:x.strip()).apply(lambda x: ''.join(x.split('.'))).astype(int)
df.head(2)

Unnamed: 0,Title,Duration,Genre,Rate,Description,Info,Year,Directors,Stars,Votes
0,Esaretin Bedeli,142,Drama,9.3,Two imprisoned men bond over a number of years...,Votes: 2.377.420 | Gross: $28.34M,1994,Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",2377420
1,Baba,175,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,Votes: 1.646.772 | Gross: $134.97M,1972,Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Diane Ke...",1646772


In [335]:

df['Gross($M)'] = df.Info.str.extract("""Gross: .([0-9. ]*)M""")[0]
df = df.fillna('None')
df['Gross($M)'] = df['Gross($M)'].apply(lambda x: ''.join(x.split('.')))
df['Gross($M)'] = df['Gross($M)'].replace({'None':np.nan})
df['Gross($M)'] = df['Gross($M)'].astype('float64')
df = df.drop('Info', axis=1)
df.head(2)

Unnamed: 0,Title,Duration,Genre,Rate,Description,Year,Directors,Stars,Votes,Gross($M)
0,Esaretin Bedeli,142,Drama,9.3,Two imprisoned men bond over a number of years...,1994,Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",2377420,2834.0
1,Baba,175,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,1972,Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Diane Ke...",1646772,13497.0


In [336]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Title        1000 non-null   object 
 1   Duration     1000 non-null   int32  
 2   Genre        1000 non-null   object 
 3   Rate         1000 non-null   float64
 4   Description  1000 non-null   object 
 5   Year         1000 non-null   int32  
 6   Directors    1000 non-null   object 
 7   Stars        1000 non-null   object 
 8   Votes        1000 non-null   int32  
 9   Gross($M)    820 non-null    float64
dtypes: float64(2), int32(3), object(5)
memory usage: 74.2+ KB


In [337]:
df

Unnamed: 0,Title,Duration,Genre,Rate,Description,Year,Directors,Stars,Votes,Gross($M)
0,Esaretin Bedeli,142,Drama,9.3,Two imprisoned men bond over a number of years...,1994,Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",2377420,2834.0
1,Baba,175,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,1972,Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Diane Ke...",1646772,13497.0
2,Kara Şövalye,152,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,2008,Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",2341191,53486.0
3,Baba 2,202,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,1974,Francis Ford Coppola,"Al Pacino, Robert De Niro, Robert Duvall, Dian...",1146636,5730.0
4,12 Öfkeli Adam,96,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,1957,Sidney Lumet,"Henry Fonda, Lee J. Cobb, Martin Balsam, John ...",701867,436.0
...,...,...,...,...,...,...,...,...,...,...
995,Persona,85,"Drama, Thriller",8.1,A nurse is put in charge of a mute actress and...,1966,Ingmar Bergman,"Bibi Andersson, Liv Ullmann, Margaretha Krook,...",105591,
996,Andrei Rublev,205,"Biography, Drama, History",8.1,"The life, times and afflictions of the fifteen...",1966,Andrei Tarkovsky,"Anatoliy Solonitsyn, Ivan Lapikov, Nikolay Gri...",48104,10.0
997,Cezayir Savaşı,121,"Drama, War",8.1,"In the 1950s, fear and violence escalate as th...",1966,Gillo Pontecorvo,"Brahim Hadjadj, Jean Martin, Yacef Saadi, Sami...",54088,6.0
998,Yok Edici Melek,95,"Drama, Fantasy",8.1,The guests at an upper-class dinner party find...,1962,Luis,"Silvia Pinal, Jacqueline Andere, Enrique Ramba...",30160,


In [338]:
df.to_csv("IMDB top 1000 (cleaned).csv")