# Web Scraping
>
> The purpose of this notebook is to scrape movie information from IMDb. The data collected are:
> - title
> - year
> - category
> - country
> - actor
> - director
> - grade
> - length
> - number of votes

In [20]:
from time import sleep, time
from random import randint
from IPython.display import clear_output
import numpy as np
from tqdm import tqdm
import requests
from requests import get
from bs4 import BeautifulSoup
import numpy as np
import re

# `warn` is called below for non-200 responses; import it here so that branch
# reports the problem instead of raising NameError.
from warnings import warn

# IMDb advanced search for feature films released 2000-2020; the value of the
# `start` query parameter is appended for each request.
url_base = 'https://www.imdb.com/search/title/?title_type=feature&release_date=2000-01-01,2020-12-31&start='
# Result offsets 1, 51, ..., 9951 (200 requests; presumably 50 results per page).
start = np.arange(1, 10000, 50)

# One list per scraped field. Index k of every list describes the same movie,
# so the lists must always grow together.
title = []
year = []
certif = []
runtime = []
cat = []
grade = []
nb_vote = []
country = []
language = []
director = []
star = []
request = 0
start_time = time()

# href patterns used on each movie page, compiled once instead of once per movie.
country_href = re.compile("country")
language_href = re.compile("language")


def _first_link_text(credit_blocks, position):
    """Return the text of the first <a> in credit_blocks[position], or None.

    None is returned when the page has fewer credit blocks than expected or
    the block contains no link, instead of raising IndexError/AttributeError.
    """
    if position >= len(credit_blocks):
        return None
    link = credit_blocks[position].a
    return link.text if link is not None else None


for i in tqdm(start):  # iterating over the result pages

    url = url_base + str(i)

    # get the website
    site = get(url)

    # break - avoid overloading the site
    print('\n')
    print("break...")
    sleep(randint(8,15))
    print("finished \n")

    # update the number of requests
    request += 1
    elapsed_time = time() - start_time
    print('Request: {}; Frequency {} requests/s'.format(request, request/elapsed_time))
    clear_output(wait=True)

    # error msg
    if site.status_code != 200:
        warn("Request: {}, Stauts code: {}".format(request, site.status_code))

    # BeautifulSoup
    parser = BeautifulSoup(site.text, 'html.parser')

    for movie in parser.find_all(class_="lister-item mode-advanced"):

        # Only movies that have both a rating (<strong>) and a certificate are kept.
        rating_tag = movie.find("strong")
        certif_tag = movie.find("span", class_="certificate")
        if rating_tag is None or certif_tag is None:
            continue

        # Each tag is looked up once; optional fields fall back to None.
        header_link = movie.find("h3", class_="lister-item-header").find("a")
        year_tag = movie.find("span", class_="lister-item-year text-muted unbold")
        runtime_tag = movie.find("span", class_="runtime")
        genre_tag = movie.find("span", class_="genre")
        votes_tag = movie.find("span", attrs={"name": "nv"})

        # year: drop the first and last character of the tag text
        movie_year = year_tag.text[1:-1] if year_tag is not None else None

        # length: first whitespace-separated token, as an integer
        movie_runtime = int(runtime_tag.text.split()[0]) if runtime_tag is not None else None

        # category: first word of the first comma-separated genre
        movie_cat = genre_tag.text.split(",")[0].split()[0] if genre_tag is not None else None

        # grade
        movie_grade = float(rating_tag.text)

        # nb of votes
        movie_votes = int(votes_tag["data-value"]) if votes_tag is not None else None

        # move to the movie's webpage
        link_movie = "https://www.imdb.com" + header_link["href"]
        parser_movie = BeautifulSoup(get(link_movie).text, 'html.parser')

        # country / language: text of the first link whose href matches
        country_tag = parser_movie.find("a", href=country_href)
        language_tag = parser_movie.find("a", href=language_href)

        # director and actor: the first credit block is used for the director;
        # the star comes from the third block when there are exactly three
        # blocks, otherwise from the second.
        casting = parser_movie.find_all("div", class_="credit_summary_item")
        star_position = 2 if len(casting) == 3 else 1
        movie_director = _first_link_text(casting, 0)
        movie_star = _first_link_text(casting, star_position)

        # Append everything at the end so an exception raised while parsing
        # one movie cannot leave the parallel lists with different lengths.
        title.append(header_link.text)
        year.append(movie_year)
        certif.append(certif_tag.text)
        runtime.append(movie_runtime)
        cat.append(movie_cat)
        grade.append(movie_grade)
        nb_vote.append(movie_votes)
        country.append(country_tag.text if country_tag is not None else None)
        language.append(language_tag.text if language_tag is not None else None)
        director.append(movie_director)
        star.append(movie_star)
            

100%|██████████| 200/200 [2:42:58<00:00, 48.89s/it]


# Create the DataBase
>
> Once the data is collected, we create the SQLite database `imdb.db`. It contains 5 tables holding the data.

In [22]:
import sqlite3

# Open the SQLite file "imdb.db" and keep one cursor for all later statements.
connexion = sqlite3.connect("imdb.db")
c = connexion.cursor()
# Ask SQLite to enforce FOREIGN KEY constraints on this connection.
c.execute("PRAGMA foreign_keys = on;")

## Create the tables
> **Category** : the categories of the movie

In [23]:
# Category: one row per movie category, keyed by an integer id.
query = "".join([
    "CREATE TABLE Category(",
    "id integer PRIMARY KEY,",
    "category text);",
])

c.execute(query)
connexion.commit()

> **Film**: information about the movies. To distinguish the categories, the foreign key `category_id` is linked to the column `id` of the table `Category`

In [24]:
# Film: one row per movie; category_id references Category(id).
query = "".join([
    "CREATE TABLE Film(",
    "id integer PRIMARY KEY,",
    "title text, ",
    "grade real, ",
    "length integer, ",
    "nb_vote integer,",
    "language text,",
    "country text,",
    "category_id integer,",
    "FOREIGN KEY(category_id) REFERENCES Category(id));",
])

c.execute(query)
connexion.commit()

> **Actor**: all the actors of the database

In [25]:
# Actor: one row per actor name, keyed by an integer id.
query = "".join([
    "CREATE TABLE Actor(",
    "id integer PRIMARY KEY,",
    "actor text);",
])

c.execute(query)
connexion.commit()

> **Director**: all the directors of the database

In [26]:
# Director: one row per director name, keyed by an integer id.
query = "".join([
    "CREATE TABLE Director(",
    "id integer PRIMARY KEY,",
    "director text);",
])

c.execute(query)
connexion.commit()

> **Casting**: an intermediate table linking each movie to its director (`Director`) and its actor (`Actor`)

In [27]:
# Casting: link table whose rows reference Film(id), Actor(id) and Director(id).
query = "".join([
    "CREATE TABLE Casting(",
    "id integer PRIMARY KEY,",
    "id_title integer,",
    "actor_id integer, ",
    "director_id integer,",
    "FOREIGN KEY(id_title) REFERENCES Film(id), ",
    "FOREIGN KEY(actor_id) REFERENCES Actor(id),",
    "FOREIGN KEY(director_id) REFERENCES Director(id));",
])

c.execute(query)
connexion.commit()

## Insert values

In [28]:
def _id_map(values):
    """Map each distinct non-None value to a sequential id (0, 1, ...) in sorted order.

    None entries (fields the scraper could not find) are skipped: they cannot
    be sorted together with strings and are stored as NULL references instead.
    """
    distinct = sorted({value for value in values if value is not None})
    return {value: idx for idx, value in enumerate(distinct)}


# Category
category2id = _id_map(cat)
c.executemany(
    'INSERT INTO Category values (?,?)',
    ((idx, name) for name, idx in category2id.items()),
)


# Film
# Columns are named explicitly so each value lands in the intended column:
# the table declares `language` before `country`, while the scraped lists are
# zipped as (country, language).
film_rows = (
    (film_id, titre, note, duree, votes, langue, pays, category2id.get(category))
    for film_id, (titre, note, duree, votes, pays, langue, category) in enumerate(
        zip(title, grade, runtime, nb_vote, country, language, cat)
    )
)
c.executemany(
    'INSERT INTO Film (id, title, grade, length, nb_vote, language, country, category_id) '
    'values (?,?,?,?,?,?,?,?)',
    film_rows,
)


# Actor
actor2id = _id_map(star)
id2actor = {idx: name for name, idx in actor2id.items()}
c.executemany(
    'INSERT INTO Actor values (?,?)',
    ((idx, name) for name, idx in actor2id.items()),
)


# Director
director2id = _id_map(director)
id2director = {idx: name for name, idx in director2id.items()}
c.executemany(
    'INSERT INTO Director values (?,?)',
    ((idx, name) for name, idx in director2id.items()),
)


# Casting
# Film ids were assigned by position above, so the position in the scraped
# lists is the film id. Looking the title up with title.index() would map
# every film sharing a title to the first one and is quadratic overall.
casting_rows = (
    (film_id, film_id, actor2id.get(actor), director2id.get(dire))
    for film_id, (_titre, actor, dire) in enumerate(zip(title, star, director))
)
c.executemany('INSERT INTO Casting values (?,?,?,?)', casting_rows)


connexion.commit()