# IMDb Webscraping using BeautifulSoup
### 1. Importing necessary libraries

In [2]:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
from time import sleep
import re
import pandas as pd
import sqlite3

### 2. Getting html of IMDb Top 250 movies

In [3]:
# Address of the IMDb "Top 250" chart page that lists every film to scrape.
movie_url = 'https://www.imdb.com/chart/top?ref_=nv_mv_250'

# Download the raw HTML. The context manager closes the connection on every
# path, including when read() raises part-way through the download.
with uReq(movie_url) as uClient:
    page_html = uClient.read()

# Parse the chart page so the film links can be extracted from it.
page_soup = soup(page_html, "html.parser")

### 3. Creating list of links to film pages

In [5]:
# Each chart row keeps the film title in a <td class="titleColumn"> cell whose
# first anchor carries the link to the film's own page.
title_cells = page_soup.findAll("td",{"class":"titleColumn"})
page_list = [cell.a['href'] for cell in title_cells]

In [7]:
# Sanity check: show how many film links were collected from the chart page.
len(page_list)

250

### 4. Scraping 250 films
\* _The scraping code is wrapped in a try/except block because not all film pages contain all of the components. This could be avoided by determining which elements cause errors and deciding how to fill the missing values, but because this was a test database, we decided to omit the films that caused problems. This resulted in info on 239 films._

In [7]:
# Every record must supply exactly this many values: one per `?` placeholder
# of the 12-column INSERT that later writes `film_items` to the database.
FILM_RECORD_LENGTH = 12


def read_titles(film_soup):
    """Return ``(pl_title, title)`` as found on a parsed film page.

    ``pl_title`` is the text of the page <title> without its last 14
    characters. ``title`` is cut out of the ``originalTitle`` div and is
    ``''`` when the page has no such div.
    """
    pl_title = film_soup.title.string[:-14]
    # str(None) is 'None', so the slice yields '' when the div is missing.
    # NOTE(review): the fixed 27/-56 offsets presumably strip the opening
    # <div> tag and the trailing "(original title)" markup - confirm against
    # the current page layout.
    title = str(film_soup.find('div',{'class':'originalTitle'}))[27:-56]
    return pl_title, title


def parse_film(film_soup):
    """Build the complete database record for one parsed film page.

    Returns a tuple of ``FILM_RECORD_LENGTH`` values: a ``None`` placeholder
    for the id, then pl_title, title, id_title, description, year, time,
    genre, poster_url, one value per credit block that is not "Stars:" (in
    page order), and finally the comma-separated stars.

    Raises whatever the lookups raise when an expected element is missing,
    and ``ValueError`` when the page has no "Stars:" block or the credits do
    not add up to the expected record length.
    """
    pl_title, title = read_titles(film_soup)

    print(title)

    # Pages without a separate original title fall back to the page title.
    if title=='':
        title=pl_title
        print('zmiana',title)

    # Slug: lower-cased title with every run of non-word characters turned into '-'.
    id_title = '-'.join(re.split(r'\W+', title.lower()))
    description = film_soup.find('div',{'class':'summary_text'}).string.strip()
    year = film_soup.title.string[-12:-8]
    runtime = film_soup.time.string.strip()
    genre = film_soup.find('div',{'class':'title_wrapper'}).findAll('a')[1].string
    poster_url = film_soup.find('div',{'class':'poster'}).img['src']

    # Reset for every film so a page without a "Stars:" block can never
    # inherit the stars of the previously scraped film.
    film_stars = None
    other_credits = []
    for credit in film_soup.findAll('div',{'class':'credit_summary_item'}):
        if credit.h4.string=='Stars:':
            # The last anchor of the block is skipped, as in the original scraper.
            film_stars = ", ".join(star.string for star in credit.findAll('a')[:-1])
        else:
            other_credits.append(str(credit.a.string))

    if film_stars is None:
        raise ValueError('film page has no "Stars:" credit block')

    record = (
        (None, pl_title, title, id_title, description, year, runtime, genre, poster_url)
        + tuple(other_credits)
        + (film_stars,)
    )
    # A missing or extra credit block would make the later INSERT fail for the
    # whole batch, so reject the film here instead.
    if len(record) != FILM_RECORD_LENGTH:
        raise ValueError(
            'expected %d fields, got %d' % (FILM_RECORD_LENGTH, len(record))
        )
    return record


film_items = []

for film_path in page_list:
    link = 'https://imdb.com'+ film_path
    # Separate names keep the chart's `page_soup` intact for earlier cells, and
    # the context manager closes the connection even if read() fails.
    with uReq(link) as film_response:
        film_soup = soup(film_response.read(), "html.parser")

    try:
        # The record is appended only once it is complete, so a failure can
        # never leave a truncated tuple behind in `film_items`.
        film_items.append(parse_film(film_soup))
    except Exception as error:
        # Problematic films are skipped on purpose; report which one and why.
        try:
            pl_title, title = read_titles(film_soup)
        except Exception:
            # Even the titles are unreadable: identify the film by its link.
            pl_title, title = link, ''
        print('nie dziala', pl_title, title, repr(error))

One Flew Over the Cuckoo's Nest
Goodfellas
The Matrix
Shichinin no samurai
Cidade de Deus
Se7en
Star Wars
The Silence of the Lambs
It's a Wonderful Life
La vita è bella
The Usual Suspects
Sen to Chihiro no kamikakushi
Saving Private Ryan
Spider-Man: Into the Spider-Verse
Léon
The Green Mile

zmiana Interstellar
Psycho
American History X
City Lights
C'era una volta il West

zmiana Casablanca
Modern Times
The Pianist
Intouchables
The Departed
Terminator 2: Judgment Day
Back to the Future

zmiana Whiplash
Rear Window
Raiders of the Lost Ark
nie dziala Poszukiwacze zaginionej arki Raiders of the Lost Ark
The Lion King

zmiana Gladiator
The Prestige
Apocalypse Now

zmiana Memento
Alien
The Great Dictator
Nuovo Cinema Paradiso
Hotaru no haka
Avengers: Infinity War
Sunset Blvd.
Das Leben der Anderen
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb
Paths of Glory
The Shining
Django Unchained

zmiana WALL·E
Mononoke-hime
Witness for the Prosecution

zmiana American Beauty
Th

### 5. Inserting scraped films into database

In [10]:
# Write all scraped records to the `films` table of the project database.
# NOTE(review): re-running this cell presumably appends the same films again
# unless the table enforces uniqueness - confirm against the schema.
conn = sqlite3.connect('project.db')
try:
    # Used as a context manager, the connection commits when the block
    # succeeds and rolls back when it raises, so a malformed record cannot
    # leave a partially written batch behind.
    with conn:
        c = conn.cursor()
        c.executemany('INSERT INTO films VALUES (?,?,?,?,?,?,?,?,?,?,?,?)', film_items)
finally:
    # Always release the database file, even when the insert failed.
    conn.close()

### 6. Viewing added films

In [12]:
# Read every stored film back from the database. No commit is needed for a
# SELECT, and the connection is closed even if the query fails.
conn = sqlite3.connect('project.db')
try:
    c = conn.cursor()
    c.execute('SELECT * FROM films')
    listOfResults=c.fetchall()
finally:
    conn.close()

# NOTE(review): the scraper stores (id, pl_title, title, id_title, ...), so the
# labels 'name', 'pl_title' and 'title' look shifted by one relative to the
# values they hold - confirm against the table schema before relying on them.
film_columns = ['id_film','name','pl_title','title','description','year','time','genre','poster_url','director','writer','stars']

# Passing the labels and the 1-based index to the constructor also works for
# an empty table, where assigning `.columns` afterwards raised a length
# mismatch.
film_dataframe = pd.DataFrame(
    listOfResults,
    columns=film_columns,
    index=pd.RangeIndex(1, len(listOfResults) + 1),
)

# Show the first stored film as a quick visual check.
film_dataframe.loc[1,:]

id_film                                                        1
name                                        Skazani na Shawshank
pl_title                                The Shawshank Redemption
title                                   the-shawshank-redemption
description    Two imprisoned men bond over a number of years...
year                                                        1994
time                                                    2h 22min
genre                                                      Drama
poster_url     https://m.media-amazon.com/images/M/MV5BMDFkYT...
director                                          Frank Darabont
writer                                              Stephen King
stars                    Tim Robbins, Morgan Freeman, Bob Gunton
Name: 1, dtype: object