In [1]:
# Dependencies
from bs4 import BeautifulSoup
import requests


In [2]:
# Import SQL Alchemy
from sqlalchemy import create_engine

# Import the declarative factory and create Base, the parent class that the
# ORM model class defined below inherits from
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()

# Import modules to declare columns and column data types
from sqlalchemy import Column, Integer, String, Float

In [3]:
# ORM model: one row per movie scraped from the IMDb Top 250 chart
class imdb_movie(Base):
    """A single entry of the IMDb Top 250 chart.

    Despite its name, ``title`` stores the IMDb title id (for example
    ``tt0111161``); the human-readable name is kept in ``movie_name``.
    """
    __tablename__ = 'imdb_top_250_movies'
    # IMDb title id, used as the primary key
    title = Column(String(length=30), primary_key=True)
    # Movie name as displayed on the chart page
    movie_name = Column(String(length=500))


In [4]:
# Open (or create) the local SQLite file that will hold the scraped chart.
# To target another backend, swap in that backend's URL here and read any
# credentials from the environment rather than writing them in the notebook.
engine = create_engine('sqlite:///imdb_top_250_moviesDB.db')
# Emit CREATE TABLE for every model registered on Base
Base.metadata.create_all(bind=engine)

In [5]:
# A Session is the handle used to stage objects and query the database;
# it is bound to the engine created above
from sqlalchemy.orm import Session
session = Session(engine)

In [6]:
# Address of the IMDb Top 250 chart page to scrape
url = "https://www.imdb.com/chart/top/?ref_=nv_mv_250"

In [7]:
# Retrieve page with the requests module.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of letting an
# error page be parsed further down.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [8]:
# Parse the downloaded HTML with Python's built-in 'html.parser' backend
soup = BeautifulSoup(markup=response.text, features='html.parser')

In [9]:
# Examine the results, then determine element that contains sought info.
# Only the beginning of the document is printed so the notebook output stays
# readable; raise PREVIEW_CHARS to inspect more of the page.
PREVIEW_CHARS = 3000
print(soup.prettify()[:PREVIEW_CHARS])

<!DOCTYPE html>
<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <script type="text/javascript">
   var IMDbTimer={starttime: new Date().getTime(),pt:'java'};
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <title>
   Top 250 Movies - IMDb
  </title>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   if (typeof uex == 'function') {
      uex("ld", "LoadTitle", {wb: 1});
    }
  </script>
  <link href="https://www.imdb.com/chart/top/" rel="canonical"/>
  <meta content="http://w

In [10]:
# Collect every <td> cell carrying the "titleColumn" CSS class; find_all
# hands them back as an iterable, list-like result set
top_250_results = soup.find_all('td', attrs={'class': 'titleColumn'})

In [11]:
# Walk every chart cell, extract the movie name and IMDb id, and stage one
# database row per movie
for result in top_250_results:
    # Error handling: a cell that lacks the expected markup is reported and
    # skipped instead of aborting the whole scrape
    try:
        anchor = result.find('a')
        # Movie name is the text of the first link in the cell
        title_name = anchor.text
        # Relative link to the movie page, e.g. '/title/tt0111161/'
        link = anchor['href']
        # Splitting on '/' gives ['', 'title', '<id>', ''], so index 2 is the id
        title_strings = link.split('/')

        # Store and print only when both the name and the link are available
        if (title_name and link):
            imdb_title_id = title_strings[2]
            print('-------------')
            print(title_name)
            print(link)
            print(imdb_title_id)
            # merge() instead of add(): the id is the primary key, so
            # re-running the notebook against an existing database file
            # updates the stored row rather than failing on a duplicate key
            # when the session is committed
            session.merge(imdb_movie(title=imdb_title_id, movie_name=title_name))

    except (AttributeError, KeyError, IndexError) as e:
        # AttributeError: no <a> tag; KeyError: <a> without href;
        # IndexError: href without an id segment
        print(e)


-------------
The Shawshank Redemption
/title/tt0111161/
tt0111161
-------------
The Godfather
/title/tt0068646/
tt0068646
-------------
The Godfather: Part II
/title/tt0071562/
tt0071562
-------------
The Dark Knight
/title/tt0468569/
tt0468569
-------------
12 Angry Men
/title/tt0050083/
tt0050083
-------------
Schindler's List
/title/tt0108052/
tt0108052
-------------
The Lord of the Rings: The Return of the King
/title/tt0167260/
tt0167260
-------------
Pulp Fiction
/title/tt0110912/
tt0110912
-------------
The Good, the Bad and the Ugly
/title/tt0060196/
tt0060196
-------------
The Lord of the Rings: The Fellowship of the Ring
/title/tt0120737/
tt0120737
-------------
Fight Club
/title/tt0137523/
tt0137523
-------------
Forrest Gump
/title/tt0109830/
tt0109830
-------------
Inception
/title/tt1375666/
tt1375666
-------------
The Lord of the Rings: The Two Towers
/title/tt0167261/
tt0167261
-------------
Star Wars: Episode V - The Empire Strikes Back
/title/tt0080684/
tt0080684
---

/title/tt0113247/
tt0113247
-------------
The 400 Blows
/title/tt0053198/
tt0053198
-------------
A Silent Voice: The Movie
/title/tt5323662/
tt5323662
-------------
Platoon
/title/tt0091763/
tt0091763
-------------
Spotlight
/title/tt1895587/
tt1895587
-------------
Monsters, Inc.
/title/tt0198781/
tt0198781
-------------
Rebecca
/title/tt0032976/
tt0032976
-------------
Life of Brian
/title/tt0079470/
tt0079470
-------------
Hotel Rwanda
/title/tt0395169/
tt0395169
-------------
The Bandit
/title/tt0116231/
tt0116231
-------------
In the Mood for Love
/title/tt0118694/
tt0118694
-------------
Rush
/title/tt1979320/
tt1979320
-------------
Into the Wild
/title/tt0758758/
tt0758758
-------------
Rocky
/title/tt0075148/
tt0075148
-------------
Amores perros
/title/tt0245712/
tt0245712
-------------
Nausicaä of the Valley of the Wind
/title/tt0087544/
tt0087544
-------------
96
/title/tt7019842/
tt7019842
-------------
Andrei Rublev
/title/tt0060107/
tt0060107
-------------
It Happened O

In [12]:
# Write the staged rows to the database. If the commit fails, roll the
# transaction back before re-raising, and release the session either way.
try:
    session.commit()
except Exception:
    session.rollback()
    raise
finally:
    session.close()