In [1]:
from splinter import Browser
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pymongo
import pandas as pd
import requests
from sqlalchemy import create_engine

In [2]:
# open browser
browser = Browser('chrome')
url = 'http://quotes.toscrape.com/'

In [3]:
# get author born, 
def quote_author(url):
    author_info = {}
    
    # request author html data
    authorPage = requests.get(url).text
    
    # parse response
    author = BeautifulSoup(authorPage, 'html.parser')
    
    # find author details
    author_info['name'] = author.find('h3', class_ = 'author-title').text
    author_info['birthday'] = author.find('span', class_ = 'author-born-date').text
    author_info['location'] = author.find('span', class_ = 'author-born-location').text
    author_info['description'] = author.find('div', class_ = 'author-description').text.strip()
    
    return author_info

In [4]:
# get tags
def get_tags(quote):
    tagList = []
    
    quoteTags = quote.find_all('a', class_='tag')
    
    for tag in quoteTags:
        tagList.append(tag.text)
    
    return tagList

In [5]:
# get quote info
def quote_data(quote):
    quote_info = {}
    quote_info['quote_text'] = quote.find('span', class_='text').text
    auth_url = urljoin(url, quote.find('a')['href'])
    
    # get author data
    quote_info['author'] = quote_author(auth_url)
    
    # get tag data
    quote_info['tag'] = get_tags(quote)
    
    return quote_info

In [6]:
# get list of quote boxes
def get_quoteList(pageNumber):
    quoteList = []
    
    # get html data from current page
    html = browser.html
    
    # use BeautifulSoup to parse html data
    soup = BeautifulSoup(html, 'html.parser')
    
    # find all quote blocks
    quotes = soup.find_all('div', class_='quote')
    
    # initialize quote ids
    quoteID = (pageNumber - 1) * 10
    
    # for each quote in the list retuned
    for quote in quotes:
        # increment quote id
        quoteID += 1
        
        # get all quote data returned as a dictionary
        quote_dic = quote_data(quote)
        quote_dic['_id'] = quoteID
        
        quoteList.append(quote_dic)
    
#         quoteList[0]: {_id: 1,
#                         quote_text: ,
#                         author:{name:
#                                 birthday:      
#                                 born:
#                                 description:}
#                         tags: []
#                       }
    
    return quoteList

In [7]:
# scrape everything
def scrap_everything(url):
    all_quotes = []
    
    first_iterations = True
    nextPage = True
    pageNumber = 1
    
    while nextPage == True:
        
        # if this is the first time through the while loop, navigate to url 
        if first_iterations == True:
            browser.visit(url)
            first_iterations = False
        else:
            pass
        
        # get all quote data from the page
        print(f'Scraping Page {pageNumber}')
        currentList = get_quoteList(pageNumber)
        all_quotes =  all_quotes + currentList
        
        try:
            browser.links.find_by_partial_text('Next').click()
            pageNumber += 1  
        except:
            print("Scraping Complete")
            nextPage = False
    
    return all_quotes
    

In [8]:
data = scrap_everything(url)

Scraping Page 1
Scraping Page 2
Scraping Page 3
Scraping Page 4
Scraping Page 5
Scraping Page 6
Scraping Page 7
Scraping Page 8
Scraping Page 9
Scraping Page 10
Scraping Complete


In [29]:
browser.quit()

### Send data to MongoDB

In [9]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [10]:
# Define database and collection
db = client.quoteslist_db
collection = db.items

In [11]:
# Dictionary to be inserted as a MongoDB document
collection.insert_many(data)


<pymongo.results.InsertManyResult at 0x19cc20dfe40>

### Move data from  MongoDB to put into postgres

In [12]:
# mongo_data = db.mongo_data
mongo_df = pd.DataFrame(list(collection.find()))
mongo_df

Unnamed: 0,_id,quote_text,author,tag
0,1,“The world as we have created it is a process ...,"{'name': 'Albert Einstein', 'birthday': 'March...","[change, deep-thoughts, thinking, world]"
1,2,"“It is our choices, Harry, that show what we t...","{'name': 'J.K. Rowling', 'birthday': 'July 31,...","[abilities, choices]"
2,3,“There are only two ways to live your life. On...,"{'name': 'Albert Einstein', 'birthday': 'March...","[inspirational, life, live, miracle, miracles]"
3,4,"“The person, be it gentleman or lady, who has ...","{'name': 'Jane Austen', 'birthday': 'December ...","[aliteracy, books, classic, humor]"
4,5,"“Imperfection is beauty, madness is genius and...","{'name': 'Marilyn Monroe', 'birthday': 'June 0...","[be-yourself, inspirational]"
...,...,...,...,...
95,96,“You never really understand a person until yo...,"{'name': 'Harper Lee', 'birthday': 'April 28, ...",[better-life-empathy]
96,97,“You have to write the book that wants to be w...,"{'name': 'Madeleine L'Engle', 'birthday': 'Nov...","[books, children, difficult, grown-ups, write,..."
97,98,“Never tell the truth to people who are not wo...,"{'name': 'Mark Twain', 'birthday': 'November 3...",[truth]
98,99,"“A person's a person, no matter how small.”","{'name': 'Dr. Seuss', 'birthday': 'March 02, 1...",[inspirational]


In [19]:
# then we need to manipulate the Dataframe data to look how we want it
#  table withe _id & Text & Author name
quote_df = mongo_df[['_id','quote_text','author']].set_index(['_id'])
quote_df

Unnamed: 0_level_0,quote_text,author
_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,“The world as we have created it is a process ...,"{'name': 'Albert Einstein', 'birthday': 'March..."
2,"“It is our choices, Harry, that show what we t...","{'name': 'J.K. Rowling', 'birthday': 'July 31,..."
3,“There are only two ways to live your life. On...,"{'name': 'Albert Einstein', 'birthday': 'March..."
4,"“The person, be it gentleman or lady, who has ...","{'name': 'Jane Austen', 'birthday': 'December ..."
5,"“Imperfection is beauty, madness is genius and...","{'name': 'Marilyn Monroe', 'birthday': 'June 0..."
...,...,...
96,“You never really understand a person until yo...,"{'name': 'Harper Lee', 'birthday': 'April 28, ..."
97,“You have to write the book that wants to be w...,"{'name': 'Madeleine L'Engle', 'birthday': 'Nov..."
98,“Never tell the truth to people who are not wo...,"{'name': 'Mark Twain', 'birthday': 'November 3..."
99,"“A person's a person, no matter how small.”","{'name': 'Dr. Seuss', 'birthday': 'March 02, 1..."


In [26]:
# create author series
author_df = mongo_df[['author']]
author_df

Unnamed: 0,author
0,"{'name': 'Albert Einstein', 'birthday': 'March..."
1,"{'name': 'J.K. Rowling', 'birthday': 'July 31,..."
2,"{'name': 'Albert Einstein', 'birthday': 'March..."
3,"{'name': 'Jane Austen', 'birthday': 'December ..."
4,"{'name': 'Marilyn Monroe', 'birthday': 'June 0..."
...,...
95,"{'name': 'Harper Lee', 'birthday': 'April 28, ..."
96,"{'name': 'Madeleine L'Engle', 'birthday': 'Nov..."
97,"{'name': 'Mark Twain', 'birthday': 'November 3..."
98,"{'name': 'Dr. Seuss', 'birthday': 'March 02, 1..."


In [30]:
# # create author series
# from collections import OrderedDict
# # iterate over the list of MongoDB dict documents
# for author, data in enumerate(author_df):
#         data = OrderedDict(data)
        

ValueError: need more than 1 value to unpack

In [None]:
data

In [None]:
#  table withe _id & Text & Author name
# table with _id and Tag
# tabel with author name, data, location, description

In [None]:
#  table withe _id & Text & Author name
# table with _id and Tag
# tabel with author name, data, location, description

In [None]:
#  table withe _id & Text & Author name
# table with _id and Tag
# tabel with author name, data, location, description