In [None]:
from urllib.parse import urljoin

import pandas as pd
import pymongo
import requests
from bs4 import BeautifulSoup
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
from sqlalchemy import create_engine

In [None]:
# open browser
# Start a splinter-driven Chrome session; the scraping functions below read
# and navigate this module-level `browser`.
browser = Browser('chrome')
# Site root: the first page visited by the scrape and the base URL that
# quote_data() joins relative author links onto.
url = 'http://quotes.toscrape.com/'

In [None]:
# get author details (name, birth date, birth location, description)
def quote_author(url, timeout=10):
    """Download an author page and extract the author's details.

    Parameters
    ----------
    url : str
        Absolute URL of the author page.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        `requests.get` can block forever on a stalled connection.

    Returns
    -------
    dict
        Keys 'name', 'birthday', 'location' and 'description', each holding
        the text of the matching element on the page.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within `timeout` seconds.
    AttributeError
        If one of the expected elements is missing from the page
        (`find` returns None in that case).
    """
    # request author html data
    response = requests.get(url, timeout=timeout)
    # fail loudly on an error page instead of trying to parse it
    response.raise_for_status()

    # parse response
    author = BeautifulSoup(response.text, 'html.parser')

    # find author details
    return {
        'name': author.find('h3', class_='author-title').text,
        'birthday': author.find('span', class_='author-born-date').text,
        'location': author.find('span', class_='author-born-location').text,
        'description': author.find('div', class_='author-description').text.strip(),
    }

In [None]:
# get tags
def get_tags(quote):
    """Return the text of every tag link found inside a quote block.

    `quote` is a parsed quote element; every `<a class="tag">` beneath it
    contributes one string to the returned list, in document order.
    """
    return [anchor.text for anchor in quote.find_all('a', class_='tag')]

In [None]:
# get quote info
def quote_data(quote):
    """Build the dictionary describing a single quote block.

    The result has three keys: 'quote_text' (text of the `span.text`
    element), 'author' (the dictionary returned by quote_author for the page
    behind the block's first link) and 'tag' (the list from get_tags).
    """
    text = quote.find('span', class_='text').text

    # the first anchor in the block carries a relative author link;
    # resolve it against the module-level site `url`
    author_href = quote.find('a')['href']
    author_page = urljoin(url, author_href)

    return {
        'quote_text': text,
        'author': quote_author(author_page),
        'tag': get_tags(quote),
    }

In [None]:
# get list of quote boxes
def get_quoteList(pageNumber, quotesPerPage=10):
    """Scrape every quote on the page currently loaded in `browser`.

    Parameters
    ----------
    pageNumber : int
        1-based number of the page being displayed. It is only used to
        compute the ids, so that ids continue from the previous pages.
    quotesPerPage : int, optional
        Number of quotes assumed to be on each preceding page. Defaults to
        10, the value that used to be hard-coded here.

    Returns
    -------
    list of dict
        One dictionary per `div.quote` element, shaped like::

            {'quote_text': str,
             'author': {'name': str,
                        'birthday': str,
                        'location': str,
                        'description': str},
             'tag': [str, ...],
             '_id': int}
    """
    # parse the html of the page the browser is currently showing
    soup = BeautifulSoup(browser.html, 'html.parser')

    # find all quote blocks
    quotes = soup.find_all('div', class_='quote')

    # the first quote on page N gets the id right after the last of page N-1
    first_id = (pageNumber - 1) * quotesPerPage + 1

    quoteList = []
    for quoteID, quote in enumerate(quotes, start=first_id):
        # get all quote data returned as a dictionary
        quote_dic = quote_data(quote)
        quote_dic['_id'] = quoteID
        quoteList.append(quote_dic)

    return quoteList

In [None]:
# scrape everything
def scrap_everything(url):
    """Scrape every page of quotes, starting at `url`.

    Opens `url` in the module-level `browser`, collects the quotes on the
    current page with get_quoteList, then follows the "Next" link until the
    page has none.

    Parameters
    ----------
    url : str
        Address of the first page to scrape.

    Returns
    -------
    list of dict
        The quote dictionaries from all visited pages, in page order.

    Notes
    -----
    Only a missing "Next" link ends the loop. Any other failure (network
    error, browser crash, Ctrl-C) propagates instead of being reported as a
    successful completion with partial data.
    """
    all_quotes = []
    pageNumber = 1

    browser.visit(url)

    while True:
        # get all quote data from the page
        print(f'Scraping Page {pageNumber}')
        all_quotes.extend(get_quoteList(pageNumber))

        try:
            # `.first` raises ElementDoesNotExist when no link matches
            browser.links.find_by_partial_text('Next').first.click()
        except ElementDoesNotExist:
            print("Scraping Complete")
            break

        pageNumber += 1

    return all_quotes
    

In [None]:
# Run the full scrape; `data` is the list of quote dictionaries returned by
# scrap_everything().
data = scrap_everything(url)

In [None]:
# Close the browser session now that scraping is finished.
browser.quit()

### Send data to MongoDB

In [None]:
# Initialize PyMongo to work with MongoDB:
# connect to the server listening on localhost, port 27017.
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [None]:
# Define database and collection:
# select the `quoteslist_db` database and its `items` collection by attribute access.
db = client.quoteslist_db
collection = db.items

In [None]:
# Store each scraped quote dictionary as one MongoDB document.
# Upsert on `_id` instead of insert_many(): the scraper assigns the same ids on
# every run, so re-running this cell with insert_many() fails on duplicate keys.
# The guard skips the write when nothing was scraped (an empty bulk request is
# rejected by PyMongo).
if data:
    collection.bulk_write(
        [pymongo.ReplaceOne({'_id': doc['_id']}, doc, upsert=True) for doc in data]
    )


### Move data from MongoDB into PostgreSQL

In [None]:
# Read every document back from the `items` collection into a DataFrame
# (one row per document), then display it.
mongo_df = pd.DataFrame(list(collection.find()))
mongo_df

In [None]:
# Reshape the DataFrame into the quotes table:
# one row per quote, indexed by _id, with the quote text and the author name.
# The `author` field read from MongoDB is a nested dict, so only its 'name'
# entry is kept here rather than the whole dict.
quote_df = (
    mongo_df[['_id', 'quote_text']]
    .assign(author=mongo_df['author'].map(lambda author: author['name']))
    .set_index('_id')
)
quote_df

In [None]:
# Select the `author` column; the double brackets keep it a one-column
# DataFrame rather than a Series.
# NOTE(review): each cell still holds the whole nested author dict. Expanding it
# into name / birthday / location / description columns looks like the planned
# next step - confirm.
author_df = mongo_df[['author']]
author_df

In [None]:
# # create author series
# from collections import OrderedDict
# # iterate over the list of MongoDB dict documents
# for author, data in enumerate(author_df):
#         data = OrderedDict(data)
        

In [None]:
# Preview the first few scraped quotes instead of dumping the whole list
# (author descriptions are long and bloat the cell output).
data[:3]

In [None]:
# table with _id, quote text & author name
# table with _id and tag
# table with author name, birth date, location, description

In [None]:
# table with _id, quote text & author name
# table with _id and tag
# table with author name, birth date, location, description

In [None]:
# table with _id, quote text & author name
# table with _id and tag
# table with author name, birth date, location, description