# Scraping
* getting HTML
* parsing HTML

In [9]:
# Import our libraries
import json
import requests

import pandas as pd

from bs4 import BeautifulSoup
from pymongo import MongoClient

In [2]:
# Scrape an item

# Step 1: pick the URL and its query parameters


url = "https://www.ebay.com/sch/i.html"
# Query-string parameters; "_nkw" is set to the search phrase
params = {"_nkw": "noise cancelling headphones bluetooth wireless"}

# Step 1 - Get HTML

In [3]:
# make a request and start scraping
# requests has no default timeout, so without one this cell can hang forever
# if the server never responds.
r = requests.get(url=url, params=params, timeout=10)

In [4]:
r.status_code

200

In [8]:
# r.content is just the html file of the webpage
r.content[:3000]

b'<!DOCTYPE html><!--[if IE 9]><html class="ie9 srp-ds6" lang="en"><![endif]--><!--[if gt IE 9]><!--><html class="srp-ds6" lang="en"><!--<![endif]--><head><meta http-equiv="X-UA-Compatible" content="IE=edge"><script>"use strict";if(window.PerformanceObserver&&performance&&performance.mark&&performance.getEntriesByName){window.SRP=window.SRP||{};var paintObserver=new window.PerformanceObserver(function(e){var r=e.getEntries();r.sort(function(e,r){return e.startTime-r.startTime});var n=r[1].startTime;window.SRP.TTI_TIMER={lastInteractiveWindow:n};var t=new window.PerformanceObserver(function(e){for(var r=e.getEntries(),i=0,a=r.length;i<a;i++)r[i].startTime-n>=5e3&&(window.SRP.TTI_TIMER.timeToInteract=n,t.disconnect()),n=r[i].startTime+r[i].duration,window.SRP.TTI_TIMER.lastInteractiveWindow=n});t.observe({entryTypes:["longtask"]}),paintObserver.disconnect()});paintObserver.observe({entryTypes:["paint"]})};</script><link rel="preconnect" href="https://ir.ebaystatic.com" /><meta name="robo

# Step 2 - Parse HTML

In [24]:
webpage_soup = BeautifulSoup(r.content, 'html.parser')

### .find vs .find_all
* .find -> finds the first instance -> returns a soup object
* .find_all -> finds all instances -> returns a list of soup objects

### what is a soup object?
* like a cursor that lets you query into a string of html

In [14]:
results_list = webpage_soup.find_all("div", attrs={"class":"srp-river-results"})
results = results_list[0]

In [27]:
result_boxes = results.find_all("div", attrs={"class": "s-item__wrapper"})
len(result_boxes)

63

In [30]:
# Pull the title and price text out of the first result box
first_box = result_boxes[0]
name = first_box.find("h3", attrs={"class":"s-item__title"}).text
price = first_box.find("span", attrs={"class":"s-item__price"}).text
# Bare last expression so the cell actually displays the extracted values
name, price

('Waterproof Bluetooth 5.0 Earbuds Headphones Wireless Headset Noise Cancelling',
 '$13.99 to $35.99')

# let's do this for all the boxes now! 

In [43]:
def get_list_of_data(result_boxes):
    """Extract the name and price text from each search-result box.

    Parameters
    ----------
    result_boxes : iterable
        Soup elements, one per listing. Each must support
        ``.find(tag, attrs=...)``.

    Returns
    -------
    list of dict
        One ``{"name": ..., "price": ...}`` dict per box, in input order.
        A field is ``None`` when the box contains no matching element.
    """
    def _text_or_none(element):
        # .find returns None when nothing matches; calling .text on None
        # would raise AttributeError and abort the whole scrape.
        return element.text if element is not None else None

    list_of_data = []
    for box in result_boxes:
        title_tag = box.find("h3", attrs={"class": "s-item__title"})
        price_tag = box.find("span", attrs={"class": "s-item__price"})
        list_of_data.append(
            {"name": _text_or_none(title_tag), "price": _text_or_none(price_tag)}
        )

    return list_of_data

In [44]:
pd.DataFrame(get_list_of_data(result_boxes))

Unnamed: 0,name,price
0,Waterproof Bluetooth 5.0 Earbuds Headphones Wi...,$13.99 to $35.99
1,A6S Wireless Earbuds Bluetooth 5.0 Headphones ...,$20.99
2,Bluetooth Wireless Headphones Bluedio T5S Nois...,$26.00
3,Waterproof Bluetooth 5.0 Earbuds Headphones Wi...,$23.39
4,2020 Wireless Earbuds Bluetooth 5.0 Headphones...,$18.60
...,...,...
58,AUSDOM ANC8 Active Noise Cancelling Bluetooth ...,$43.00
59,Cowin E7 [2018 Upgraded] Noise Cancelling Head...,$47.99
60,Bluetooth Noise Cancelling Headphones Over Ear...,$18.99
61,Bluedio T5S Bluetooth V4.2 Headphones Wireless...,$26.00


In [47]:
def get_result_boxes(webpage_soup):
    """Return the listing boxes inside the page's search-results container.

    Parameters
    ----------
    webpage_soup : soup object
        Parsed page supporting ``.find(tag, attrs=...)``.

    Returns
    -------
    list
        Every ``div`` with class ``s-item__wrapper`` found inside the first
        ``div`` with class ``srp-river-results``; an empty list when the page
        has no such container.
    """
    results_container = webpage_soup.find("div", attrs={"class": "srp-river-results"})
    if results_container is None:
        # No results container on this page: report "no boxes" instead of
        # raising IndexError.
        return []
    return results_container.find_all("div", attrs={"class": "s-item__wrapper"})

In [48]:
# let's do this for multiple pages
# Scrape an item

# step 1 get a url and its params

all_data = []
url = "https://www.ebay.com/sch/i.html"
for page_number in range(1, 6):
    # The key must be the bare name "_pgn". requests builds the query string
    # itself and percent-encodes a literal "&", so the old key "&_pgn" was
    # sent as "%26_pgn" and the server never received a "_pgn" parameter.
    params = {"_nkw": "noise cancelling headphones bluetooth wireless", "_pgn": page_number}
    # requests has no default timeout; set one so a stalled page cannot hang the loop.
    r = requests.get(url=url, params=params, timeout=10)
    webpage_soup = BeautifulSoup(r.content, 'html.parser')
    result_boxes = get_result_boxes(webpage_soup)
    page_list_of_data = get_list_of_data(result_boxes)
    # Accumulate this page's records into the combined list
    all_data.extend(page_list_of_data)

In [49]:
len(all_data)

315

In [50]:
df = pd.DataFrame(all_data)
df.head()

Unnamed: 0,name,price
0,Waterproof Bluetooth 5.0 Earbuds Headphones Wi...,$13.99 to $35.99
1,A6S Wireless Earbuds Bluetooth 5.0 Headphones ...,$20.99
2,Waterproof Bluetooth 5.0 Earbuds Headphones Wi...,$23.39
3,2020 Wireless Earbuds Bluetooth 5.0 Headphones...,$18.60
4,Bluetooth Wireless Headphones Bluedio T5S Nois...,$26.00


# We want to store our data
Options
* MongoDB
* SQL
* xlsx
* csv
* tsv
* json

## Let's store in various ways

In [52]:
# csv
df.to_csv("ebay_earbuds.csv", index=False)

In [53]:
# tsv
df.to_csv("ebay_earbuds.tsv", index=False, sep='\t')

In [54]:
# text
# No `sep` is passed, so the content is still comma-separated; only the file extension is .txt
df.to_csv("ebay_earbuds.txt", index=False)

In [58]:
# json
# orient='records' with lines=True writes one JSON object per row, one row per line
df.to_json("ebay_earbuds.json", lines=True, orient='records')

In [59]:
import sqlite3

In [60]:
conn = sqlite3.connect('ebay_earbuds.db')

In [61]:
# The .db file persists on disk between runs, so if_exists='fail' raises
# ValueError the second time this cell is executed. 'replace' drops and
# rewrites the table, which keeps the cell re-runnable.
df.to_sql("earbuds", conn, if_exists='replace', index=True)


In [62]:
conn.execute("select name from sqlite_master where type='table';").fetchall()

[('earbuds',)]

# Storing in MongoDB

In [63]:
client = MongoClient(host='localhost', port=27017)

In [64]:
client.list_database_names()

['AprFT',
 'admin',
 'config',
 'ebay',
 'headstorm_client_db',
 'local',
 'music_tweets',
 'new_db',
 'tweets']

In [65]:
ebay = client['ebay']

In [66]:
ebay.list_collection_names()

['collection_one', 'tech_bags']

In [67]:
earbuds = ebay['earbuds']

In [68]:
# insert_many adds an "_id" key to every dict it is given, in place. Insert
# shallow copies so all_data stays unchanged and re-running this cell does not
# fail on duplicate "_id" values.
earbuds.insert_many([dict(record) for record in all_data])

<pymongo.results.InsertManyResult at 0x1242c3848>

In [69]:
ebay.list_collection_names()

['earbuds', 'collection_one', 'tech_bags']