In [12]:
# the Python Requests package will allow us to send HTTP requests to get HTML files
import requests

# the GET method indicates that you’re trying to get or retrieve data from a specified resource. 
# to make a GET request, invoke requests.get()
from requests import get

# Beautiful Soup is a Python library for pulling data out of HTML and XML files
from bs4 import BeautifulSoup

# pandas is a Python data analysis library
import pandas as pd

# NumPy is a Python library used for working with large, multi-dimensional arrays and matrices
import numpy as np

# Datetime is a module that supplies classes for manipulating dates and times
from datetime import datetime

# The os module provides a portable way of using operating system dependent functionality
import os

# The os.path module implements some useful functions on pathnames
import os.path as path

In [15]:
# reuse the rows saved by earlier runs when the CSV is present in the working directory;
# when it is missing, start from an empty dataframe that already has the required columns
if not path.exists('speedgreens.csv'):
    columns = ['date', 'product', 'category', 'price', 'quantity']
    existing_data = pd.DataFrame(columns=columns)
else:
    existing_data = pd.read_csv('speedgreens.csv')

In [16]:
# preview only the first rows: the stored history grows with every run,
# so rendering the whole dataframe would flood the notebook
existing_data.head(10)

Unnamed: 0,date,product,price,category,quantity
0,2022-04-14,\nM.T.V Hash Weed,$22.00,concentrates/hash/,1g
1,2022-04-14,\nRed Light Windmill Hash From Amsterdam- AAAA...,$22.00,concentrates/hash/,1g
2,2022-04-14,\nRomeo and Juliet Hash,$22.00,concentrates/hash/,1g
3,2022-04-14,\nTwo Sword Hash (AA) – Hash Plant,$8.00,concentrates/hash/,1gm
4,2022-04-14,\nRockstar Tuna Bubble Hash (AAAA),$22.00,concentrates/hash/,1gm
5,2022-04-14,\nPink Gas Premium Craft Bubble Hash (AAAA) – ...,$22.00,concentrates/hash/,1gm
6,2022-04-14,\nTesla Hash (AAA) – Hash Weed,$14.00,concentrates/hash/,1gm
7,2022-04-14,\nDiamond Hash (AAA) – Hash Weed,$14.00,concentrates/hash/,1gm
8,2022-04-14,\nZigzag Hash (AAA) – Hashish,$14.00,concentrates/hash/,1gm
9,2022-04-14,\nAfghan Mazar Sharif Hash (AAAA) – Hashish,$22.00,concentrates/hash/,1gm


In [17]:
# the URLs we want to obtain data from
url = 'https://speedgreens.co/product-category/'
keys = ['concentrates/hash/', 'cannabis/sativa/']

# seconds to wait for the server on each request
# requests applies no timeout by default, so a stalled connection could hang this cell
request_timeout = 30

# store scraped data in lists
# the five lists are appended to together, so position i in each describes the same product
all_names = []
all_prices = []
all_quantities = []
all_categories = []
date = []

# obtain the date the script is being run
# computed once, before the loops, so every row of this run carries the same date
current_date = datetime.today().strftime('%Y-%m-%d')

# iterate through each URL
for key in keys:
    results = requests.get(url + key, timeout=request_timeout)

    # stop on an HTTP error instead of parsing an error page and silently finding no products
    results.raise_for_status()

    # parse web content into Python-readable format
    soup = BeautifulSoup(results.content, "html.parser")

    # from inspecting the HTML, information we need is stored here for each product
    # the find_all() function will find every instance of matching tags and filters from the soup
    products = soup.find_all('div', class_="product-content")

    # iterate through each product and obtain data from each
    for product in products:
        # obtain where the name of each product is stored
        product_link = product.find('a', class_='product-loop-title')

        # find() returns None when the tag is absent; without a link there is no page to visit
        if product_link is None or not product_link.get('href'):
            print('Skipping a listing without a product link in ' + key)
            continue

        # .text keeps the surrounding whitespace (previously saved names start with a newline);
        # it is left untouched so the product key matches rows that are already stored
        name = product_link.text

        # obtain where the URL of each individual product page is stored
        product_page = product_link.get('href')

        # obtain HTML for individual product pages
        # individual product pages contain price and quantity information
        product_page_results = requests.get(product_page, timeout=request_timeout)

        # one failing product page should not discard the rest of the run
        if not product_page_results.ok:
            print('Skipping ' + product_page + ' (HTTP ' + str(product_page_results.status_code) + ')')
            continue

        product_page_soup = BeautifulSoup(product_page_results.content, "html.parser")

        # speedgreens includes out-of-stock items in their regular listings. these are unnecessary
        if product_page_soup.find('p', class_='stock out-of-stock'):
            continue

        # obtain where the price of each product is stored
        price_tag = product_page_soup.find('span', class_='woocommerce-Price-amount amount')

        # obtain where the corresponding quantity of each product is stored
        # this is necessary for calculating a price per quantity for each product if desired
        quantity_tag = product_page_soup.find('a', class_='filter-item')

        # calling .text on a missing tag would raise AttributeError and abort the whole scrape
        if price_tag is None or quantity_tag is None:
            print('Skipping ' + product_page + ' (price or quantity not found)')
            continue

        price = price_tag.text
        quantity = quantity_tag.text

        all_names.append(name)
        all_prices.append(price)
        all_quantities.append(quantity)
        all_categories.append(key)
        date.append(current_date)

        print(name)
        print(product_page)
        print('Price: ' + price)
        print('Unit: ' + quantity)
        print(current_date)



M.T.V Hash Weed 
https://speedgreens.co/product/mtv-hash-aaaa/
Price: $22.00
Unit: 1g
2022-05-09

Red Light Windmill Hash From Amsterdam- AAAA Premium Hashish 
https://speedgreens.co/product/red-light-windmill-hash-from-amsterdam-aaaa/
Price: $22.00
Unit: 1g
2022-05-09

Romeo and Juliet Hash 
https://speedgreens.co/product/romeo-and-juliet-hash-aaaa/
Price: $22.00
Unit: 1g
2022-05-09

Two Sword Hash (AA) – Hash Plant 
https://speedgreens.co/product/two-sword-hash-aa/
Price: $8.00
Unit: 1gm
2022-05-09

Rockstar Tuna Bubble Hash (AAAA) 
https://speedgreens.co/product/tuna-rockstar-craft-hash-aaaa/
Price: $22.00
Unit: 1gm
2022-05-09

Pink Gas Premium Craft Bubble Hash (AAAA) – Hash Weed 
https://speedgreens.co/product/pink-gas-craft-hash-aaaa/
Price: $22.00
Unit: 1gm
2022-05-09

Tesla Hash (AAA) – Hash Weed 
https://speedgreens.co/product/tesla-hash-aaa/
Price: $14.00
Unit: 1gm
2022-05-09

Diamond Hash (AAA) – Hash Weed 
https://speedgreens.co/product/diamond-hash/
Price: $14.00
Unit: 1g

In [18]:
# assemble the lists filled by the scraping loop into one dataframe,
# one column per list, in the order date, product, category, price, quantity
new_data = pd.DataFrame(dict(
    date=date,
    product=all_names,
    category=all_categories,
    price=all_prices,
    quantity=all_quantities,
))

In [19]:
# preview only the first rows of the newly scraped data rather than rendering the whole dataframe
new_data.head(10)

Unnamed: 0,date,product,category,price,quantity
0,2022-05-09,\nM.T.V Hash Weed,concentrates/hash/,$22.00,1g
1,2022-05-09,\nRed Light Windmill Hash From Amsterdam- AAAA...,concentrates/hash/,$22.00,1g
2,2022-05-09,\nRomeo and Juliet Hash,concentrates/hash/,$22.00,1g
3,2022-05-09,\nTwo Sword Hash (AA) – Hash Plant,concentrates/hash/,$8.00,1gm
4,2022-05-09,\nRockstar Tuna Bubble Hash (AAAA),concentrates/hash/,$22.00,1gm
5,2022-05-09,\nPink Gas Premium Craft Bubble Hash (AAAA) – ...,concentrates/hash/,$22.00,1gm
6,2022-05-09,\nTesla Hash (AAA) – Hash Weed,concentrates/hash/,$14.00,1gm
7,2022-05-09,\nDiamond Hash (AAA) – Hash Weed,concentrates/hash/,$14.00,1gm
8,2022-05-09,\nZigzag Hash (AAA) – Hashish,concentrates/hash/,$14.00,1gm
9,2022-05-09,\nAfghan Mazar Sharif Hash (AAAA) – Hashish,concentrates/hash/,$22.00,1gm


In [20]:
# stack the previously stored rows and the newly scraped rows into one dataframe (row-wise)
# ignore_index=True renumbers the rows 0..n-1; otherwise both inputs contribute labels
# starting at 0 and the combined index would contain repeated labels
joined_data = pd.concat([existing_data, new_data], ignore_index=True)

In [21]:
# preview only the first rows of the combined data rather than rendering the whole dataframe
joined_data.head(10)

Unnamed: 0,date,product,price,category,quantity
0,2022-04-14,\nM.T.V Hash Weed,$22.00,concentrates/hash/,1g
1,2022-04-14,\nRed Light Windmill Hash From Amsterdam- AAAA...,$22.00,concentrates/hash/,1g
2,2022-04-14,\nRomeo and Juliet Hash,$22.00,concentrates/hash/,1g
3,2022-04-14,\nTwo Sword Hash (AA) – Hash Plant,$8.00,concentrates/hash/,1gm
4,2022-04-14,\nRockstar Tuna Bubble Hash (AAAA),$22.00,concentrates/hash/,1gm
5,2022-04-14,\nPink Gas Premium Craft Bubble Hash (AAAA) – ...,$22.00,concentrates/hash/,1gm
6,2022-04-14,\nTesla Hash (AAA) – Hash Weed,$14.00,concentrates/hash/,1gm
7,2022-04-14,\nDiamond Hash (AAA) – Hash Weed,$14.00,concentrates/hash/,1gm
8,2022-04-14,\nZigzag Hash (AAA) – Hashish,$14.00,concentrates/hash/,1gm
9,2022-04-14,\nAfghan Mazar Sharif Hash (AAAA) – Hashish,$22.00,concentrates/hash/,1gm


In [22]:
# keep a single row per (date, product) pair, preferring the row that appears last,
# so that adding the same data twice does not leave duplicates
duplicate_keys = ['date', 'product']
final_data = joined_data.drop_duplicates(subset=duplicate_keys, keep='last')

In [23]:
# preview only the first rows of the de-duplicated data rather than rendering the whole dataframe
final_data.head(10)

Unnamed: 0,date,product,price,category,quantity
0,2022-04-14,\nM.T.V Hash Weed,$22.00,concentrates/hash/,1g
1,2022-04-14,\nRed Light Windmill Hash From Amsterdam- AAAA...,$22.00,concentrates/hash/,1g
2,2022-04-14,\nRomeo and Juliet Hash,$22.00,concentrates/hash/,1g
3,2022-04-14,\nTwo Sword Hash (AA) – Hash Plant,$8.00,concentrates/hash/,1gm
4,2022-04-14,\nRockstar Tuna Bubble Hash (AAAA),$22.00,concentrates/hash/,1gm
5,2022-04-14,\nPink Gas Premium Craft Bubble Hash (AAAA) – ...,$22.00,concentrates/hash/,1gm
6,2022-04-14,\nTesla Hash (AAA) – Hash Weed,$14.00,concentrates/hash/,1gm
7,2022-04-14,\nDiamond Hash (AAA) – Hash Weed,$14.00,concentrates/hash/,1gm
8,2022-04-14,\nZigzag Hash (AAA) – Hashish,$14.00,concentrates/hash/,1gm
9,2022-04-14,\nAfghan Mazar Sharif Hash (AAAA) – Hashish,$22.00,concentrates/hash/,1gm


In [24]:
# define filename for data
output_path = 'speedgreens.csv'

# overwrite previous file with new file containing previous and new data
# final_data (not joined_data) is written so the duplicate removal above actually reaches the file
# index = False keeps the row labels out of the CSV
final_data.to_csv(output_path, index = False)