In [13]:
# the Python Requests package will allow us to send HTTP requests to get HTML files
import requests

# the GET method indicates that you’re trying to get or retrieve data from a specified resource. 
# to make a GET request, invoke requests.get()
from requests import get

# Beautiful Soup is a Python library for pulling data out of HTML and XML files
from bs4 import BeautifulSoup

# pandas is a Python data analysis library
import pandas as pd

# NumPy is a Python library used for working with large, multi-dimensional arrays and matrices
import numpy as np

# Datetime is a module that supplies classes for manipulating dates and times
from datetime import datetime

# The os module provides a portable way of using operating system dependent functionality
import os

# re can be used to work with Regular Expressions
import re

# The os.path module implements some useful functions on pathnames
import os.path as path

In [14]:
# Load previously saved data from the CSV in the working directory when it is there;
# otherwise start from an empty dataframe that already has the required columns.
csv_found = path.exists('canadacannabisdispensary.csv')
existing_data = (
    pd.read_csv('canadacannabisdispensary.csv')
    if csv_found
    else pd.DataFrame(columns=['date', 'product', 'price', 'quantity'])
)

In [15]:
# display the data read from the existing CSV (or the empty placeholder dataframe)
existing_data

Unnamed: 0,date,product,price,quantity
0,2022-03-10,9 Pound Hammer (AAA),$26.00,3.5
1,2022-03-10,Alaskan Thunder Fuck (AAA),$28.00,3.5
2,2022-03-10,Blueberry (AAA),$23.00,3.5
3,2022-03-10,Cherry Cough (AAA),$19.00,3.5
4,2022-03-10,Donkey Butter (AAA),$26.00,3.5
5,2022-03-10,Godzilla Glue (AAA),$22.00,3.5
6,2022-03-10,Humble Pie (AAA),$28.00,3.5
7,2022-03-10,Meat Breath (AAA),$23.00,3.5
8,2022-03-10,MK Ultra (AAA),$22.00,3.5
9,2022-03-10,Pie Face (AAA),$23.00,3.5


The next code cell will raise a connection error if the illegal website has been taken down, so it should only be run while the website is reachable. If the website reappears under a different URL, update the `url` variable in that cell to the new address.

In [None]:
# the URL we want to obtain data from
url = 'https://www.canadacannabisdispensary.ca/product-category/aaa/'

# seconds to wait for the server before giving up; requests applies no timeout
# unless one is passed, so a stalled connection would otherwise hang this cell
request_timeout = 30

# store scraped data in parallel lists (one entry per successfully scraped product)
all_names = []
all_prices = []
all_quantities = []
date = []

# obtain the date the script is being run; computed once so every row from this
# run carries the same date
current_date = datetime.today().strftime('%Y-%m-%d')

results = requests.get(url, timeout=request_timeout)
# fail loudly on a 4xx/5xx response instead of parsing an error page as a listing
results.raise_for_status()

# parse web content into Python-readable format
soup = BeautifulSoup(results.content, "html.parser")

# from inspecting the HTML, information we need is stored here for each product
# the find_all() function will find every instance of matching tags and filters from the soup
products = soup.find_all('a', class_="woocommerce-LoopProduct-link woocommerce-loop-product__link")

# iterate through each product and obtain data from each
for product in products:
    # obtain where the name of each product is stored
    name_tag = product.find('h2', class_='woocommerce-loop-product__title')

    # obtain where the URL of each individual product page is stored
    product_page = product.get('href')

    # a listing entry without a title or a link cannot be scraped; skip it
    # (with a message) instead of failing with AttributeError on None
    if name_tag is None or not product_page:
        print('Skipping a listing entry with no title or no product link')
        continue
    name = name_tag.text

    # obtain HTML for individual product pages
    # individual product pages contain price and quantity information
    product_page_results = requests.get(product_page, timeout=request_timeout)
    product_page_results.raise_for_status()
    product_page_soup = BeautifulSoup(product_page_results.content, "html.parser")

    # obtain where the price of each product is stored (first matching element)
    price_tag = product_page_soup.find('span', class_='woocommerce-Price-amount amount')
    if price_tag is None:
        print(f'Skipping {name}: no price found on product page')
        continue
    price = price_tag.text

    # obtaining the correct corresponding quantity for a given price requires sorting through all quantity options
    # this is necessary for calculating a price per quantity for each product if desired
    # list to store all quantity options available for a given product
    stripped_quantities = []
    for quantity in product_page_soup.find_all('option'):
        # keep the first number found in each option's text; options with no number are ignored
        strip = re.findall(r"[-+]?(?:\d*\.\d+|\d+)", quantity.text)
        if strip:
            stripped_quantities.append(float(strip[0]))

    # all four lists are appended together, only after every lookup has succeeded,
    # so they always stay the same length
    if stripped_quantities:
        # the price we need corresponds to the smallest quantity available for sale
        all_quantities.append(min(stripped_quantities))
    else:
        all_quantities.append('No quantity options')

    all_names.append(name)
    all_prices.append(price)
    date.append(current_date)

In [8]:
# print each scraped list on its own line: names, prices, quantities, dates
for scraped_values in (all_names, all_prices, all_quantities, date):
    print(scraped_values)

['9 Pound Hammer (AAA)', 'Cherry Cough (AAA)', 'Godzilla Glue (AAA)', 'Humble Pie (AAA)', 'Meat Breath (AAA)', 'MK Ultra (AAA)', 'Pie Face (AAA)', 'Platinum Girl Scout Cookies (AAA)', 'Pot of Gold (AAA)', 'Space Grape (AAA)', 'Tom Ford Pink Kush (AAA)']
['$26.00', '$19.00', '$22.00', '$28.00', '$23.00', '$22.00', '$23.00', '$29.00', '$23.00', '$24.00', '$29.00']
[3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5]
['2022-05-09', '2022-05-09', '2022-05-09', '2022-05-09', '2022-05-09', '2022-05-09', '2022-05-09', '2022-05-09', '2022-05-09', '2022-05-09', '2022-05-09']


In [9]:
# assemble the scraped lists into a dataframe, one column per list
new_data_columns = dict(
    date=date,
    product=all_names,
    price=all_prices,
    quantity=all_quantities,
)
new_data = pd.DataFrame(new_data_columns)

In [10]:
# display the dataframe built from the newly scraped lists
new_data

Unnamed: 0,date,product,price,quantity
0,2022-05-09,9 Pound Hammer (AAA),$26.00,3.5
1,2022-05-09,Cherry Cough (AAA),$19.00,3.5
2,2022-05-09,Godzilla Glue (AAA),$22.00,3.5
3,2022-05-09,Humble Pie (AAA),$28.00,3.5
4,2022-05-09,Meat Breath (AAA),$23.00,3.5
5,2022-05-09,MK Ultra (AAA),$22.00,3.5
6,2022-05-09,Pie Face (AAA),$23.00,3.5
7,2022-05-09,Platinum Girl Scout Cookies (AAA),$29.00,3.5
8,2022-05-09,Pot of Gold (AAA),$23.00,3.5
9,2022-05-09,Space Grape (AAA),$24.00,3.5


In [11]:
# append the newly scraped rows beneath the existing rows (row-wise concatenation);
# ignore_index=True renumbers the result 0..n-1 so row labels are not repeated
joined_data = pd.concat([existing_data, new_data], ignore_index=True)

In [12]:
# display the combined existing and new data
joined_data

Unnamed: 0,date,product,price,quantity
0,3/2022,9 Pound Hammer (AAA),$26.00,3.5
1,3/2022,Alaskan Thunder Fuck (AAA),$28.00,3.5
2,3/2022,Blueberry (AAA),$23.00,3.5
3,3/2022,Cherry Cough (AAA),$19.00,3.5
4,3/2022,Donkey Butter (AAA),$26.00,3.5
5,3/2022,Godzilla Glue (AAA),$22.00,3.5
6,3/2022,Humble Pie (AAA),$28.00,3.5
7,3/2022,Meat Breath (AAA),$23.00,3.5
8,3/2022,MK Ultra (AAA),$22.00,3.5
9,3/2022,Pie Face (AAA),$23.00,3.5


In [None]:
# if the same product was recorded more than once for the same date (e.g. the
# data was added twice), keep only the most recently added row
dedup_keys = ['date', 'product']
final_data = joined_data.drop_duplicates(subset=dedup_keys, keep='last')

In [None]:
# display the de-duplicated data
final_data

In [None]:
# define filename for data
output_path = 'canadacannabisdispensary.csv'

# overwrite previous file with new file containing previous and new data
# (final_data is the de-duplicated frame; index = False leaves the row index out of the CSV)
final_data.to_csv(output_path, index = False)