In [2]:
# the Python Requests package will allow us to send HTTP requests to get HTML files
import requests

# the GET method indicates that you’re trying to get or retrieve data from a specified resource. 
# to make a GET request, invoke requests.get()
from requests import get

# Beautiful Soup is a Python library for pulling data out of HTML and XML files
from bs4 import BeautifulSoup

# pandas is a Python data analysis library
import pandas as pd

# NumPy is a Python library used for working with large, multi-dimensional arrays and matrices
import numpy as np

# Datetime is a module that supplies classes for manipulating dates and times
from datetime import datetime

# The os module provides a portable way of using operating system dependent functionality
import os

# re can be used to work with Regular Expressions
import re

# The os.path module implements some useful functions on pathnames
import os.path as path

In [3]:
# load previously saved observations from the CSV file in the working directory;
# when no such file exists yet, start from an empty dataframe that has the required columns
if not path.exists('rosebudremedy.csv'):
    columns = ['date', 'product', 'price']
    existing_data = pd.DataFrame(columns=columns)
else:
    existing_data = pd.read_csv('rosebudremedy.csv')

In [None]:
# display the existing data (read from the CSV, or the empty placeholder dataframe) for inspection
existing_data

As of April 2022, the Canada Cannabis Dispensary website (where Rosebud Remedy prices are displayed) has been offline since March 2022 and is not accessible at the URL used below. The following block of code will raise a connection error for as long as the website is down, but it should be used again if the website returns. If the site comes back under a different URL, the URL in the code should be updated accordingly.

In [4]:
# the URL we want to obtain data from
url = 'https://www.canadacannabisdispensary.co/product-category/rosebud-remedy/'

# store scraped data in lists
all_names = []
all_prices = []
date = []

# obtain the date the script is being run; computed once, before the loop, so that
# every row scraped in this run carries the same date
current_date = datetime.today().strftime('%Y-%m-%d')

# a timeout prevents the cell from hanging indefinitely if the server never responds
results = requests.get(url, timeout=30)

# fail loudly on an HTTP error status (e.g. 404 or 500) instead of parsing an error page
# and silently recording no products
results.raise_for_status()

# parse web content into Python-readable format
soup = BeautifulSoup(results.content, "html.parser")

# NOTE: a class_ string containing several space-separated classes matches only tags whose
# class attribute is exactly this string (same classes, same order)
products = soup.find_all('a', class_="woocommerce-LoopProduct-link woocommerce-loop-product__link")

for product in products:
    # locate where the name of each product is stored
    name_tag = product.find('h2', class_='woocommerce-loop-product__title')

    # locate where the price of each product is stored
    # quantity is not relevant for this type of product
    price_tag = product.find('span', class_='woocommerce-Price-amount amount')

    # find() returns None when the tag is absent; skip such entries rather than
    # crashing with an AttributeError on .text
    if name_tag is None or price_tag is None:
        print('Skipping a product entry with a missing name or price')
        continue

    name = name_tag.text
    price = price_tag.text

    all_names.append(name)
    all_prices.append(price)
    date.append(current_date)

    print(name)
    print(price)
    print(current_date)


ConnectionError: HTTPSConnectionPool(host='www.canadacannabisdispensary.co', port=443): Max retries exceeded with url: /product-category/rosebud-remedy/ (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fe34f1e0280>: Failed to establish a new connection: [Errno 61] Connection refused'))

In [5]:
# assemble the scraped lists into a dataframe, one column per list
scraped_columns = {
    'date': date,
    'product': all_names,
    'price': all_prices,
}
new_data = pd.DataFrame(scraped_columns)

In [6]:
# display the dataframe built from the newly scraped data for inspection
new_data

Unnamed: 0,date,product,price


In [7]:
# stack the newly scraped rows beneath the existing rows
# (pd.concat defaults to axis=0, i.e. it appends rows rather than adding columns)
frames = [existing_data, new_data]
joined_data = pd.concat(frames)

In [None]:
# display the combined existing and new data for inspection
joined_data

In [None]:
# keep only the last row for each (date, product) pair, in case the same data was added twice;
# drop_duplicates returns a new dataframe and leaves joined_data untouched
dedup_keys = ['date', 'product']
final_data = joined_data.drop_duplicates(subset=dedup_keys, keep='last')

In [None]:
# display the de-duplicated data for inspection
final_data

In [9]:
# define filename for data
output_path = 'rosebudremedy.csv'

# overwrite previous file with new file containing previous and new data;
# write final_data (the de-duplicated frame) rather than joined_data, otherwise rows
# added twice for the same date and product would be persisted to the CSV
final_data.to_csv(output_path, index = False)