## Daily price data from Diem pot delivery Portland website
Use requests, Beautiful Soup, and Pandas packages to retrieve html data from https://portland.hellodiem.com/, to locate product information, parse title and price into lists, remove superfluous pre-sale values, combine into a Pandas DataFrame, and save to a csv file.

In [1]:
def get_heading_list(url, page):
    # Packages the request, send the request and catch the response: r
    r = requests.get(url+'page/'+str(page)+'/')
    html_text = r.text

    #set soup list equal to html text from page
    soup = BeautifulSoup(html_text, 'html.parser')
    
    headings = [[], []]
    #get list of product title headings on page
    headings[0] = soup.find_all('h2', class_='woocommerce-loop-product__title')
    
    #get list of product price headings on page
    headings[1] = soup.find_all('span', class_='woocommerce-Price-amount amount')
    pop_extra_price(headings[1])
    return headings

def pop_extra_price(headingss):
    """helper function to remove extra pre-sale prices from final list. Input list of 
    headers, get back list of headers less those that had a particular tag"""
    for item in headingss:
        if item.parent.name == 'del':
            headingss.pop(headingss.index(item))


In [2]:
def strip_product(header_list):
    """input a list of tag-type values and return list of strings with surrounding html characters removed"""
    string_list = ['' for item in range(len(header_list))]
    for item in range(len(header_list)):
        string_list[item] = str(header_list[item])[44:-5]
    return string_list

def strip_price(header_list):
    """input a list of tag-type values and return list of strings with surrounding html characters removed"""
    string_list = ['' for item in range(len(header_list))]
    for item in range(len(header_list)):
        string_list[item] = str(header_list[item])[101:-7]
    return string_list

In [3]:
def build_df(price_sers):
    today_date = str(dt.date.today())
    
    combined = pd.concat(price_sers, axis=1)
    #give each column unique name, prices_...
    combined.columns = ['prices_' + str(i) for i in range(1, (len(combined.columns)) + 1)]
    combined.index.name = 'items'
    
    return combined

In [4]:
def check_changes(daily_price_df):
    """
    input: dataframe containing daily price data
    output: dictionary of changed products with the most recent change value"""
    changes = {}
    #check which items have changed price, sold out, or been added
    for price in list(daily_price_df.iterrows()):
        for i in range(len(price)):
            #compare each price with its predecessor to determine last change
            if price[1][i] != price[1][i+1]:
                #append the item name and change to 'changes' dictionary
                changes[price[0]] = (price[1][i+1] - price[1][i])    
                
    return changes

In [5]:
from bs4 import BeautifulSoup
import requests
max_page = 12
url = 'https://portland.hellodiem.com/'

In [6]:
#create a list to hold lists of headers, one for each page
h_product = ['' for strng in range(max_page)]
h_price = ['' for strng in range(max_page)]

#iterate over each list of headers for a page, set each index of headers equal to list
for page in range(0, max_page):
    headers = get_heading_list(url, page+1)
    h_product[page] = headers[0]
    h_price[page] = headers[1]
    
combined_products = []
combined_prices = []

for sublist in range(max_page):
    combined_products.append(strip_product(h_product[sublist]))
    
for sublist in range(max_page):
    combined_prices.append(strip_price(h_price[sublist]))

#create a list of all prices
all_prices = []
for by_page in combined_prices:
    for li in by_page:
        #print(li, '\n')
        all_prices.append(float(li))
        
#create a list of all products
all_products = []
for by_page in combined_products:
    for li in by_page:
        #print(li, '\n')
        all_products.append(li)

In [7]:
import pandas as pd
product_ser = pd.Series(all_products)
price_ser = pd.Series(all_prices)

pot = pd.DataFrame([product_ser, price_ser]).T

In [8]:
#import datetime, needed to get current date
import datetime as dt
#set today_date, formatted as YYYY-MM-DD
today_date = str(dt.date.today())
#print(today_date)


In [9]:
#write to csv and set index to position rather than name, use
pot.to_csv(('prices'+ today_date +'.csv'), encoding='utf8', 
           header=['item_name', 'item_price'], index_label='relative_position')

## load from file

In [10]:
import pandas as pd

In [11]:
file_list = ['prices2018-05-08.csv', 
             'prices2018-05-09.csv', 
             'prices2018-05-10.csv', 
             'prices2018-05-11.csv', 
             'prices2018-05-12.csv', 
             'prices2018-05-13.csv']

In [12]:
#------------------------------------------------------------------------------


"""
#did not explicitly specify utf8 encoding when creating first file, and now need 
#to specify ansi when opening
pot1 = pd.read_csv('prices2018-05-07.csv', encoding='ansi')
pot1.set_index(pot1['0'], inplace=True)
pot1 = pot1[["1"]]
pot1.columns = ['item_price']
pot1.index.name = 'item_name'
"""

#read files into dataframes to prepare for combination
pots = [pd.read_csv(file, index_col='item_name') for file in file_list]
pots = [pots[i][['item_price']] for i in range(len(pots))]

#combine into new df
pot_all = build_df(pots)

pot_all.head(20)

Unnamed: 0_level_0,prices_1,prices_2,prices_3,prices_4,prices_5,prices_6
items,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1:1 Cannabis RSO,40.0,40.0,40.0,40.0,40.0,40.0
9lb Hammer Shatter,19.0,19.0,19.0,19.0,19.0,19.0
9lb Hammer Shatter 5-pack ($15 OFF),80.0,80.0,80.0,80.0,80.0,80.0
ATF Oil Cartridge 1 gram,65.0,65.0,40.0,40.0,65.0,65.0
Amnesia Pre-roll 1 gram,9.0,9.0,9.0,9.0,9.0,9.0
BIC Lighter,1.75,1.75,1.75,,,
Berry Larry,10.0,10.0,10.0,10.0,10.0,10.0
Big Bud Skunk CO2 Oil,22.0,22.0,22.0,22.0,22.0,22.0
Big Bud Skunk CO2 Oil 3-pack ($12 OFF),54.0,54.0,54.0,54.0,65.0,65.0
Blackberry Gummy,5.0,5.0,5.0,5.0,5.0,5.0


In [13]:
#quick stats
print(pot_all.describe(), '\n')

#print range least to most expensive item and price
print(pot_all.idxmin()[0], ('$' + str(pot_all.loc[pot_all.idxmin()[0]][0])))
print(pot_all.idxmax()[0], ('$' + str(pot_all.loc[pot_all.idxmax()[0]][0])))


         prices_1   prices_2    prices_3    prices_4    prices_5    prices_6
count  210.000000  211.00000  215.000000  216.000000  216.000000  216.000000
mean    38.416667   38.86019   39.509302   40.197917   40.072917   40.072917
std     33.652753   33.74572   34.619414   34.515690   34.461497   34.461497
min      1.750000    1.75000    1.750000    3.500000    3.500000    3.500000
25%     13.000000   13.00000   13.000000   13.000000   13.000000   13.000000
50%     27.000000   28.00000   28.000000   28.000000   28.000000   28.000000
75%     54.000000   57.50000   59.500000   60.000000   60.000000   60.000000
max    200.000000  200.00000  200.000000  200.000000  200.000000  200.000000 

BIC Lighter $1.75
DISCOUNT OUNCE ~ Mendo Breath ~ ($164 OFF) $200.0


In [14]:
last_changes = (check_changes(pot_all))
for item in last_changes:
    print(item, ": ", last_changes[item])

ATF Oil Cartridge 1 gram :  -25.0
Blue Dragon Oil Cartridge 1/2 gram :  7.0
Blue Mystic Oil Cartridge 1 gram :  2.0
Bubba Kush Oil Cartridge 1 gram :  nan
Bubba Kush Vape Pen :  nan
Candyland Shatter :  -3.0
Candyland Shatter 3-pack ($10 OFF) :  35.0
DISCOUNT HALF OZ ~ GS Cookies ~ ($50 OFF) :  nan
DISCOUNT HALF OZ ~ Ringo’s Gift ~ ($30 OFF) :  nan
Durban Poison Pre-roll 1 gram :  nan
Fruit Punch Oil Cartridge 1/2 gram :  -9.0
GS Cookies :  nan
Green Quack Pre-roll 20-pack ($60 OFF) :  nan
Green Quack Pre-roll 5-pack ($8 OFF) :  nan
Jilly Bean Live Resin Cartridge 1/2 gram :  2.0
Kimbo Kush Shatter 3-pack ($30 OFF) :  nan
Lemon Skunk Shatter :  -31.0
Marionberry Indica Gummies 3-pack ($8 OFF) :  nan
Master Kush Oil Cartridge 1 gram :  5.0
Master Yoda Shatter :  -33.0
Master Yoda Shatter 3-pack ($9 OFF) :  32.0
OG Cheese Oil Cartridge 1 gram :  7.0
Obama Kush Oil Cartridge 1 gram :  63.0
Obama Kush Oil Cartridge 3-pack ($21 OFF) :  nan
Pearls Sativa :  nan
Ringo’s Gift :  nan
Snowland S