## Get price data from a specific website
Use the requests, Beautiful Soup, and pandas packages to retrieve HTML from https://portland.hellodiem.com/, locate the product information, parse the titles and prices into lists, remove superfluous pre-sale prices, combine the results into a pandas DataFrame, and save it to a CSV file.

In [1]:
def get_heading_list(url, page, timeout=30):
    """Fetch one listing page and return its product-title and price tags.

    Requests ``url + 'page/' + str(page) + '/'``, parses the response body
    with BeautifulSoup's ``html.parser`` and collects the matching tags.

    Args:
        url: Base URL. The page path is appended without a separator, so the
            URL is expected to end with '/'.
        page: Page number to request.
        timeout: Seconds to wait for the server before requests raises a
            timeout error instead of blocking.

    Returns:
        A two-item list ``[titles, prices]``: the ``h2`` tags with class
        ``woocommerce-loop-product__title`` and the ``span`` tags with class
        ``woocommerce-Price-amount amount``, the latter after
        ``pop_extra_price`` has removed pre-sale prices.
    """
    # Without a timeout requests waits indefinitely on an unresponsive server,
    # which would stall the whole multi-page scrape.
    r = requests.get(url + 'page/' + str(page) + '/', timeout=timeout)
    soup = BeautifulSoup(r.text, 'html.parser')

    # Product title headings on the page.
    titles = soup.find_all('h2', class_='woocommerce-loop-product__title')

    # Price headings on the page; pre-sale prices are removed in place.
    prices = soup.find_all('span', class_='woocommerce-Price-amount amount')
    pop_extra_price(prices)
    return [titles, prices]

def pop_extra_price(headingss):
    """Remove pre-sale prices from a list of price tags, in place.

    A price tag counts as pre-sale when its direct parent is a ``del``
    element. The list is modified in place and nothing is returned.

    Args:
        headingss: Mutable list of tag-like objects exposing ``.parent.name``.
    """
    # Rebuild the list in a single pass instead of popping while iterating.
    # Popping mid-loop skips the element that follows each removal, and
    # list.index() finds entries by equality rather than identity, so a
    # pre-sale price equal to an earlier regular price removed the regular
    # entry and left the prices misaligned with the products.
    headingss[:] = [
        item for item in headingss
        if item.parent is None or item.parent.name != 'del'
    ]


In [2]:
def strip_product(header_list):
    """Turn title tags into plain strings by cutting off fixed-width markup.

    Every element is converted with str() and then loses its first 44 and
    last 5 characters; the offsets are sized for the opening
    ``<h2 class="woocommerce-loop-product__title">`` and closing ``</h2>``.
    Returns a new list of the same length.
    """
    return [str(tag)[44:-5] for tag in header_list]

def strip_price(header_list):
    """Turn price tags into plain strings by cutting off fixed-width markup.

    Every element is converted with str() and then loses its first 101 and
    last 7 characters (presumably the opening price span plus a nested
    currency-symbol span, and the closing ``</span>`` -- verify against the
    page markup). Returns a new list of the same length.
    """
    return [str(tag)[101:-7] for tag in header_list]

In [3]:
from bs4 import BeautifulSoup
import requests
# Number of listing pages to scrape; pages 1..max_page are requested.
max_page = 12
# Base URL of the shop; 'page/<n>/' is appended to it for each request.
url = 'https://portland.hellodiem.com/'

In [4]:
#fetch every listing page once, keeping each page's title tags and price tags
h_product = []
h_price = []
for page in range(0, max_page):
    #site pages are numbered from 1
    headers = get_heading_list(url, page+1)
    h_product.append(headers[0])
    h_price.append(headers[1])

#strip the html from the tags, still grouped one sub-list per page
combined_products = [strip_product(page_titles) for page_titles in h_product]
combined_prices = [strip_price(page_prices) for page_prices in h_price]

#flatten the per-page prices into one list of floats
all_prices = []
for by_page in combined_prices:
    all_prices.extend(float(price_text) for price_text in by_page)

#flatten the per-page product names into one list
all_products = []
for by_page in combined_products:
    all_products.extend(by_page)

In [5]:
import pandas as pd
# One Series per future column, each on the default positional index.
price_ser = pd.Series(all_prices)
product_ser = pd.Series(all_products)

# Stack the two Series as rows, then transpose so that items run down the
# rows (column 0 = product name, column 1 = price).
pot = pd.DataFrame([product_ser, price_ser]).transpose()

In [18]:
# Save the scraped table. The two columns are written under the headers
# item_name / item_price and the positional index under 'relative_position'.
pot.to_csv('prices05082018.csv', encoding='utf8', header=['item_name', 'item_price'], index_label='relative_position')

# The first file was written without an explicit utf8 encoding, so it is read
# back with encoding='ansi'. Its columns are addressed as '0' and '1' below.
# NOTE(review): 'ansi' is documented as an alias of the Windows-only 'mbcs'
# codec -- confirm this cell is only ever run on Windows.
pot0 = pd.read_csv('prices05072018.csv', encoding='ansi')
# Key the first file by product name (column '0') and keep only its price
# column ('1'), renamed to match the second file.
pot0.set_index(pot0['0'], inplace=True)
pot1 = pot0[["1"]]
pot1.columns = ['item_price']
# NOTE(review): pot1 may share its Index object with pot0, in which case this
# also renames pot0's index -- confirm that is acceptable.
pot1.index.name = 'item_name'

# TODO: load the dated files in a loop once more of them are available.
# The second file already has named columns: index by product name and keep
# only the price column.
pot2 = pd.read_csv('prices05082018.csv', index_col='item_name')
pot2 = pot2[['item_price']]

In [35]:
# Put the two days side by side, aligning rows on the product-name index.
pot_all = pd.concat([pot1, pot2], axis=1)
# Number the price columns 1..n in concatenation order.
pot_all.columns = ['item_price_' + str(pos) for pos, _ in enumerate(pot_all.columns, 1)]

print(pot_all.head(30))

                                        item_price_1  item_price_2
1:1 Cannabis RSO                               40.00         40.00
9lb Hammer Shatter                             19.00         19.00
9lb Hammer Shatter 5-pack ($15 OFF)            80.00         80.00
ATF Oil Cartridge 1 gram                       65.00         65.00
Amnesia Pre-roll 1 gram                         9.00          9.00
BIC Lighter                                     1.75          1.75
Berry Larry                                    10.00         10.00
Big Bud Skunk CO2 Oil                          22.00         22.00
Big Bud Skunk CO2 Oil 3-pack ($12 OFF)         54.00         54.00
Blackberry Gummy                                5.00          5.00
Blackberry Kush Shatter                        20.00         20.00
Blue Dragon Oil Cartridge 1/2 gram             40.00         33.00
Blue Dream Pre-roll 1 gram                      8.00          8.00
Blue Magoo Pre-roll 1 gram                     11.00         11.00

In [55]:
#check which items have changed price, sold out, or been added
# Position of the first price column; it is compared with the column after it.
# The value never changes, so it is set once instead of on every iteration.
p = 0
for i in range(len(pot_all)):
    # Address both cells purely by position with iloc[row, col]. Indexing the
    # row Series with an integer via [] relies on the deprecated
    # integer-as-position fallback, because the row is labelled by column name.
    # A missing price (NaN) never compares equal, so items present on only one
    # day are reported as well.
    if pot_all.iloc[i, p] != pot_all.iloc[i, p + 1]:
        print(pot_all.iloc[i])

item_price_1    40.0
item_price_2    33.0
Name: Blue Dragon Oil Cartridge 1/2 gram, dtype: float64
item_price_1    33.0
item_price_2    40.0
Name: Blue Mystic Oil Cartridge 1 gram, dtype: float64
item_price_1    22.0
item_price_2     NaN
Name: CBD Pre-roll Sampler 3-pack ($5 OFF), dtype: float64
item_price_1    42.0
item_price_2    40.0
Name: Fruit Punch Oil Cartridge 1/2 gram, dtype: float64
item_price_1    40.0
item_price_2    42.0
Name: Jilly Bean Live Resin Cartridge 1/2 gram, dtype: float64
