In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
def get_product_detail(field, tag, filter_context=None):
    '''Return one detail of one product as a single string.

    field: object exposing ``find_all(tag, filter_context)`` whose results
        carry a ``.text`` attribute (e.g. a parsed BeautifulSoup element).
    tag: name of the tag to look for, e.g. 'div'.
    filter_context: optional attribute filter handed unchanged to
        ``find_all``, e.g. {"class": "m-offer-tile__quantity"}.

    Returns the stripped text of every match joined by single spaces,
    or an empty string when nothing matches.

    Example: get_product_detail(soup, 'div', {"class": "m-offer-tile__quantity"})
    '''
    matches = field.find_all(tag, filter_context)

    stripped_texts = []
    for match in matches:
        stripped_texts.append(match.text.strip())

    return " ".join(stripped_texts)

In [3]:
def get_product_details(product_card, fields_list):
    '''Extract every configured detail from one product card.

    Calls get_product_detail once per entry of ``fields_list``.

    product_card: parsed element for a single product (anything
        get_product_detail accepts as its ``field`` argument).
    fields_list: iterable of (tag, filter_context, title) triples. Only
        ``tag`` and ``filter_context`` are used here; ``title`` is part of
        the triple but ignored by this function.

    Returns a list of strings in the same order as ``fields_list``.
    A list is returned instead of a lazy generator so that the row is
    computed immediately and can be read more than once (a generator would
    be silently empty after its first iteration).
    '''
    return [get_product_detail(product_card, tag, filter_context)
            for tag, filter_context, _title
            in fields_list]

In [4]:
def get_all_products_details(products_list, fields_list):
    '''Build one row of details per product.

    products_list: iterable of product cards.
    fields_list: field specification forwarded unchanged to
        get_product_details for every product.

    Returns a list holding, for each product in order, whatever
    get_product_details returns for it.
    '''
    rows = []
    for product in products_list:
        rows.append(get_product_details(product, fields_list))
    return rows

In [38]:
def get_all_categories_schema(url='https://www.kaufland.ro/oferte/saptamana-curenta.html', timeout=30):
    '''Discover the offer categories linked from the offers overview page.

    url: page whose accordion links are scanned.
    timeout: seconds passed to ``requests.get`` so a stalled connection
        cannot hang the notebook forever.

    Returns a list of (category_id, category_url) tuples, where
    category_url is the absolute link and category_id is the text between
    the first '=' of that link and the next '_' after it.

    Raises requests.HTTPError when the server answers with an error status
    (instead of silently parsing an error page).
    '''
    # contact the site and fail fast on an HTTP error
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    # creating the xml
    soup = BeautifulSoup(r.text, 'lxml')

    # identifying all links
    links = soup.find_all('a', {"class": "m-accordion__link"})

    category_schemas = []
    for link in links:
        href = link.get('href')

        # anchors without an href (None) or not pointing at a category page are skipped
        if not href or '.category' not in href:
            continue

        category_url = 'https://www.kaufland.ro' + href

        # the id sits between '=' and the following '_'; a link lacking either
        # delimiter has no usable id and is skipped explicitly rather than
        # relying on str.find's -1 producing a slice that happens to contain 'htm'
        id_start = category_url.find('=')
        id_end = category_url.find('_', id_start + 1)
        if id_start == -1 or id_end == -1:
            continue
        category_id = category_url[id_start + 1:id_end]

        # ids that contain 'htm' are not categories (filter kept from the original logic)
        if 'htm' in category_id:
            continue

        category_schemas.append((category_id, category_url))

    return category_schemas

In [47]:
def get_web_page_as_df(category_schema, timeout=30):
    '''Scrape one category page into a DataFrame with one row per product card.

    category_schema: (category_id, category_url) pair.
    timeout: seconds passed to ``requests.get`` so a stalled connection
        cannot hang the notebook forever.

    Returns a DataFrame with the parsed text columns listed in
    ``fields_to_parse`` plus:
      * category_id  - the id from ``category_schema``
      * url          - absolute product link, or None when a card has no href
      * availability - text of the first <h2> containing 'Valabilitate',
                       or None when the page has no such heading
      * additional_offers - integer flag: 1 when the card has no text in the
                       div carrying this category's data-category-id, else 0

    Raises requests.HTTPError when the server answers with an error status.
    '''
    category_id, category_url = category_schema

    # making the request and failing fast on an HTTP error
    r = requests.get(category_url, timeout=timeout)
    r.raise_for_status()

    # creating the xml
    soup = BeautifulSoup(r.text, 'lxml')

    # identifying availability: first matching heading, None instead of an
    # IndexError when the page does not contain one
    availability = next(
        (item.text for item in soup.find_all('h2') if 'Valabilitate' in item.text),
        None,
    )

    # identifying products boxes/cards
    products_cards = soup.find_all('a', {"class": "m-offer-tile__link u-button--hover-children"})

    # tag type, filter context and the title that it will have
    fields_to_parse = [('h5', None, 'title'),
                       ('h4', None, 'subtitle'),
                       ('div', {"class" : "m-offer-tile__quantity"}, 'quantity'),
                       ('div', {"class" : "m-offer-tile__basic-price"}, 'price_per_metric'),
                       ('div', {"class" : "m-offer-tile__promo-message"}, 'promo_message'),
                       ('div', {"class" : "a-pricetag__old-price"}, 'old_price'),
                       ('div', {"class" : "a-pricetag__discount"}, 'discount'),
                       ('div', {"class" : "a-pricetag__price"}, 'new_price'),
                       ('div', {"class" : "a-eye-catcher__headline"}, 'eye_catcher'),
                       ('div', {"data-category-id" : category_id}, 'additional_offers')
                       ]

    # materialise every row as a list so the DataFrame is built from concrete
    # values regardless of whether the helper hands back lists or lazy iterables
    all_products = [list(row) for row in get_all_products_details(products_cards, fields_to_parse)]
    columns = [title for _tag, _filter_context, title in fields_to_parse]
    online_promotions_df = pd.DataFrame(data=all_products, columns=columns)

    online_promotions_df['category_id'] = category_id

    # absolute product links; a card without href yields None instead of a TypeError
    product_urls = []
    for link in products_cards:
        href = link.get('href')
        product_urls.append('https://www.kaufland.ro' + href if href else None)
    online_promotions_df['url'] = product_urls

    online_promotions_df['availability'] = availability

    # Convert the scraped text into a 0/1 flag in one step (the mask is taken
    # before the column is overwritten, so there is no ordering dependency).
    # NOTE(review): a card WITH text in this div becomes 0 and a card WITHOUT
    # text becomes 1 - polarity kept exactly as in the original; confirm it is intended.
    has_no_text = online_promotions_df['additional_offers'] == ""
    online_promotions_df['additional_offers'] = has_no_text.astype(int)

    return online_promotions_df

In [51]:
# Collect the (category_id, category_url) pairs from the default offers page.
category_schemas = get_all_categories_schema()

In [52]:
# Scrape every category page and stack the per-category frames into one table.
# ignore_index=True gives the combined frame a single 0..n-1 index instead of
# repeating each category frame's own 0-based labels.
all_online_promotions_df = pd.concat(
    [get_web_page_as_df(category_schema) for category_schema in category_schemas],
    ignore_index=True,
)

In [53]:
# Persist the combined promotions table as CSV, without the index column.
# NOTE(review): 'utf-8 sig' presumably resolves to the BOM-prefixed UTF-8 codec - confirm.
output_csv_path = 'leaflets/pdfs_online_output/RO-43-1000_online.csv'
all_online_promotions_df.to_csv(
    output_csv_path,
    index=False,
    encoding='utf-8 sig',
)