In [1]:
from requests.exceptions import ConnectionError, HTTPError, MissingSchema, ReadTimeout
import logging
from bs4 import BeautifulSoup
import re
from datetime import datetime, timedelta, date
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from enum import Enum, IntEnum

In [352]:
#important! in the pipeline

action_type = "/nowe?page="
start_page = 25
website_url = "https://www.pepper.pl"
articles_to_retrieve = 61



class ScrapWebpage:

    def __init__(self, website_url, action_type, articles_to_retrieve, start_page=1):
        self.website_url = website_url
        self.action_type = action_type
        self.articles_to_retrieve = articles_to_retrieve
        self.start_page = start_page


    def scrap_data(self):

        try:
            url_to_scrap = self.website_url + self.action_type + str(self.start_page)
            driver = webdriver.Chrome('./chromedriver') 
            driver.set_window_size(1400,1000)
            driver.get(url_to_scrap) 
            time.sleep(0.7)
            page = driver.page_source
            soup = BeautifulSoup(page, 'html.parser')
            return soup
        except ConnectionError as e:
            print(f"ConnectionError occured: {e}. \nTry again later")
        except MissingSchema as e:
            print(f"MissingSchema occured: {e}. \nMake sure that protocol indicator is icluded in the website url")
        except HTTPError as e:
            print(f"HTTPError occured: {e}. \nMake sure that website url is valid")
        except ReadTimeout as e:
            print(f"ReadTimeout occured: {e}. \nTry again later")


    def infinite_scroll_handling(self):

        try:
            flag = True
            retrived_articles = list()

            while flag:
                soup = self.scrap_data()
                articles = soup.find_all('article')
                retrived_articles += articles

                if len(retrived_articles) >= self.articles_to_retrieve:
                    flag = False
                    return retrived_articles[:self.articles_to_retrieve]

                self.start_page += 1

        except IndexError as e:
            raise IndexError("There aren't that many articles, try retrieve lower quantity of articles")

        
    
    def get_items_details(self):


        retrived_articles = self.infinite_scroll_handling()

        all_items = list()

        #print(retrived_articles)

        for article in retrived_articles:
            item = list()
            item.append(GetItemId(article).get_data())
            item.append(GetItemName(article).get_data())
            item.append(GetItemDiscountPrice(article).get_data())
            item.append(GetItemPercentageDiscount(article).get_data())
            item.append(GetItemRegularPrice(article).get_data())
            #item.append(GetItemAddedDate(article).get_data())
            item.append(GetItemUrl(article).get_data())
            all_items.append(item)

        return all_items


 

output = ScrapWebpage(website_url, action_type, articles_to_retrieve)

print(output.get_items_details())
#retrived = output.infinite_scroll_handling()
#print(retrived)


  driver = webdriver.Chrome('./chromedriver')


[['690986', 'Papier toaletowy 3 warstwowy - Lidl', 29.99, 'NA', 'NA', 'https://www.pepper.pl/promocje/papier-toaletowy-3-warstwowy-lidl-690986'], ['690985', 'Powerbank Baseus 30000 mAh czarny 20W', 97.9, 'NA', 'NA', 'https://www.pepper.pl/promocje/powerbank-baseus-30000-mah-czarny-20w-690985'], ['690984', 'Czajnik AMICA INOX KM 2012', 99.99, -17.0, 120.2, 'https://www.pepper.pl/promocje/czajnik-amica-inox-km-2012-690984'], ['690983', 'Słuchawki bezprzewodowe QCY T13 ANC wodoodporne', 149.99, 'NA', 'NA', 'https://www.pepper.pl/promocje/sluchawki-bezprzewodowe-qcy-t13-anc-wodoodporne-690983'], ['690981', 'Waga Timemore Black Mirror Basic PRO', 199.0, -26.0, 269.0, 'https://www.pepper.pl/promocje/waga-timemore-black-mirror-basic-pro-690981'], ['690982', 'Smartwchat Garmin Epix 2 PRO 51mm €888,34', 3953.0, -19.0, 4899.0, 'https://www.pepper.pl/promocje/smartwchat-garmin-epix-2-pro-51mm-690982'], ['690980', 'Kupon na 10zl w Pyszne.pl', 'NA', 'NA', 'NA', 'https://www.pepper.pl/kupony/kupon-n

In [353]:
#important! in the pipeline


class GetItemName:

    def __init__(self, article):
        self.article = article
    
    def get_data(self):
        try:
            name = self.article.find_all(attrs={'class': "cept-tt thread-link linkPlain thread-title--list js-thread-title"})
            name = name[0].get_text()
            return name
        except IndexError as e:
            raise IndexError(f"Index out of the range (item_name): {e}")
        except TypeError as e:
            raise TypeError(f"Invalid html class name (item_name): {e}")


class GetItemId:

    def __init__(self, article):
        self.article = article

    def get_data(self):
        try:
            item_id = self.article["id"]
            item_id = item_id.strip('thread_')
            return item_id
        except IndexError as e:
            raise IndexError(f"Index out of the range (item_id): {e}")
        except TypeError as e:
            raise TypeError(f"Invalid html class name (item_id): {e}")


class GetItemDiscountPrice:

    def __init__(self, article):
        self.article = article
    
    def get_data(self):
        try:
            discount_price = self.article.find_all(attrs={'class': "thread-price text--b cept-tp size--all-l size--fromW3-xl"})
            discount_price = float(discount_price[0].get_text().strip('zł').replace('.','').replace(',','.'))
            return discount_price
        except IndexError as e:
            return "NA"
        except ValueError as e:
            return "NA"
        except TypeError as e:
            raise TypeError(f"Invalid html class name (item_discount_price): {e}")



class GetItemRegularPrice:

    def __init__(self, article):
        self.article = article
    
    def get_data(self):
        try:
            regular_price = self.article.find_all(attrs={'class': "mute--text text--lineThrough size--all-l size--fromW3-xl"})
            regular_price = float(regular_price[0].get_text().strip('zł').replace('.','').replace(',','.'))
            return regular_price 
        except IndexError as e:
            return "NA"
        except ValueError as e:
            return "NA"
        except TypeError as e:
            raise TypeError(f"Invalid html class name (item_regular_price): {e}")


class GetItemPercentageDiscount:

    def __init__(self, article):
        self.article = article
    
    def get_data(self):
        try:
            percentage_discount = self.article.find_all(attrs={'class': "space--ml-1 size--all-l size--fromW3-xl"})
            percentage_discount = float(percentage_discount[0].get_text().strip('%'))
            return percentage_discount
        except IndexError as e:
            return "NA"
        except ValueError as e:
            return "NA"
        except TypeError as e:
            raise TypeError(f"Invalid html class name (item_percentage_discount): {e}")


class GetItemUrl:

    def __init__(self, article):
        self.article = article
    
    def get_data(self):
        try:
            item_url = self.article.find_all('a', href=True, text=True)
            item_url = item_url[0]['href']
            return item_url
        except IndexError as e:
            raise IndexError(f"Index out of the range (item_url): {e}")
        except TypeError as e:
            raise TypeError(f"Invalid html class name (item_url): {e}")




In [354]:

action_type = "/nowe?page="
start_page = 25
website_url = "https://www.pepper.pl"
articles_to_retrieve = 61


data = ScrapWebpage(website_url, action_type, articles_to_retrieve)

article = data.infinite_scroll_handling()[60]

output = GetItemAddedDate(article).get_data()
#print(output)

out = GetItemAddedDate(article)
date_tag = out.get_data()


print(date_tag)
#d = out.find_true_date(date_tag)
#print(d)

  driver = webdriver.Chrome('./chromedriver')


[<span class="metaRibbon lbox--v-1 boxAlign-ai--all-c overflow--wrap-off space--l-3 text--color-greyShade"><svg class="icon icon--clock text--color-greyShade space--mr-1" height="22px" width="22px"><use xlink:href="/assets/img/ico_a7847.svg#clock"></use></svg><span>6 g, 7 min</span></span>]


In [355]:
#important! in the pipeline


class Months(Enum):

    sty = '01'
    lut = '02'
    mar = '03'
    kwi = '04'
    maj = '05'
    cze = '06'
    lip = '07'
    sie = '08'
    wrz = '09'
    paz = '10'
    paź = '10'
    lis = '11'
    gru = '12'

    @classmethod
    def to_dict(cls):
        """Returns a dictionary representation of the enum."""
        return {e.name: e.value for e in cls}
    
    @classmethod
    def keys(cls):
        """Returns a list of all the enum keys."""
        return cls._member_names_
    
    @classmethod
    def values(cls):
        """Returns a list of all the enum values."""
        return list(cls._value2member_map_.keys())

class GetItemAddedDate:

    def __init__(self, article):
        self.article = article

    def get_data(self):
        try:
            date_tag = self.article.find_all(attrs={'class': "metaRibbon lbox--v-1 boxAlign-ai--all-c overflow--wrap-off space--l-3 text--color-greyShade"})
            #date = self.find_true_date(date_tag)
            return date_tag
        except IndexError as e:
            raise IndexError(f"Index out of the range (item_url): {e}")
        except TypeError as e:
            raise TypeError(f"Invalid html class name (item_url): {e}")

    def find_true_date(self, date_tag):


        try:
            true_data = self.first_index_date_searching(date_tag)
            print("1")
            return true_data
        except Exception:
            print("bad1")
            try:
                true_data = self.second_index_date_searching(date_tag)
                print("2")
                return true_data
            except Exception:
                print("bad2")
                try:
                    true_data = self.third_index_date_searching(date_tag)
                    print("3")
                    return true_data
                except Exception:
                    print("bad3")


    def data_format_conversion(self, date_string_likely):

        old_dates_data_pattern = "[A-Za-z]+\s\d\d\.\s[0-9]+"

        try:
            if date_string_likely.startswith("Zaktualizowano"):
                date_string_likely = date_string_likely.lstrip("Zaktualizowano ") 
            elif date_string_likely.endswith("Lokalnie"):
                date_string_likely = date_string_likely.rstrip("Lokalnie") 
        except Exception:
            return date_string_likely

        try:
            if date_string_likely.endswith(('min', 'g', 's', 'temu')):
                prepared_data = date.today().strftime("%d-%m-%Y")
                return prepared_data
            elif date_string_likely.startswith(tuple(Months.keys())) and len(date_tag) < 8:      
                if len(date_string_likely[4:]) == 3:
                    day = date_string_likely[4:6]
                else:
                    day = date_string_likely[4:5].zfill(2)
                month = Months.__members__[date_string_likely[0:3]].value
                year = str(date.today().year)
                prepared_data = '-'.join([str(day), month, year])
                return prepared_data
            elif bool(re.search(old_dates_data_pattern, date_string_likely)):
                day = date_string_likely[4:6]
                month = Months.__members__[date_string_likely[0:3]].value
                year = date_string_likely[8:13]
                prepared_data = '-'.join([day, month, year])
                return prepared_data
        except KeyError as e:
            raise KeyError(f"Invalid name of the month {e}")


    def first_index_date_searching(self, date_tag):

        output_data_pattern = "\d{2}[/.-]\d{2}[/.-]\d{4}"

        
        #date = self.data_format_conversion(date_string_likely)

        try:
            date_string_likely = date_tag[0].get_text()
            formatted_data = self.data_format_conversion(date_string_likely)
            if bool(re.search(output_data_pattern, formatted_data)):
                return formatted_data
            else:
                raise Exception
        except Exception as e:
            print(e)

    def second_index_date_searching(self, date_tag):
        
        output_data_pattern = "\d{2}[/.-]\d{2}[/.-]\d{4}"

        
        #date = self.data_format_conversion(date_string_likely)

        try:
            date_string_likely = date_tag[1].get_text()
            formatted_data = self.data_format_conversion(date_string_likely)
            if bool(re.search(output_data_pattern, formatted_data)):
                return formatted_data
            else:
                raise Exception
        except Exception as e:
            print(e)

    def third_index_date_searching(self, date_tag):
        
        output_data_pattern = "\d{2}[/.-]\d{2}[/.-]\d{4}"

        
        #date = self.data_format_conversion(date_string_likely)

        try:
            date_string_likely = date_tag[2].get_text()
            formatted_data = self.data_format_conversion(date_string_likely)
            if bool(re.search(output_data_pattern, formatted_data)):
                return formatted_data
            else:
                raise Exception
        except Exception as e:
            print(e)







        



In [356]:

action_type = "/nowe?page="
start_page = 1
website_url = "https://www.pepper.pl"
articles_to_retrieve = 200


data = ScrapWebpage(website_url, action_type, articles_to_retrieve, start_page)

articles = data.infinite_scroll_handling()


  driver = webdriver.Chrome('./chromedriver')


In [455]:
article = articles[140]
n = 1 
from collections import Counter 


"""pattern1 = 
pattern2 = 
pattern3 = 
pattern4 = """
previous_date = list()
for a in articles:
    out = GetItemAddedDate(a)
    date_tag = out.get_data()
    name = a.find_all('a', {"class":"cept-tt thread-link linkPlain thread-title--list js-thread-title"})[0].get_text()

    spans = a.find_all('div', {"class":"size--all-s flex boxAlign-jc--all-fe boxAlign-ai--all-c flex--grow-1 overflow--hidden"})[0].get_text(strip=True, separator='_').split('_')
    

    to_del = list()
    for i in spans:
        if "/" in i:
            to_del.append(i)
        if i in ["Jutro", "DZISIAJ", "Lokalnie"]:
            to_del.append(i)
        if i.startswith("Wysyłka"):
            to_del.append(i)
    


    counts = Counter(to_del)

    new_data = []
    for x in spans:
        if counts[x]:
            counts[x] -= 1
        else:
            new_data.append(x)
    
    previous_date = spans

    if len(new_data) == 0:
        new_data = previous_date

        


    

    #spans = date_tag[0].find_all("span")
    #print(date_tag[0].span.string)
    #print(date_tag)
    #print(to_del)
    print(name)
    print(new_data)
    #print(f"len: {len(spans)}")
    print(f"number: {n}")
    n += 1




#print(date_tag)

Papier toaletowy 3 warstwowy - Lidl
['4 min']
number: 1
Powerbank Baseus 30000 mAh czarny 20W
['5 min']
number: 2
Czajnik AMICA INOX KM 2012
['5 min']
number: 3
Słuchawki bezprzewodowe QCY T13 ANC wodoodporne
['27/06/2023', '09/07/2023', 'Wysyłka z: Polska']
number: 4
Waga Timemore Black Mirror Basic PRO
['10 min']
number: 5
Smartwchat Garmin Epix 2 PRO 51mm €888,34
['10 min']
number: 6
Kupon na 10zl w Pyszne.pl
['12 min']
number: 7
Dare moments Intermarche
['41 min']
number: 8
Koleo 10zł na start
['46 min']
number: 9
Action kompresor Varo akumulatorowy
['50 min']
number: 10
Podkładki retro winyl 4 sztuki
['1 g, 2 min']
number: 11
Zestaw 7 podkładek pod kubek
['1 g, 2 min']
number: 12
Budżetowy projektor FullHD BlitzWolf BW-V5| $98.99
['Jutro', '30/06/2023', 'Wysyłka z: Chiny']
number: 13
Projektor BlitzWolf BW-V6 1080p $168.99 [Netflix Certified]
['1 g, 3 min']
number: 14
Makita E-10883 - Zestaw Kluczy, Bitów, Narzędzi w walizce - 221 elementów
['1 g, 11 min']
number: 15
Smartfon Sams

In [358]:

#out = GetItemAddedDate(article)
#date_tag = out.get_data()
#print(date_tag)

output_data_pattern = "\d{2}[/.-]\d{2}[/.-]\d{4}"

number = 1

for a in articles: 
    out = GetItemAddedDate(a)
    date_tag = out.get_data()
    spans = date_tag[0].find_all("span")
    print(spans[0].get_text())
    print(spans[1].get_text())
    try:
        print(spans[2].get_text())
    except:
        print("none")
    #print(date_tag)
    #print(date_tag[0].get_text())
    print(number)
    number += 1
    """#date = out.first_index_date_searching(date_tag)

    try:
        if bool(re.search(output_data_pattern, out.first_index_date_searching(date_tag))): 
            print(f"{out.first_index_date_searching(date_tag)} + {number}" )
    except Exception:
        try:
            if bool(re.search(output_data_pattern, out.second_index_date_searching(date_tag))):
                print(f"{out.second_index_date_searching(date_tag)} + {number}" )
        except Exception:
            try: 
                if bool(re.search(output_data_pattern, out.third_index_date_searching(date_tag))):
                    print(f"{out.third_index_date_searching(date_tag)} + {number}" )
            except Exception as e:
                print(e)
    number += 1"""

    #print(f"{out.first_index_date_searching(date_tag)} + {number}" )
    #print(f"{date_tag} + {number}" )
"""
print(out.first_index_date_searching(date_tag))
print(out.second_index_date_searching(date_tag))
print(out.third_index_date_searching(date_tag))
"""

IndexError: list index out of range