In [1]:
from requests.exceptions import ConnectionError, HTTPError, MissingSchema, ReadTimeout
import logging
from bs4 import BeautifulSoup
import re
from datetime import datetime, timedelta, date
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from enum import Enum, IntEnum

In [5]:
#important! in the pipeline

action_type = "/nowe?page="
start_page = 25
end_page = 25
website_url = "https://www.pepper.pl"
articles_to_retrieve = 61



class ScrapWebpage:

    def __init__(self, website_url, action_type, articles_to_retrieve, start_page=1):
        self.website_url = website_url
        self.action_type = action_type
        self.articles_to_retrieve = articles_to_retrieve
        self.start_page = start_page


    def scrap_data(self):

        try:
            url_to_scrap = self.website_url + self.action_type + str(self.start_page)
            driver = webdriver.Chrome('./chromedriver') 
            driver.get(url_to_scrap) 
            time.sleep(0.7)
            page = driver.page_source
            soup = BeautifulSoup(page, 'html.parser')
            return soup
        except ConnectionError as e:
            print(f"ConnectionError occured: {e}. \nTry again later")
        except MissingSchema as e:
            print(f"MissingSchema occured: {e}. \nMake sure that protocol indicator is icluded in the website url")
        except HTTPError as e:
            print(f"HTTPError occured: {e}. \nMake sure that website url is valid")
        except ReadTimeout as e:
            print(f"ReadTimeout occured: {e}. \nTry again later")


    def infinite_scroll_handling(self):

        try:
            flag = True
            retrived_articles = list()

            while flag:
                soup = self.scrap_data()
                articles = soup.find_all('article')
                retrived_articles += articles

                if len(retrived_articles) >= self.articles_to_retrieve:
                    flag = False

                self.start_page += 1

            return retrived_articles[:self.articles_to_retrieve]
        except IndexError as e:
            raise IndexError("There aren't that many articles, try retrieve lower quantity of articles")

        
    
    def get_items_details(self):


        retrived_articles = self.infinite_scroll_handling()

        all_items = list()

        for article in retrived_articles:
            item = list()
            item.append(GetItemId(article).get_data())
            item.append(GetItemName(article).get_data())
            item.append(GetItemDiscountPrice(article).get_data())
            item.append(GetItemPercentageDiscount(article).get_data())
            item.append(GetItemRegularPrice(article).get_data())
            item.append(GetItemAddedDate(article).get_data())
            item.append(GetItemUrl(article).get_data())
            all_items.append(item)

        return all_items


 

output = ScrapWebpage(website_url, action_type, articles_to_retrieve)
#output.get_items_details()
article = output.infinite_scroll_handling()[0]


  driver = webdriver.Chrome('./chromedriver')


In [2]:
#important! in the pipeline


class GetItemName:

    def __init__(self, article):
        self.article = article
    
    def get_data(self):
        try:
            name = self.article.find_all(attrs={'class': "cept-tt thread-link linkPlain thread-title--list js-thread-title"})
            name = name[0].get_text()
            return name
        except IndexError as e:
            raise IndexError(f"Index out of the range (item_name): {e}")
        except TypeError as e:
            raise TypeError(f"Invalid html class name (item_name): {e}")


class GetItemId:

    def __init__(self, article):
        self.article = article

    def get_data(self):
        try:
            item_id = self.article["id"]
            item_id = item_id.strip('thread_')
            return item_id
        except IndexError as e:
            raise IndexError(f"Index out of the range (item_id): {e}")
        except TypeError as e:
            raise TypeError(f"Invalid html class name (item_id): {e}")


class GetItemDiscountPrice:

    def __init__(self, article):
        self.article = article
    
    def get_data(self):
        try:
            discount_price = self.article.find_all(attrs={'class': "thread-price text--b cept-tp size--all-l size--fromW3-xl"})
            discount_price = float(discount_price[0].get_text().strip('zł').replace('.','').replace(',','.'))
            return discount_price
        except IndexError as e:
            return "NA"
        except ValueError as e:
            return "NA"
        except TypeError as e:
            raise TypeError(f"Invalid html class name (item_discount_price): {e}")



class GetItemRegularPrice:

    def __init__(self, article):
        self.article = article
    
    def get_data(self):
        try:
            regular_price = self.article.find_all(attrs={'class': "mute--text text--lineThrough size--all-l size--fromW3-xl"})
            regular_price = float(regular_price[0].get_text().strip('zł').replace('.','').replace(',','.'))
            return regular_price 
        except IndexError as e:
            return "NA"
        except ValueError as e:
            return "NA"
        except TypeError as e:
            raise TypeError(f"Invalid html class name (item_regular_price): {e}")


class GetItemPercentageDiscount:

    def __init__(self, article):
        self.article = article
    
    def get_data(self):
        try:
            percentage_discount = self.article.find_all(attrs={'class': "space--ml-1 size--all-l size--fromW3-xl"})
            percentage_discount = float(percentage_discount[0].get_text().strip('%'))
            return percentage_discount
        except IndexError as e:
            return "NA"
        except ValueError as e:
            return "NA"
        except TypeError as e:
            raise TypeError(f"Invalid html class name (item_percentage_discount): {e}")


class GetItemUrl:

    def __init__(self, article):
        self.article = article
    
    def get_data(self):
        try:
            item_url = self.article.find_all('a', href=True, text=True)
            item_url = item_url[0]['href']
            return item_url
        except IndexError as e:
            raise IndexError(f"Index out of the range (item_url): {e}")
        except TypeError as e:
            raise TypeError(f"Invalid html class name (item_url): {e}")




In [39]:
print(article)

out = GetItemAddedDate(article)
date_tag = out.get_data()

d = out.find_true_date(date_tag)
print(d)

<article class="thread cept-thread-item thread--type-list imgFrame-container--scale thread--deal" data-handler="history thread-click" data-history='{"endpoint":"https://www.pepper.pl/nowe","replace":true,"data":{"scrollTo":"#thread_690332","offset":70,"scrollContainer":"#main"},"events":["history","click"],"delegate":true}' data-ocular='{"thread_ids":690332}' data-t="thread" data-t-d='{"id":690332}' data-t-view="" data-t-view-twig="" id="thread_690332"><div class="threadGrid thread-clickRoot"><div class="threadGrid-image space--r-3"><span class="imgFrame imgFrame--darken imgFrame--noBorder thread-listImgCell"><img alt="[ ebook ] Trylogia 4MK - J.D. Barker - Czwarta Małpa, Piąta Ofiara, Szóste dziecko @ Woblink" class="thread-image width--all-auto height--all-auto imgFrame-img" height="300" loading="lazy" src="https://static.pepper.pl/threads/raw/E4CIG/690332_1/re/300x300/qt/60/690332_1.jpg" width="300"/></span></div><div class="threadGrid-headerMeta"><div class="flex boxAlign-ai--all-c

IndexError: list index out of range

In [38]:
#important! in the pipeline


class Months(Enum):

    sty = '01'
    lut = '02'
    mar = '03'
    kwi = '04'
    maj = '05'
    cze = '06'
    lip = '07'
    sie = '08'
    wrz = '09'
    paz = '10'
    paź = '10'
    lis = '11'
    gru = '12'

    @classmethod
    def to_dict(cls):
        """Returns a dictionary representation of the enum."""
        return {e.name: e.value for e in cls}
    
    @classmethod
    def keys(cls):
        """Returns a list of all the enum keys."""
        return cls._member_names_
    
    @classmethod
    def values(cls):
        """Returns a list of all the enum values."""
        return list(cls._value2member_map_.keys())

class GetItemAddedDate:

    def __init__(self, article):
        self.article = article

    def get_data(self):
        try:
            date_tag = self.article.find_all(attrs={'class': "hide--fromW3"})
            #date = self.find_true_date(date_tag)
            return date_tag
        except IndexError as e:
            raise IndexError(f"Index out of the range (item_url): {e}")
        except TypeError as e:
            raise TypeError(f"Invalid html class name (item_url): {e}")

    def find_true_date(self, date_tag):

        flag = False

        print(self.first_index_date_searching(date_tag))
        print(self.second_index_date_searching(date_tag))
        print(self.third_index_date_searching(date_tag))

        """try:
            self.first_index_date_searching()
            if flag == True:
                return date
            elif flag == False:
                self.second_index_date_searching()
                if flag == True:
                    return date
                elif flag == False:
                    self.third_index_date_searching()
                    if flag == True:
                        return date
                    else:
                        print('another error')

        except:
            print("error1")"""

    def data_format_conversion(self, date_tag):

        old_dates_data_pattern = "[A-Za-z]+\s\d\d\.\s[0-9]+"

        try:
            if date_tag.endswith(('min', 'g', 's', 'temu')):
                prepared_data = date.today().strftime("%d-%m-%Y")
                return prepared_data
            elif date_tag.startswith(tuple(Months.keys())) and len(date_tag) < 8:      
                if len(scraped_publication_date[4:]) == 3:
                    day = scraped_publication_date[4:6]
                else:
                    day = scraped_publication_date[4:5].zfill(2)
                month = Months.__members__[date_tag[0:3]].value
                year = str(date.today().year)
                prepared_data = '-'.join([str(day), month, year])
                return prepared_data
            elif bool(re.search(old_dates_data_pattern, date_tag)):
                day = date_tag[4:6]
                month = Months.__members__[date_tag[0:3]].value
                year = date_tag[8:13]
                prepared_data = '-'.join([day, month, year])
                return prepared_data
        except KeyError as e:
            raise KeyError(f"Invalid name of the month {e}")


    def first_index_date_searching(self, date_tag):

        output_data_pattern = "\d{2}[/.-]\d{2}[/.-]\d{4}"

        date_tag = date_tag[0].get_text()
        date = self.data_format_conversion(date_tag)

        flag = True
        try:
            formatted_data = self.data_format_conversion(date_tag)
            if bool(re.search(output_data_pattern, formatted_data)):
                return f"{self.data_format_conversion(date_tag)} + ' string 1 success'", flag
        except Exception as e:
            flag = False
            return flag
            print(e)

    def second_index_date_searching(self, date_tag):
        
        output_data_pattern = "\d{2}[/.-]\d{2}[/.-]\d{4}"

        date_tag = date_tag[1].get_text()
        date = self.data_format_conversion(date_tag)

        flag = True
        try:
            formatted_data = self.data_format_conversion(date_tag)
            if bool(re.search(output_data_pattern, formatted_data)):
                return f"{self.data_format_conversion(date_tag)} + ' string 2 success'", flag
        except Exception as e:
            flag = False
            return flag
            print(e)

    def third_index_date_searching(self, date_tag):
        
        output_data_pattern = "\d{2}[/.-]\d{2}[/.-]\d{4}"

        date_tag = date_tag[2].get_text()
        date = self.data_format_conversion(date_tag)

        flag = True
        try:
            formatted_data = self.data_format_conversion(date_tag)
            if bool(re.search(output_data_pattern, formatted_data)):
                return f"{self.data_format_conversion(date_tag)} + ' string 3 success'", flag
        except Exception as e:
            flag = False
            return flag
            print(e)







        

