# Web scraping from https://50style.pl/
### Articles and prices from each subpage

In [1]:
import random
from datetime import datetime
from time import time, sleep
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Pool of free proxy addresses; the scraper picks one entry at random for each
# product-page request.
# NOTE(review): every mapping only carries an "http" key and a bare IP (no scheme
# or port). requests selects a proxy by the URL scheme, so requests to https://
# URLs presumably bypass these entries - confirm before relying on them.
proxies = [
    {"http": address}
    for address in (
        "95.178.108.89",
        "95.178.108.189",
        "80.48.119.28",
        "192.166.255.200",
        "89.174.108.158",
        "193.106.231.145",
        "91.222.147.56",
        "77.65.163.170",
        "146.0.32.208",
        "144.76.116.242",
        "185.113.6.254",
        "176.9.75.42",
        "77.235.161.156",
        "173.249.39.131",
        "194.233.69.90",
    )
]

In [3]:
# Browser-like HTTP headers sent with every request the scraper makes, so the
# requests do not go out with the library's default header set.
headers = {
    "Accept-Encoding": "gzip, deflate, sdch",
    "Accept-Language": "en-US,en;q=0.8",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64)",
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,*/*;q=0.8"
    ),
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
}

In [9]:
def _parse_product_page(product_soup):
    '''Return {'Article', 'Price', 'RRP'} scraped from a product page, or None if any field is missing.'''
    try:
        index = product_soup.find(attrs = {'class': "m-accordion_productCode"}).text
        price = product_soup.find(attrs = {'class': "price-new"}).find(attrs = {'class': "price-value"}).text
        rrp = product_soup.find(attrs = {'class': "price-old"}).find(attrs = {'class': "price-value"}).text
    except AttributeError:
        # find() returned None for one of the elements: the page does not carry
        # the full code / current price / old price triple, so it is skipped.
        return None
    return {'Article' : index, 'Price' : price, 'RRP' : rrp}


def _last_pagination_label(listing_soup):
    '''Return the cleaned text of the last <span> in the pagination block, or '' when there is none.'''
    pagination = listing_soup.find(class_='m-pagination')
    if pagination is None:
        return ''
    spans = pagination.find_all('span')
    if not spans:
        return ''
    # The 'z ' prefix is stripped from the label (presumably Polish "of", as in
    # "z 7" - confirm against the live markup).
    return spans[-1].text.replace('z ',"").strip()


def scrap_50style(main_url = "https://50style.pl/", subpages = ['meskie','damskie', 'dzieciece'],
                  max_pages = 99, delay = 1):
    '''
    Scrape article codes and prices from the listing subpages of a 50style shop.

    For every subpage the paginated listing is walked page by page; each product
    link found on a listing page is opened and its product code, current price
    and old price are collected. Products whose page lacks any of these three
    elements are skipped, as are products whose page cannot be downloaded.

    main_url  = main url address without any subpages
    subpages  = list of subpages, ie = ['meskie','damskie', 'dzieciece'], to get 'https://50style.pl/dzieciece'
                (the default list is only read, never modified)
    max_pages = highest listing page number requested per subpage
    delay     = pause in seconds after every product request and every listing page

    Returns a pandas DataFrame with the columns "Article", "Price" and "RRP",
    holding the text scraped from the pages (empty if nothing was found).

    Errors raised while requesting a listing page are not caught and propagate
    to the caller.
    '''
    start = time() # start time

    cookies = dict(cookies_are='working')

    # Rows are gathered in a plain list and turned into a DataFrame once at the
    # end: DataFrame.append no longer exists in pandas 2.x and copied the whole
    # frame on every call.
    rows = []

    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as session:
        session.max_redirects = 60

        # iteration through each subpage
        for subpage in subpages:
            for n in range(1, max_pages + 1):

                # iterate through each number of subpage
                url = main_url + str(subpage) + '?sort=default&limit=180&page=' + str(n)

                # get whole structure of data, and avoid getting blocked, by add headers parameter
                page = session.get(url, headers = headers, cookies = cookies)

                # An explicit parser keeps results the same on every machine and
                # avoids bs4's "no parser was explicitly specified" warning.
                soup = BeautifulSoup(page.content, 'html.parser')

                # iterate through all products on one page
                for product in soup.find_all(attrs = {'class': 'b-itemList_name'}):
                    anchor = product.find('a')
                    if anchor is None or not anchor.get('href'):
                        # no usable link in this listing entry
                        continue

                    # Resolve the href against main_url so a custom main_url is honoured.
                    link = urljoin(main_url, anchor['href'])

                    # choose random proxy
                    # NOTE(review): the module-level proxy mappings only define an
                    # "http" key, so for https:// links requests presumably does
                    # not route through them - confirm.
                    proxy = random.choice(proxies)

                    try:
                        page1 = session.get(link, headers = headers, proxies = proxy,
                                            timeout = 98, cookies = cookies)
                    except requests.RequestException as err:
                        # One unreachable product must not discard everything
                        # collected so far; report it and move on.
                        print("skipping product, request failed: ", link, err)
                    else:
                        row = _parse_product_page(BeautifulSoup(page1.content, 'html.parser'))
                        if row is not None:
                            rows.append(row)

                    # Pause after every product request, including skipped ones,
                    # to avoid getting blocked.
                    sleep(delay)

                # check if there is another subpage: an empty label (or a missing
                # pagination block) means there is nothing further to request
                if _last_pagination_label(soup) == '':
                    break

                print("number of subpage: ", subpage, n)
                # sleep between listing pages to avoid block
                sleep(delay)

    end = time()
    print(round(end - start,2), 'calculation time in sec')

    return pd.DataFrame(rows, columns = ["Article", "Price", "RRP"])

In [10]:
# Restrict the scrape to selected listing subpages; each entry is appended to
# the shop's main URL (here: https://50style.pl/buty).
groups = ["buty"]

In [11]:
# Run the scraper for the selected subpages only. Every product page is requested
# individually with a pause in between, so this cell takes a long time to finish.
data = scrap_50style(subpages=groups)

number of subpage:  buty 1
number of subpage:  buty 2
number of subpage:  buty 3
number of subpage:  buty 4
number of subpage:  buty 5
number of subpage:  buty 6
number of subpage:  buty 7
4493.56 calculation time in sec


In [12]:
# Display the scraped price list (columns: Article, Price, RRP).
data

Unnamed: 0,Article,Price,RRP
0,FX9323,34399,39999
1,UMJKO220003,12039,15999
2,UPJIO220002,8599,15999
3,UPJIO220003,8599,17999
4,UPWO221003,13759,25999
...,...,...,...
1262,81546U-C44,7739,19999
1263,T3452,4999,19999
1264,81553U-JC8,7999,19999
1265,T3428,4999,19999


In [13]:
# Save the scraped table to a CSV file named after the current date
# (50style_YYYY-MM-DD.csv), without writing the row index.
today = datetime.today().strftime('%Y-%m-%d')
data.to_csv(f"50style_{today}.csv", index=False)