### Web scraping from 4f.com

Let's download the prices and article codes of selected products from the 4F.com website.

In [1]:
from bs4 import BeautifulSoup
import requests 
import pandas as pd
from time import time, sleep
from datetime import datetime
import random

In [2]:
# Browser-like request headers sent with every request so the shop serves the
# regular HTML pages to the scraper.
headers = {
    # Only advertise encodings the HTTP client can decode itself; 'sdch' was
    # dropped because a response compressed that way could not be parsed.
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-US,en;q=0.8',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
}

In [3]:
# Pool of free proxies; one entry is picked at random for each product request.
# NOTE(review): the addresses carry no scheme or port and only the "http" key
# is set, while the default main_url is https:// - confirm these are actually used.
proxies = [{"http": address} for address in (
    "95.178.108.89",
    "95.178.108.189",
    "80.48.119.28",
    "192.166.255.200",
    "89.174.108.158",
    "193.106.231.145",
    "91.222.147.56",
    "77.65.163.170",
    "146.0.32.208",
    "144.76.116.242",
    "185.113.6.254",
    "176.9.75.42",
    "77.235.161.156",
    "173.249.39.131",
    "194.233.69.90",
)]

In [6]:
def scrap_4F(main_url = "https://4f.com.pl/", subpages = ['kobiety','mezczyzni', 'chlopcy'], max_pages = 99):
    '''
    Scrape product symbols and prices from the shop's category listings.

    For every category in `subpages` the listing pages are walked one by one
    (120 products per page) until a page has no "next page" element or
    `max_pages` is reached. Every product link found on a listing page is
    opened and its product symbol and price are read.

    Uses the notebook-level `headers` and `proxies` objects.

    main_url  = main url address without any subpages; a missing trailing
                slash is added automatically
    subpages  = list of subpages, ie = ['kobiety','mezczyzni', 'chlopcy'],
                to get "https://4f.com.pl/mezczyzni/"
    max_pages = highest listing page number requested per subpage

    Returns a DataFrame with the columns "Article" (product symbol) and
    "Price" (float). Products whose page cannot be downloaded or parsed are
    skipped. A failed listing-page request still raises.
    '''
    start = time() # start time

    # name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed
    parser = "html.parser"

    # tolerate a main_url given without the trailing slash
    base_url = main_url if main_url.endswith('/') else main_url + '/'

    cookies = dict(cookies_are='working')

    # rows are collected here and turned into a DataFrame once at the end
    records = []

    # the context manager closes the pooled connections when scraping ends
    with requests.Session() as session:
        session.max_redirects = 60

        # iteration through each subpage
        for subpage in subpages:
            for n in range(1, max_pages + 1):

                # iterate through each number of subpage
                url = base_url + str(subpage) + '/strona-{}.html?product_list_limit=120'.format(n)

                # get whole structure of data, and avoid getting blocked, by add headers parameter;
                # the timeout keeps a stalled connection from hanging the run
                page = session.get(url, headers = headers, cookies = cookies, timeout = 90)

                # let parse our web page
                soup = BeautifulSoup(page.content, parser)

                # iterate through all products on one page
                for product in soup.find_all(attrs = {'class': 'product-item-link'}):
                    link = product.get('href')
                    if not link:
                        # element without a target: nothing to request
                        continue

                    # choose random proxy
                    proxy = random.choice(proxies)

                    try:
                        page1 = session.get(link, headers = headers, proxies = proxy, timeout = 90, cookies = cookies)
                    except requests.RequestException as error:
                        # one unreachable product must not discard everything collected so far
                        print("skipping product page:", link, error)
                    else:
                        soup1 = BeautifulSoup(page1.content, parser)
                        try:
                            index = soup1.find(attrs ={'class' : "description technical"}).find(text= 'Symbol produktu:').previous_element.next_sibling.next_sibling.text
                            price = soup1.find(attrs ={'class' : "product-info-price"}).find_all(attrs = {"class": "price"})[-1].string.replace('\xa0PLN', "").replace("\xa0","")
                            price = float(price.replace(",","."))
                        except (AttributeError, IndexError, ValueError):
                            # page lacks the expected symbol/price markup: skip this product
                            pass
                        else:
                            records.append({'Article' : index,
                                            'Price' : price})

                    # sleep for one second after each requested link to avoid block
                    # (also after a failed or unparsable page)
                    sleep(1)

                # check if there is another subpage
                is_next_page = soup.find(attrs = {'class' :  "item pages-item-next"})

                if is_next_page is None:
                    break

                print("number of subpage: ", subpage, n)
                # sleep for one second to avoid block
                sleep(1)

    pricelist = pd.DataFrame(records, columns = ["Article", "Price"])

    end = time()
    print(round(end - start,2), 'calculation time in sec')

    return pricelist

In [7]:
# Run the scraper with its default arguments and keep the returned DataFrame in `data`.
data  = scrap_4F()

number of subpage:  kobiety 1
number of subpage:  kobiety 2
number of subpage:  kobiety 3
number of subpage:  kobiety 4
number of subpage:  kobiety 5
number of subpage:  kobiety 6
number of subpage:  kobiety 7
number of subpage:  kobiety 8
number of subpage:  kobiety 9
number of subpage:  kobiety 10
number of subpage:  kobiety 11
number of subpage:  kobiety 12
number of subpage:  kobiety 13
number of subpage:  kobiety 14
number of subpage:  kobiety 15
number of subpage:  kobiety 16
number of subpage:  kobiety 17
number of subpage:  kobiety 18
number of subpage:  mezczyzni 1
number of subpage:  mezczyzni 2
number of subpage:  mezczyzni 3
number of subpage:  mezczyzni 4
number of subpage:  mezczyzni 5
number of subpage:  mezczyzni 6
number of subpage:  mezczyzni 7
number of subpage:  mezczyzni 8
number of subpage:  mezczyzni 9
number of subpage:  mezczyzni 10
number of subpage:  mezczyzni 11
number of subpage:  mezczyzni 12
number of subpage:  mezczyzni 13
number of subpage:  mezczyzni 1

In [8]:
# Show the scraped table as this cell's output.
data

Unnamed: 0,Article,Price
0,D4Z21-KUDP201-20S,599.99
1,D4Z21-KUDP201-28S,599.99
2,4FAW21TLONF040-20S,129.99
3,4FAW21TTIGF027-20S,149.99
4,D4Z21-KUDP202-43S,499.99
...,...,...
4836,J4Z20-JSPMN400B-36S,139.99
4837,J4Z20-JKUMN400-36S,129.99
4838,J4Z20-JKSM400-36S,139.99
4839,H4Z20-PCU017-27M,39.99


In [9]:
# Stamp the export with the current date (YYYY-MM-DD) and write it without the index column.
today = f"{datetime.today():%Y-%m-%d}"
data.to_csv(f"4F_{today}.csv", index=False)