In [42]:
'''importing required modules here'''
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
from price_parser import Price


class HipStore:

    '''Scraper for the footwear section of the hipstore website.

    The crawl runs in four steps: collect the footwear category links from
    the start page, walk every result page of each category to collect the
    product links, parse each product page into a dict, and finally write
    all dicts to a CSV file.
    '''

    # Seconds to wait for a response before giving up; without a timeout
    # `requests` may block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self,url,headers,cookies,auto_crawl=True):

        '''Store the request settings and, by default, start crawling.

        url        -- start page of the site; also the base for relative links
        headers    -- HTTP headers sent with every request
        cookies    -- cookies sent with every request
        auto_crawl -- when True (the default) the whole crawl runs right
                      away; pass False to build the object without
                      touching the network
        '''

        self.url = url
        self.headers = headers
        self.cookies = cookies
        self.product_urls = []
        self.product_dic = {}
        self.product_list = []
        if auto_crawl:
            self.crawl()

    def _fetch_soup(self,url,parser):

        '''Download `url` with the stored headers/cookies and return it
        parsed by BeautifulSoup using the given `parser` name.'''

        response = requests.get(
            url,
            headers=self.headers,
            cookies=self.cookies,
            timeout=self.REQUEST_TIMEOUT,
        )
        return BeautifulSoup(response.text,parser)

    @staticmethod
    def _split_sizes(sizes):

        '''Split size labels such as "UK 7.5" into numbers and countries.

        sizes -- iterable of size label strings

        Returns a tuple `(size_numbers, size_country)` of comma-joined
        strings: the first numeric part of every label, and the distinct
        alphabetic parts in first-seen order. A label that lacks one of
        the two parts is skipped for that part only.
        '''

        numbers = []
        countries = []
        for size in sizes:
            number = re.search(r'\d\d?\.?\d?\d?',size)
            if number is not None:
                numbers.append(number.group())
            # [A-Za-z] on purpose: the range A-z would also match [ \ ] ^ _ `
            country = re.search(r'[A-Za-z]+',size)
            if country is not None:
                countries.append(country.group())
        # dict.fromkeys removes duplicates while keeping a stable order
        return ','.join(numbers),','.join(dict.fromkeys(countries))

    def start_requst(self):

        '''Send the initial request to the website and return the footwear
        category links as absolute urls (duplicates removed, page order
        kept).'''

        soup = self._fetch_soup(self.url,'html.parser')
        links = soup.find_all('a',href=re.compile(r'w?o?mens?/footwear/'))
        category_links = [urljoin(self.url,link['href']) for link in links]
        return list(dict.fromkeys(category_links))

    def get_product_urls(self,category_urls):

        '''Collect the product urls from every page of every category.

        category_urls -- iterable of category page urls

        Each category is followed through its "Next Page" links. Every
        product url found is appended to `self.product_urls`; the return
        value is that list with duplicates removed, in first-seen order.
        '''

        # pages already fetched, so duplicate category links or a
        # "next" link pointing backwards cannot cause repeated crawling
        visited_pages = set()

        for category_url in category_urls:
            page_url = category_url
            while page_url is not None and page_url not in visited_pages:
                visited_pages.add(page_url)
                soup = self._fetch_soup(page_url,'lxml')

                # href=True skips anchors that carry the class but no link
                self.product_urls.extend(
                    urljoin(self.url,link['href'])
                    for link in soup.find_all('a',class_='itemImage',href=True)
                )

                # follow the pagination link, if the page has one
                next_button = soup.find('a',{'rel':'next','title':'Next Page'})
                if next_button is None or not next_button.get('href'):
                    page_url = None
                else:
                    print(f'paginating..to this page{next_button["href"]}')
                    page_url = urljoin(self.url,next_button['href'])

        return list(dict.fromkeys(self.product_urls))

    def parse_details(self, url_list):

        '''Parse the product pages and store one dict per product.

        url_list -- iterable of product page urls

        For every url the brand, name, price, currency symbol, sizes,
        size countries and colors are read from the page and appended as
        a dict to `self.product_list`. A field that is missing from the
        page is stored as an empty string.
        '''

        for url in url_list:

            soup = self._fetch_soup(url,'html.parser')

            # A missing element makes soup.find()/.h1/.span return None, so
            # the attribute access below raises AttributeError.
            try:
                brand = soup.find('div',class_='pdp-title').h1.text.strip()
            except AttributeError:
                brand = ''
            try:
                name = soup.find('div',class_='pdp-title').h2.text.strip()
            except AttributeError:
                name = ''
            try:
                price_text = soup.find('div',class_='itemPrices').span.text.strip()
                parsed_price = Price.fromstring(price_text)
                symbol = parsed_price.currency      # currency symbol
                price = parsed_price.amount_float   # price value as float
            except Exception:
                # best effort: any parsing problem leaves the price blank,
                # but Ctrl-C / SystemExit are no longer swallowed
                price = ''
                symbol = ''
            try:
                colors = ','.join([li.text.strip() for li in soup.find('ul',{'class':'productSelectDropDown'}).find_all('li')])
            except AttributeError:
                colors = ''
            try:
                sizes = [button.text.strip() for button in soup.find('div',{
                    'class':'productSelectDropDown options'}).find_all('button')]
            except AttributeError:
                sizes = ''
            size_numbers,size_country = self._split_sizes(sizes)

            # one record per product; keys become the CSV column names
            dic = {'product_url':url,
                   'product_brand':brand,
                   'product_name':name,
                   'product_price':price,
                   'currency_symbol':symbol,
                   'product_sizes':size_numbers,
                   'size_country':size_country,
                   'product_colors':colors,
            }

            # appending all the products detail to list
            self.product_list.append(dic)

            print('--'*50)
            print(' '*200)
            print(f'{url}\n{name}\n{price}\n{brand}\n{sizes}\n{colors}')
            print(dic)
            print(' '*200)
            print('--'*50)

    def save_csv(self,path='hipstore.csv'):

        '''Write all collected product dicts to a CSV file.

        path -- target file, defaults to 'hipstore.csv'
        '''

        pd.DataFrame(self.product_list).to_csv(path,index=False)

    def crawl(self):

        '''Run the whole crawl: category links -> product links ->
        product details -> CSV file.'''

        category_urls = self.start_requst()
        product_urls = self.get_product_urls(category_urls)
        self.parse_details(product_urls)
        self.save_csv()
        

  





        

In [50]:
# Script entry point: define the target site, the cookies and the headers,
# then build the scraper. HipStore.__init__ calls crawl(), so creating the
# object performs the whole crawl.
# NOTE(review): code in the class cell may refer to `headers` / `cookies`
# as module-level names -- verify before renaming or moving them.
if __name__=="__main__":
    
    
    
    # start page of the site; also used as the base for relative links
    url = 'https://www.thehipstore.co.uk/'
    # Cookies sent with every request.
    # NOTE(review): these look like values captured from a browser session
    # (consent and analytics cookies); they may be stale -- confirm which
    # ones the site really needs.
    cookies = {
        '49746': '',
        '__rtbh.lid': '%7B%22eventType%22%3A%22lid%22%2C%22id%22%3A%22qM7042gQOSYJhZCReLNe%22%7D',
        'language': 'en',
        'gdprsettings2': '{"functional":true,"performance":true,"targeting":true}',
        'gdprsettings3': '{"functional":true,"performance":true,"targeting":true}',
        '_gcl_au': '1.1.1188987379.1681101388',
        '_tt_enable_cookie': '1',
        '_ttp': 'RADpLp1b4Zrhg-2NIh02BjM3StG',
        '_fbp': 'fb.2.1681101388835.1026343447',
        'mt.v': '2.1664255124.1681101388996',
        '__pr.1psq': 'h2h5OHFGVJ',
        '_taggstar_vid': '4d767c72-d759-11ed-aebc-f13604e20454',
        '_gid': 'GA1.3.2027253970.1681201619',
        '_uetsid': 'a0ffb8e0d84211ed8e097fdb54efd665',
        '_uetvid': '42144b80d75911ed9cc3b733819a4335',
        '_ga': 'GA1.3.857185955.1681101377',
        'cto_bundle': 'fvhHKl9ZYTF6JTJCYTVzOUdmclNJeDhUUHF2M2NPS05lTHhPbDN5UVVqVTNJVFl3ZTQwZU1XcSUyQjZlektOREtQUCUyRjNNJTJGVDZlT3N5SWFTVjNZMEREdjdocTd2MEFoTTRqVVA2cTFldlk5VU52cFZqZE5ReTVwdVR4aHdxOW5JOVUzYlZIak92VjhUbzJyZ2ZQRFY2WnljTmVrQUNaV3R3Tmk0UnpBZXRRdE9XbVBXanhkQSUzRA',
        'redwp': '_',
        'hero-session-94356483-975d-44d9-a4cb-2cd3f7e7e0f9': 'author=client&expires=1712737627393&visitor=600ca114-8a3d-4797-90a3-f2fbd645511d',
        '_ga_MKDXQSME2D': 'GS1.1.1681201619.2.0.1681201627.0.0.0',
        'akavpau_VP1': '1681201948~id=fb3bcd0ad055094e4e33d3c51f3dad00',
    }

    # Request headers; the user-agent and sec-ch-ua values describe
    # Chrome 111 on Windows.
    headers = {
        'authority': 'www.thehipstore.co.uk',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'en-US,en;q=0.9,ta;q=0.8,hi;q=0.7',
        'cache-control': 'no-cache',
        # no 'cookie' header here: the cookies are passed separately through
        # the `cookies` dict above
        'dnt': '1',
        'pragma': 'no-cache',
        'referer': 'https://www.reddit.com/',
        'sec-ch-ua': '"Google Chrome";v="111", "Not(A:Brand";v="8", "Chromium";v="111"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
    }
    # constructing the scraper starts the crawl (see HipStore.__init__)
    hipstore = HipStore(url,headers,cookies)
    