In [1]:
# === Set up environment ===
import requests
from bs4 import BeautifulSoup
import pandas as pd
import urllib3
urllib3.disable_warnings()

import time
from datetime import datetime

import json

In [2]:
# Target site and request configuration

# Root of the site; relative hrefs scraped from the pages are appended to this.
baseurl = 'https://www.uniqlo.com'

# Browser-like headers sent with every request. Per the original author's note,
# listing the exact browser version in the User-Agent is important for
# requests.get to succeed against this site.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
    "Upgrade-Insecure-Requests": "1",
    "DNT": "1",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
}

# One t-shirt listing page per population group.
tshirts_webpage_women = 'https://www.uniqlo.com/us/en/women/tops/t-shirts'
tshirts_webpage_men = 'https://www.uniqlo.com/us/en/men/tops/t-shirts'
tshirts_webpage_kids = 'https://www.uniqlo.com/us/en/kids/tops/t-shirts'

# Listing pages in the order they are scraped: women, men, kids.
tshirts_webpage_list = [
    tshirts_webpage_women,
    tshirts_webpage_men,
    tshirts_webpage_kids,
]



In [9]:
# ==== Data definition ====

# Template record describing a single t-shirt. The values below are only
# placeholders that document each field; the real values are filled in for
# every product during scraping. The key order defined here is the key order
# of the JSON objects written to the output file.
tshirt_info = dict(
    group='women/men/kids',          # target population group
    name='name',                     # product name of the t-shirt
    link='link',                     # link to the product's main page
    price='price',                   # retail price
    sizes='sizes',                   # available sizes of this t-shirt
    rating='avg_rating',             # average rating value
    review_count='review_count',     # number of reviews/ratings
    materials='materials',           # material composition
    care_advice='care_advice',       # care advice for this t-shirt
    origin='origin',                 # imported or domestic
    IsNew='',                        # text of the product's badge on the listing page
    colors=[],                       # list of available colors
    color_sizes=[],                  # size range of each available color
    thumb_links=[],                  # thumbnail image link of each available color
    category='category',             # style category of this t-shirt
    category_link='category_link',   # link to the style category's page (similar goods)
    summary='summary',               # feature summary from the merchant
    details=[],                      # list of detailed descriptions from the merchant
    scrape_time='scrape_time',       # time at which scraping of this product finished
)

# ==== Scraping data ====
#
# Walks three levels:
#   1. one listing page per population group (women, men, kids),
#   2. each style subcategory section on that page (Essential, Fashion, ...),
#   3. each product tile in the section, plus that product's own main page.
# Every product becomes one JSON object appended to OUTPUT_PATH, one object per
# line (JSON Lines), so the file can be read back with
# pd.read_json(OUTPUT_PATH, lines=True). The same records are also kept in
# `tshirt_records`.
#
# NOTE: the output file is opened in append mode, so re-running this cell adds
# to whatever an earlier run already wrote.

REQUEST_TIMEOUT = 30                       # seconds before a request is abandoned
OUTPUT_PATH = 'tshirts_info_uniqlo.json'   # appended to, one JSON object per line
PAUSE_SECONDS = 2                          # pause between product-page requests
UNKNOWN = 'unknown'                        # value stored when a field cannot be found


def _node_text(node, default=UNKNOWN):
    """Return the raw text of a parsed element, or `default` when `node` is None.

    BeautifulSoup's find() returns None when nothing matches; this keeps one
    missing element from aborting the whole run.
    """
    return default if node is None else node.get_text()


def _line_at(text, index, default=UNKNOWN):
    """Return line `index` (0-based, split on '\\n') of `text`, or `default` if absent."""
    lines = text.split('\n')
    return lines[index] if index < len(lines) else default


def _fetch_page(url):
    """Download `url` and return the response, or None when the request fails.

    A timeout is always applied so that one stalled connection cannot hang
    the scrape.
    """
    try:
        # NOTE(review): verify=False disables TLS certificate verification
        # (kept from the original code); confirm it is really required here.
        return requests.get(url, verify=False, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as error:
        print('request failed for ' + url + ': ' + str(error))
        return None


# Snapshot of the template so every product starts from a fresh dict that keeps
# the template's key order. Re-using one shared dict makes every element of a
# list of records alias the same (last) product.
record_template = dict(tshirt_info)
tshirt_records = []  # one dict per scraped product, in scrape order

# 1st Level: scrape data separately for each population group: women, men, kids
for webpage in tshirts_webpage_list:
    tshirt_group_list = _fetch_page(webpage)
    if tshirt_group_list is None:
        continue
    print(tshirt_group_list.status_code)
    # 'https://www.uniqlo.com/us/en/<group>/tops/t-shirts'.split('/') puts <group> at index 5
    population_group = webpage.split('/')[5]

    soup = BeautifulSoup(tshirt_group_list.content, 'html.parser')

    # 2nd Level: t-shirt subcategory (Essential, Fashion ...): name, item count and link
    section_list = soup.find_all('div', class_ = 'subcategory-section row')

    for section in section_list:
        category_anchor = section.find("a", href = True)
        if category_anchor is None:
            category_name = UNKNOWN
            category_link = UNKNOWN
        else:
            # the category title sits on the second line of the anchor's text
            category_name = _line_at(category_anchor.get_text(), 1)
            category_link = baseurl + category_anchor['href']
        product_list = section.find_all("div", class_ = "product-tile")
        subcat_number = len(product_list)
        print(category_name + ': ', subcat_number)

        # 3rd Level: each individual product under the current subcategory
        for product in product_list:
            product_anchor = product.find("a", class_ = "link")
            if product_anchor is None or product_anchor.get('href') is None:
                # without a link there is no product page to visit
                print('skipping a product tile without a link')
                continue

            tshirt_info = dict(record_template)

            # basic information of the current product
            tshirt_info['group'] = population_group
            tshirt_info['name'] = product_anchor.get_text()
            tshirt_info['link'] = baseurl + product_anchor['href']
            # badge text is on the second line of the span; '' when there is no badge
            tshirt_info['IsNew'] = _line_at(
                _node_text(product.find("span", class_ = "rightBadge"), ''), 1, ''
            ).strip()
            tshirt_info['sizes'] = _node_text(product.find("span", class_ = "swatch-size-values"))
            # price is on the fourth line of the span's text
            tshirt_info['price'] = _line_at(
                _node_text(product.find("span", class_ = "value"), ''), 3
            ).strip()
            tshirt_info['category'] = category_name
            tshirt_info['category_link'] = category_link

            # collect names and thumb-image links of all the available colors of the current product
            color_names = []  # names of available colors
            color_sizes = []  # specific size range of available colors
            thumb_links = []  # links of relevant thumb images of available colors

            for product_color in product.find_all('img'):
                swatch_data = product_color.get('data-medium-img')
                # the default display image carries no 'data-medium-img' attribute and is skipped
                if swatch_data is None:
                    continue
                # the attribute is a quoted key/value string: quoted field 3 is the
                # thumbnail link, quoted field 7 is a comma-separated label whose
                # second part is the color name
                swatch_parts = swatch_data.split('"')
                color_names.append(swatch_parts[7].split(',')[1].strip())
                thumb_links.append(swatch_parts[3])
                color_sizes.append(product_color['data-size-value'])

            tshirt_info['colors'] = color_names
            tshirt_info['color_sizes'] = color_sizes
            tshirt_info['thumb_links'] = thumb_links

            # accessing the main page of the current product to extract rating value/count, and other attributes
            product_mainpage = _fetch_page(tshirt_info['link'])
            # if the page could not be fetched, parse an empty document: the
            # listing data is still saved and page-level fields become 'unknown'
            soup_product = BeautifulSoup(
                product_mainpage.content if product_mainpage is not None else b'',
                'html.parser',
            )

            # average rating values & review count ('unknown' when the page has none)
            tshirt_info['rating'] = _node_text(
                soup_product.find("span", class_="bvseo-ratingValue")
            ).strip()
            tshirt_info['review_count'] = _node_text(
                soup_product.find("span", class_="bvseo-reviewCount")
            ).strip()

            #product_avg_fit  # 0-very small, 100-very large; 50-suitable --- inaccessible
            #product_avg_length  # 0-very short, 100-very long; 50-suitable --- inaccessible
            #product_avg_quality  # 0-poor, 100-perfect --- inaccessible

            # material, care advice and Is-imported: lines 1..3 of the specification list,
            # padded with 'unknown' when the list is missing or shorter than expected
            spec_node = soup_product.find("ul", class_="productSpecification")
            spec_lines = spec_node.get_text().split('\n')[1:4] if spec_node is not None else []
            spec_lines = spec_lines + [UNKNOWN] * (3 - len(spec_lines))
            tshirt_info['materials'], tshirt_info['care_advice'], tshirt_info['origin'] = spec_lines

            # Description of product features and highlights
            details_node = soup_product.find(id="collapsible-details-1")
            summary_node = details_node.find("p") if details_node is not None else None
            tshirt_info['summary'] = _node_text(summary_node)
            product_details = []
            if details_node is not None:
                product_details = [detail.get_text() for detail in details_node.find_all("li")]
            tshirt_info['details'] = product_details

            tshirt_info['scrape_time'] = str(datetime.now())

            # keep the record in memory; each one is its own dict, so this is safe
            tshirt_records.append(tshirt_info)

            # write the current product info into the output file, one JSON object per line
            print('=======' + tshirt_info['category'] + '-' + tshirt_info['name'] + '=======')
            with open(OUTPUT_PATH, 'a') as outfile:
                json.dump(tshirt_info, outfile)
                # the newline separates records; without it the file is not parseable
                outfile.write('\n')

            # pause between each access to the main page of a single product
            time.sleep(PAUSE_SECONDS)



200
Essential:  14
Fashion:  10
Sleeveless and Tank Tops:  15
Active:  6
Crop Tops:  8
From The Men's Section:  14
200
Essential:  16
Active:  13
Oversized:  6
Tank Tops:  2
200
Long Sleeve:  5
Short Sleeve:  6


