# Data Collection

This notebook will crawl the Farfetch website for product images & text descriptions. The scope is limited to the Women's Dresses category. We will crawl 100 pages with 90 products each, totalling 9000 products.

In [53]:
import os
import glob
import json
import requests
import pprint
import time
from tqdm.notebook import tqdm

import cv2
import pandas as pd
from bs4 import BeautifulSoup

In [3]:
# HTTP headers attached to every request made by get_session() and get_request().
# NOTE(review): presumably a browser-like User-Agent is needed for the site to
# serve regular pages to the crawler -- confirm.
headers = {
    'Origin': 'https://www.farfetch.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}

def get_session(url, timeout=30):
    """Create a ``requests.Session`` and issue one warm-up GET to ``url``.

    The response is not inspected; the request is made through the session so
    that any cookies the server sets are kept on the session for later calls.

    Args:
        url: Page requested once, using the module-level ``headers``.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error instead of blocking indefinitely.

    Returns:
        The ``requests.Session`` that issued the request.
    """
    session = requests.Session()
    session.get(url, headers=headers, timeout=timeout)
    return session

def get_request(session, url, timeout=30):
    """Fetch ``url`` through ``session`` and return the parsed HTML.

    Args:
        session: Object whose ``get`` method performs the HTTP request
            (a ``requests.Session`` in this notebook).
        url: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error instead of blocking indefinitely.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    res = session.get(url, headers=headers, timeout=timeout)
    # Fail here with the status code rather than parsing an error page and
    # hitting an opaque "NoneType has no attribute ..." further downstream.
    res.raise_for_status()
    soup = BeautifulSoup(res.content.decode(), 'html.parser')
    return soup

def get_product_desc(soup):
    """Return the text content of the ``<div id="productDesc">`` element of ``soup``."""
    desc_div = soup.find("div", {"id": "productDesc"})
    return desc_div.text

In [4]:
# Site root; the relative product links found on listing pages are appended to it.
base_url = "https://www.farfetch.com"
# Shared HTTP session, created with one initial request to the site root.
session = get_session(base_url)

# Crawl product list from category page & product image and description

We loop through each page & crawl the product id, url, description & image url for each product.

In [5]:
# `gender` and `category` are interpolated into the listing URL and into the
# names of the CSV files written by this notebook.
gender = "women"
category = "dresses-1"
# Number of the last listing page to crawl.
pages = 100

In [8]:
# Crawl every listing page and, for each product card on it, fetch the product
# page to extract its description.
#
# The session might time out from time to time, so the checkpoint CSV is written
# in a `finally` block: if the crawl is interrupted, every *fully* crawled page
# is still saved, in a file named after the last completed page. To resume, set
# `start_page` to the page after that one and re-run this cell.

start_page = 1
items = []
last_completed_page = None

try:
    for page in range(start_page, pages + 1):

        # make url
        page_url = f"https://www.farfetch.com/hk/shopping/{gender}/{category}/items.aspx?page={page}&view=90&sort=3&scale=274"
        print(page_url)

        # get div
        page_soup = get_request(session, page_url)
        div_all = page_soup.find("div", {"data-testid": "productArea"}).find_all("div", {"data-component": "ProductCard"})

        # Rows are collected per page and merged into `items` only once the page
        # is complete, so a half-crawled page never ends up in a checkpoint and
        # resuming from it does not create duplicate rows.
        page_items = []

        for div in tqdm(div_all, total=len(div_all)):
            # get link and name; the aria-label has the form "<brand>:<name>"
            a = div.find("a")
            href = a.get("href")
            brand, _, name = a.get("aria-label").partition(":")

            # get images
            m = div.find_all("meta", {"itemprop": "image"})[0]
            image = m.get("content")
            # use 240 px version
            image = image.replace("_480.jpg", "_240.jpg")

            # get product description
            product_url = base_url + href
            product_id = product_url.split(".aspx")[0].split("-")[-1]
            product_soup = get_request(session, product_url)
            desc = product_soup.find("div", {"data-tstid": "productDetails"}).get_text(separator='. ').strip().lower()
            # drop marketing tags from the description text
            desc = (
                desc.replace("conscious", "")
                .replace("new season", "")
                .replace("exclusive", "")
            )
            page_items.append({
                "gender": gender,
                "category": category,
                "product_id": product_id,
                "name": name,
                "brand": brand,
                "url": product_url,
                "product_desc": desc,
                "image": image
            })
            # short pause between product requests
            time.sleep(0.1)

        items.extend(page_items)
        last_completed_page = page
        # longer pause between listing pages
        time.sleep(0.5)
finally:
    # save csv (also runs when the loop above was interrupted)
    if items:
        filename = f"./{gender}-{category}-checkpoint-page-{last_completed_page}.csv"
        pd.DataFrame(items).to_csv(filename, index=False)
        print(f"saved to {filename}")

https://www.farfetch.com/hk/shopping/women/dresses-1/items.aspx?page=91&view=90&sort=3&scale=274


  0%|          | 0/90 [00:00<?, ?it/s]

https://www.farfetch.com/hk/shopping/women/dresses-1/items.aspx?page=92&view=90&sort=3&scale=274


  0%|          | 0/90 [00:00<?, ?it/s]

https://www.farfetch.com/hk/shopping/women/dresses-1/items.aspx?page=93&view=90&sort=3&scale=274


  0%|          | 0/90 [00:00<?, ?it/s]

https://www.farfetch.com/hk/shopping/women/dresses-1/items.aspx?page=94&view=90&sort=3&scale=274


  0%|          | 0/90 [00:00<?, ?it/s]

https://www.farfetch.com/hk/shopping/women/dresses-1/items.aspx?page=95&view=90&sort=3&scale=274


  0%|          | 0/90 [00:00<?, ?it/s]

https://www.farfetch.com/hk/shopping/women/dresses-1/items.aspx?page=96&view=90&sort=3&scale=274


  0%|          | 0/90 [00:00<?, ?it/s]

https://www.farfetch.com/hk/shopping/women/dresses-1/items.aspx?page=97&view=90&sort=3&scale=274


  0%|          | 0/90 [00:00<?, ?it/s]

https://www.farfetch.com/hk/shopping/women/dresses-1/items.aspx?page=98&view=90&sort=3&scale=274


  0%|          | 0/90 [00:00<?, ?it/s]

https://www.farfetch.com/hk/shopping/women/dresses-1/items.aspx?page=99&view=90&sort=3&scale=274


  0%|          | 0/90 [00:00<?, ?it/s]

https://www.farfetch.com/hk/shopping/women/dresses-1/items.aspx?page=100&view=90&sort=3&scale=274


  0%|          | 0/90 [00:00<?, ?it/s]

saved to ./women-dresses-1-checkpoint-page-100.csv


# Combine the checkpoint CSV files

In [33]:
# glob returns paths in arbitrary order; sorting them makes the row order of
# the combined table identical on every run. The order is lexicographic by
# file name, not numeric by page.
filenames = sorted(glob.glob(f"./{gender}-{category}-checkpoint*.csv"))
# ignore_index gives the combined frame a fresh 0..n-1 index.
df = pd.concat([pd.read_csv(f) for f in filenames], ignore_index=True)

In [61]:
# Preview the combined crawl results.
df

Unnamed: 0,gender,category,product_id,name,brand,url,product_desc,image
0,women,dresses-1,18093545,floral-print pleated georgette midi dress,GANNI,https://www.farfetch.com/hk/shopping/women/gan...,. . ganni . floral-print pleated georgette mid...,https://cdn-images.farfetch-contents.com/18/09...
1,women,dresses-1,17229007,open-back slip dress,Nanushka,https://www.farfetch.com/hk/shopping/women/nan...,. . nanushka . open-back slip dress . highligh...,https://cdn-images.farfetch-contents.com/17/22...
2,women,dresses-1,18310269,floral-print sweetheart neck dress,Reformation,https://www.farfetch.com/hk/shopping/women/ref...,. . reformation . floral-print sweetheart neck...,https://cdn-images.farfetch-contents.com/18/31...
3,women,dresses-1,16260247,embroidered logo T-shirt dress,Marine Serre,https://www.farfetch.com/hk/shopping/women/mar...,. . marine serre . embroidered logo t-shirt dr...,https://cdn-images.farfetch-contents.com/16/26...
4,women,dresses-1,17786874,3D jacquard warp dress,GANNI,https://www.farfetch.com/hk/shopping/women/gan...,. . ganni . 3d jacquard warp dress. highlights...,https://cdn-images.farfetch-contents.com/17/78...
...,...,...,...,...,...,...,...,...
8995,women,dresses-1,17496270,Trevor knit mini dress,Alice+Olivia,https://www.farfetch.com/hk/shopping/women/ali...,. alice+olivia . trevor knit mini dress. highl...,https://cdn-images.farfetch-contents.com/17/49...
8996,women,dresses-1,18097019,Qipao jersey short-sleeve dress,Shanghai Tang,https://www.farfetch.com/hk/shopping/women/sha...,. shanghai tang . qipao jersey short-sleeve dr...,https://cdn-images.farfetch-contents.com/18/09...
8997,women,dresses-1,17971157,jacquard off-shoulder minidress,MSGM,https://www.farfetch.com/hk/shopping/women/msg...,. msgm . jacquard off-shoulder minidress. this...,https://cdn-images.farfetch-contents.com/17/97...
8998,women,dresses-1,17820587,broderie anglaise long-sleeve dress,Marchesa Notte,https://www.farfetch.com/hk/shopping/women/mar...,. marchesa notte . broderie anglaise long-slee...,https://cdn-images.farfetch-contents.com/17/82...


In [62]:
# Persist the combined table as a single CSV, without the index column.
df.to_csv(f"./{gender}-{category}.csv", index=False)

# Download images

In [74]:
import contextlib
import joblib
from joblib import Parallel, delayed
from tqdm import tqdm

@contextlib.contextmanager
def tqdm_joblib(tqdm_object):
    """
    Temporarily make joblib advance the given tqdm progress bar.

    While the context is active, joblib's batch-completion callback class is
    replaced by a subclass that first updates ``tqdm_object`` by the size of the
    finished batch. On exit the original callback class is restored and the
    progress bar is closed.
    source: https://stackoverflow.com/questions/24983493/tracking-progress-of-joblib-parallel-execution/58936697#58936697
    """
    original_callback = joblib.parallel.BatchCompletionCallBack

    class _ProgressReportingCallback(original_callback):
        def __call__(self, *args, **kwargs):
            # advance the bar, then defer to joblib's own handling
            tqdm_object.update(n=self.batch_size)
            return super().__call__(*args, **kwargs)

    joblib.parallel.BatchCompletionCallBack = _ProgressReportingCallback
    try:
        yield tqdm_object
    finally:
        # always undo the patch, even if the parallel run raised
        joblib.parallel.BatchCompletionCallBack = original_callback
        tqdm_object.close()
        

def download_image(product_id, image_url, timeout=30):
    """ Download & save image to directory

    The image is stored at ``../data/farfetch/images/<first two characters of
    the id>/<id>.jpg``. Nothing is downloaded when that file already exists.

    Args:
        product_id: Product identifier; converted with ``str()`` so integer
            ids are accepted as well as strings.
        image_url: Address of the image to download.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error instead of blocking indefinitely.

    Returns:
        None.
    """
    # The id may arrive as an integer (pandas.read_csv infers numeric dtypes for
    # all-digit columns), and an integer cannot be sliced below.
    product_id = str(product_id)
    dest = f'../data/farfetch/images/{product_id[:2]}/{product_id}.jpg'
    if os.path.exists(dest):
        return

    os.makedirs(os.path.dirname(dest), exist_ok=True)

    resp = requests.get(image_url, timeout=timeout)
    if not resp.ok:
        # Do not store an error response as a .jpg: the exists-check above would
        # then skip this product on every later run.
        print(f"skipped {product_id}: HTTP {resp.status_code} for {image_url}")
        return
    with open(dest, 'wb') as f:
        f.write(resp.content)
        
# One delayed download task per row of the product table, evaluated lazily
# by joblib across 16 workers while the progress bar tracks completed batches.
download_jobs = (
    delayed(download_image)(product_id, image_url)
    for product_id, image_url in zip(df["product_id"], df["image"])
)
with tqdm_joblib(tqdm(total=len(df))):
    Parallel(n_jobs=16)(download_jobs)

100%|██████████████████████████████████████████████████████████████████████████████| 9000/9000 [11:49<00:00, 12.69it/s]
