In [1]:
# !pip install requests-html selenium arsenic pandas

##  Sync vs Async

The Chess Game Analogy

Consecutive vs Concurrent

In [2]:
%%time

import time

iteration_times = [1, 3, 2, 4]


def sleeper(seconds, i=-1):
    """Block the calling thread for ``seconds`` seconds.

    Args:
        seconds: How long to sleep, passed straight to ``time.sleep``.
        i: Optional label. When it is anything other than ``-1`` a
            "<i><TAB><seconds>s" line is printed before sleeping.

    Returns:
        None.
    """
    has_label = i != -1
    if has_label:
        print(f"{i}\t{seconds}s")
    time.sleep(seconds)


def run():
    """Call ``sleeper`` once per entry of ``iteration_times``, one after another.

    Each call receives the entry's position in the list as its label ``i``.
    """
    position = 0
    for second in iteration_times:
        sleeper(second, i=position)
        position += 1
    
# run()

CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 8.82 µs


In [3]:
# Reference timestamp; a_sleeper reads this global to report elapsed time.
start = time.time()
# Sleep durations in seconds, one per task (rebinds the list from the sync example).
iteration_times = [1, 3, 2, 1]
import asyncio

async def a_sleeper(seconds, i=-1):
    """Sleep for ``seconds`` via ``asyncio.sleep`` and report the elapsed time.

    Args:
        seconds: Duration handed to ``asyncio.sleep``.
        i: Optional label. When it is not ``-1`` a "<i><TAB><seconds>s"
            line is printed before sleeping.

    Returns:
        The string ``"abc"``.
    """
    wants_banner = i != -1
    if wants_banner:
        print(f"{i}\t{seconds}s")
    # Awaiting the sleep coroutine suspends only this coroutine.
    await asyncio.sleep(seconds)

    # Elapsed time is measured against the module-level ``start`` timestamp.
    ellap = time.time() - start
    print(f"{i} done {ellap}")
    return "abc"

async def a_run():
    """Schedule one ``a_sleeper`` task per entry of ``iteration_times``.

    Returns:
        The list of ``asyncio.Task`` objects, in the same order as
        ``iteration_times``. The tasks are only created here; this
        coroutine does not await them.
    """
    return [
        asyncio.create_task(a_sleeper(duration, i=index))
        for index, duration in enumerate(iteration_times)
    ]
    
# a_run() only schedules the tasks and returns the Task objects; nothing here
# awaits them, so the list printed below holds tasks that have not finished.
results = await a_run()
print(results)
# Measures the time needed to schedule the tasks, not to run them to completion.
end = time.time() - start

print(end)

[<Task pending name='Task-2' coro=<a_sleeper() running at <ipython-input-3-b040379a2390>:5>>, <Task pending name='Task-3' coro=<a_sleeper() running at <ipython-input-3-b040379a2390>:5>>, <Task pending name='Task-4' coro=<a_sleeper() running at <ipython-input-3-b040379a2390>:5>>, <Task pending name='Task-5' coro=<a_sleeper() running at <ipython-input-3-b040379a2390>:5>>]
0.0007050037384033203
0	1s
1	3s
2	2s
3	1s


## Blocking & Timeouts

In [4]:
def sleeper(seconds, i=-1):
    """Blocking sleep helper (same behavior as the ``sleeper`` defined earlier in the notebook).

    Args:
        seconds: Duration passed to ``time.sleep``.
        i: Optional label; ``-1`` means "do not print anything".

    Returns:
        None.
    """
    if i == -1:
        # Unlabelled call: sleep silently.
        time.sleep(seconds)
        return
    print(f"{i}\t{seconds}s")
    time.sleep(seconds)

# Synchronous call: time.sleep() blocks for the full 12 seconds before the cell returns.
sleeper(12)

In [5]:
async def asleeper(seconds, i=-1):
    """Asynchronous counterpart of ``sleeper``.

    Args:
        seconds: Duration handed to ``asyncio.sleep``.
        i: Optional label. When it is not ``-1`` an "a<i><TAB><seconds>s"
            line is printed before sleeping.

    Returns:
        None.
    """
    # asyncio.sleep is used here in place of the blocking time.sleep.
    if not i == -1:
        print(f"a{i}\t{seconds}s")
    pause = asyncio.sleep(seconds)
    await pause
    
# Top-level await: the cell completes once the 12 second asyncio.sleep inside asleeper finishes.
await asleeper(12)

0 done 12.023157119750977
3 done 12.023277044296265
2 done 12.023310899734497
1 done 12.023338079452515


In [6]:
print("hello word")

hello word


In [7]:
# Fetch asyncio's current event loop so a task can be scheduled on it directly.
loop = asyncio.get_event_loop()
# Alternatives: loop = asyncio.new_event_loop()
#               asyncio.run()


# Schedule asleeper(123) without awaiting it; the resulting Task is the cell's value.
loop.create_task(asleeper(123))

<Task pending name='Task-7' coro=<asleeper() running at <ipython-input-5-f7aa28347698>:1>>

In [8]:
print("hello word")

hello word


In [9]:
done, pending = await asyncio.wait([asleeper(1), asleeper(123)], timeout=2)
done, pending

({<Task finished name='Task-10' coro=<asleeper() done, defined at <ipython-input-5-f7aa28347698>:1> result=None>},
 {<Task pending name='Task-9' coro=<asleeper() running at <ipython-input-5-f7aa28347698>:5> wait_for=<Future pending cb=[<TaskWakeupMethWrapper object at 0x7fdbd83216a0>()]>>})

In [10]:
done

{<Task finished name='Task-10' coro=<asleeper() done, defined at <ipython-input-5-f7aa28347698>:1> result=None>}

In [11]:
pending

{<Task pending name='Task-9' coro=<asleeper() running at <ipython-input-5-f7aa28347698>:5> wait_for=<Future pending cb=[<TaskWakeupMethWrapper object at 0x7fdbd83216a0>()]>>}

In [12]:
# await asyncio.wait(pending)

In [13]:
# wait_for() allows asleeper(5) only 3 seconds, so this cell ends in an unhandled TimeoutError.
await asyncio.wait_for(asleeper(5), timeout=3)

TimeoutError: 

In [14]:
# Same 3 second limit on a 5 second sleep, but the timeout is caught and reported
# instead of propagating out of the cell.
try:
    await asyncio.wait_for(asleeper(5), timeout=3)
except asyncio.TimeoutError:
    print("Task failed")

Task failed


In [15]:
async def asleeper_timeout(seconds, i=-1, timeout=4):
    """Sleep asynchronously for ``seconds``, giving up after ``timeout`` seconds.

    Args:
        seconds: Duration handed to ``asyncio.sleep``.
        i: Optional label. When it is not ``-1`` an "a<i><TAB><seconds>s"
            line is printed first.
        timeout: Limit passed to ``asyncio.wait_for``.

    Raises:
        asyncio.TimeoutError: If the sleep does not finish within ``timeout``.
    """
    announce = i != -1
    if announce:
        print(f"a{i}\t{seconds}s")
    nap = asyncio.sleep(seconds)
    await asyncio.wait_for(nap, timeout=timeout)
    
# A 12 second sleep under a 1 second limit: this call raises TimeoutError.
await asleeper_timeout(12, timeout=1)

TimeoutError: 

## Scraping with Selenium - Synchronous
New to Selenium and web scraping? Watch [this series](https://kirr.co/dwy90n).

In [16]:
url = 'https://www.spoonflower.com/en/shop?on=fabric'

In [17]:
import re
import requests
from requests_html import HTML
import pandas as pd

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

In [18]:
def scraper(url):
    """Load ``url`` in headless Chrome and return the page source.

    Args:
        url: Address passed to ``driver.get``.

    Returns:
        The value of ``driver.page_source`` after the page has been loaded.
    """
    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        return driver.page_source
    finally:
        # Always shut the browser down, even if loading the page fails;
        # otherwise every call leaves a Chrome/chromedriver process behind.
        driver.quit()


# /en/fabric/7137786-genevieve-floral-by-crystal_walen
def extract_id_slug(url_path):
    """Split a fabric URL path into its numeric id and slug.

    Example: ``/en/fabric/7137786-genevieve-floral-by-crystal_walen`` gives
    ``("7137786", "genevieve-floral-by-crystal_walen")``.

    Args:
        url_path: Path whose last segment looks like ``<digits>-<slug>``.

    Returns:
        A ``(id, slug)`` tuple of strings, or ``(None, None)`` when the path
        does not match the expected pattern.
    """
    regex = r"^[^\s]+/(?P<id>\d+)-(?P<slug>[\w_-]+)$"
    found = re.match(regex, url_path)
    return (found['id'], found['slug']) if found else (None, None)

In [19]:
content = scraper(url)

In [20]:
# Parse the page source returned by scraper() so its links can be inspected.
html_r = HTML(html=content)

# Keep only the links that start with the fabric path prefix.
fabric_links = list(filter(lambda href: href.startswith("/en/fabric"), html_r.links))

# Build one record per fabric link; "scraped" is a 0/1 flag standing in for False/True.
datas = []
for path in fabric_links:
    id_, slug_ = extract_id_slug(path)
    print(id_, slug_)
    data = dict(id=id_, slug=slug_, path=path, scraped=0)
    datas.append(data)

8692520 bees-lemons-large-blue-by-fernlesliestudio
4352750 loteria-by-jellymania
7137786 genevieve-floral-by-crystal_walen
6650975 love-nurse-charcoal-gray-by-phyllisdobbs
5544045 napoleonic-bees-faux-gilt-on-blackest-black-by-peacoquettedesigns
7944022 golden-girls-illustration-peach-by-yesterdaycollection
7580754 ibd-gracie-grace-golden-jumbo-by-indybloomdesign
5247883 hexo-blue-med-by-nouveau_bohemian
6444170 catching-fireflies-by-thestorysmith
7236018 australian-native-eucalyptus-leaves-edition-1-australiana-fabric-wallpaper-by-erin__kendal
3817098 math-count-on-by-sammyk
9060289 saints-fleur-de-lis-new-orleans-saints-football-football-fabric-fleur-de-lis-fabric-black-gold-gold-f-by-charlottewinter
7368347 dear-clementine-oranges-teal-by-crystal_walen
6812243 cute-kawaii-sushi-small-size-by-penguinhouse
7216659 rainbow-stars-watercolor-abstract-small-by-crystal_walen
4995362 heart-health-awareness-black-by-ohdarkthirty
5048115 mexican-blanket-by-anchored_by_love
6327300 call-mounta

In [21]:
df = pd.DataFrame(datas)
df.head()

Unnamed: 0,id,slug,path,scraped
0,8692520,bees-lemons-large-blue-by-fernlesliestudio,/en/fabric/8692520-bees-lemons-large-blue-by-f...,0
1,4352750,loteria-by-jellymania,/en/fabric/4352750-loteria-by-jellymania,0
2,7137786,genevieve-floral-by-crystal_walen,/en/fabric/7137786-genevieve-floral-by-crystal...,0
3,6650975,love-nurse-charcoal-gray-by-phyllisdobbs,/en/fabric/6650975-love-nurse-charcoal-gray-by...,0
4,5544045,napoleonic-bees-faux-gilt-on-blackest-black-by...,/en/fabric/5544045-napoleonic-bees-faux-gilt-o...,0


In [22]:
df.to_csv("local.csv", index=False)

In [23]:
pd.read_csv("local.csv")

Unnamed: 0,id,slug,path,scraped
0,8692520,bees-lemons-large-blue-by-fernlesliestudio,/en/fabric/8692520-bees-lemons-large-blue-by-f...,0
1,4352750,loteria-by-jellymania,/en/fabric/4352750-loteria-by-jellymania,0
2,7137786,genevieve-floral-by-crystal_walen,/en/fabric/7137786-genevieve-floral-by-crystal...,0
3,6650975,love-nurse-charcoal-gray-by-phyllisdobbs,/en/fabric/6650975-love-nurse-charcoal-gray-by...,0
4,5544045,napoleonic-bees-faux-gilt-on-blackest-black-by...,/en/fabric/5544045-napoleonic-bees-faux-gilt-o...,0
...,...,...,...,...
79,3840217,nurse-theme-by-hot4tees_bg-yahoo_com,/en/fabric/3840217-nurse-theme-by-hot4tees_bg-...,0
80,6864327,love-lips-red-by-hipkiddesigns,/en/fabric/6864327-love-lips-red-by-hipkiddesigns,0
81,6650888,love-nurse-whimsy-blue-by-phyllisdobbs,/en/fabric/6650888-love-nurse-whimsy-blue-by-p...,0
82,5964319,hearts-on-grey-linen-valentines-day-by-littlea...,/en/fabric/5964319-hearts-on-grey-linen-valent...,0


## Asynchronous Scraping with `chromedriver` and `arsenic`

[arsenic Docs](https://arsenic.readthedocs.io/en/latest/)

In [24]:
# !pip install arsenic

In [41]:
%%writefile async_scrape.py

import os
import asyncio
from arsenic import get_session, keys, browsers, services
import pandas as pd
from requests_html import HTML
import itertools
import re
import time
import pathlib


# /en/fabric/7137786-genevieve-floral-by-crystal_walen
async def extract_id_slug(url_path):
    """Split a fabric URL path into ``(id, slug)``.

    Args:
        url_path: Path whose last segment looks like ``<digits>-<slug>``.

    Returns:
        A tuple of two strings, or ``(None, None)`` when the path does not
        match the pattern.
    """
    regex = r"^[^\s]+/(?P<id>\d+)-(?P<slug>[\w_-]+)$"
    matched = re.match(regex, url_path)
    if matched is None:
        return None, None
    return matched.group('id'), matched.group('slug')



async def get_links(body_content):
    """Collect a record for every fabric link found in an HTML document.

    Args:
        body_content: HTML source handed to ``requests_html.HTML``.

    Returns:
        A list of dicts with keys ``id``, ``slug``, ``path`` and ``scraped``
        (``scraped`` is always ``0``, a 0/1 flag standing in for False/True).
    """
    html_r = HTML(html=body_content)
    datas = []
    for path in html_r.links:
        # Skip anything that is not under the fabric path prefix.
        if not path.startswith("/en/fabric"):
            continue
        id_, slug_ = await extract_id_slug(path)
        datas.append({"id": id_, "slug": slug_, "path": path, "scraped": 0})
    return datas

async def scraper(url):
    """Open ``url`` in an arsenic Chrome session and return the page source.

    Args:
        url: Address passed to ``session.get``.

    Returns:
        Whatever ``session.get_page_source()`` yields for the loaded page.
    """
    service = services.Chromedriver()
    chrome_options = {'args': ['--headless', '--disable-gpu']}
    browser = browsers.Chrome(chromeOptions=chrome_options)
    # The session is entered and exited through the async context manager.
    async with get_session(service, browser) as session:
        await session.get(url)
        return await session.get_page_source()

async def store_links_as_df_pickle(datas=None, name='links.pkl'):
    """Turn link records into a DataFrame indexed by ``id`` and pickle it.

    Args:
        datas: Iterable of record dicts, each containing an ``id`` key.
            ``None`` (the default) is treated as "no records".
        name: Path of the pickle file to write.

    Returns:
        The DataFrame that was written. With no records this is an empty
        DataFrame (the original raised ``KeyError`` in that case because
        there was no ``id`` column to index on).

    Raises:
        KeyError: If records are present but lack an ``id`` column.
    """
    # A ``None`` default avoids sharing one mutable list between calls.
    records = [] if datas is None else datas
    df = pd.DataFrame(records)
    if not df.empty:
        # Non-inplace form: same result as inplace=True, without mutating in place.
        df = df.set_index('id', drop=True)
    df.to_pickle(name)
    return df
    
    
async def run(url):
    """Scrape ``url``, extract its fabric links, pickle them, and return them.

    Args:
        url: Page to scrape.

    Returns:
        The list of link records produced by ``get_links``.
    """
    page_source = await scraper(url)
    link_records = await get_links(page_source)
    # The DataFrame returned by the store step is not needed here.
    await store_links_as_df_pickle(link_records)
    return link_records
    
if __name__ == "__main__":
    # Script entry point: scrape the shop listing and print the extracted link records.
    url = 'https://www.spoonflower.com/en/shop?on=fabric'
    print(asyncio.run(run(url)))


Overwriting async_scrape.py


In [43]:
!python async_scrape.py

https://www.spoonflower.com/en/shop?on=fabric


In [36]:
# Load the frame pickled under the default file name used by store_links_as_df_pickle.
# Note: this rebinds `df`, which earlier held the frame built from `datas`.
name = 'links.pkl'
df = pd.read_pickle(name)
df.head()

Unnamed: 0_level_0,slug,path,scraped
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7137786,genevieve-floral-by-crystal_walen,/en/fabric/7137786-genevieve-floral-by-crystal...,0
4893900,half-scale-m81-woodland-camo-by-ricraynor,/en/fabric/4893900-half-scale-m81-woodland-cam...,0
7661255,just-jellies-jellyfish-by-katerhees,/en/fabric/7661255-just-jellies-jellyfish-by-k...,0
5034356,80s-hair-dryers-by-cjldesigns,/en/fabric/5034356-80s-hair-dryers-by-cjldesigns,0
2623675,black-white-music-notes-by-inspirationz,/en/fabric/2623675-black-white-music-notes-by-...,0


In [33]:
df.shape

(84, 3)