In [1]:
# !pip install requests-html selenium arsenic pandas

##  Sync vs Async

The Chess Game Analogy

Consecutive vs Concurrent

In [2]:
%%time

import time

iteration_times = [1, 3, 2, 4]


def sleeper(seconds, i=-1):
    """Synchronously pause for ``seconds`` seconds.

    When ``i`` is anything other than the default ``-1``, a line of the form
    ``<i><TAB><seconds>s`` is printed before the pause starts.
    """
    announce = i != -1
    if announce:
        print(f"{i}\t{seconds}s")
    # time.sleep blocks the calling thread for the whole duration.
    time.sleep(seconds)


def run():
    """Call ``sleeper`` once per entry of ``iteration_times``, one after another.

    Each call receives the entry's position as ``i`` so that it is announced.
    """
    for position, delay in enumerate(iteration_times):
        sleeper(delay, i=position)
    
# run()

CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 7.39 µs


In [3]:
start = time.time()
iteration_times = [1, 3, 2, 1]
import asyncio

async def a_sleeper(seconds, i=-1):
    """Coroutine that announces itself, sleeps, and reports the elapsed time.

    When ``i`` is not ``-1`` a ``<i><TAB><seconds>s`` line is printed first.
    After awaiting ``asyncio.sleep(seconds)`` it prints the number of seconds
    elapsed since the module-level ``start`` timestamp and returns ``"abc"``.
    """
    should_announce = i != -1
    if should_announce:
        print(f"{i}\t{seconds}s")

    # Awaiting asyncio.sleep suspends only this coroutine.
    await asyncio.sleep(seconds)

    elapsed = time.time() - start
    print(f"{i} done {elapsed}")
    return "abc"

async def a_run():
    """Schedule one ``a_sleeper`` task per entry of ``iteration_times``.

    The tasks are created but not awaited here, so the returned list contains
    ``asyncio.Task`` objects (which may still be pending), not their results.
    """
    return [
        asyncio.create_task(a_sleeper(delay, i=index))
        for index, delay in enumerate(iteration_times)
    ]
    
# a_run() only schedules the tasks and returns them without awaiting them,
# so `results` holds Task objects rather than the "abc" return values.
results = await a_run()
print(results)
# Despite its name, `end` is the number of seconds elapsed since `start`.
end = time.time() - start

print(end)

[<Task pending name='Task-2' coro=<a_sleeper() running at <ipython-input-3-b040379a2390>:5>>, <Task pending name='Task-3' coro=<a_sleeper() running at <ipython-input-3-b040379a2390>:5>>, <Task pending name='Task-4' coro=<a_sleeper() running at <ipython-input-3-b040379a2390>:5>>, <Task pending name='Task-5' coro=<a_sleeper() running at <ipython-input-3-b040379a2390>:5>>]
0.0007688999176025391
0	1s
1	3s
2	2s
3	1s


## Blocking & Timeouts

In [4]:
def sleeper(seconds, i=-1):
    """Blocking sleep helper (re-declared here with the same body as earlier).

    Prints ``<i><TAB><seconds>s`` first when ``i`` differs from the default
    ``-1``, then blocks the calling thread for ``seconds`` seconds.
    """
    has_label = i != -1
    if has_label:
        print(f"{i}\t{seconds}s")
    time.sleep(seconds)

sleeper(12)

In [5]:
async def asleeper(seconds, i=-1):
    """Awaitable sleep: optionally print an ``a``-prefixed label, then pause.

    When ``i`` is not ``-1`` the line ``a<i><TAB><seconds>s`` is printed
    before the pause.
    """
    wants_label = i != -1
    if wants_label:
        print(f"a{i}\t{seconds}s")
    # asyncio.sleep is used instead of time.sleep so the pause is awaitable.
    await asyncio.sleep(seconds)
    
await asleeper(12)

0 done 12.013737201690674
3 done 12.01388692855835
2 done 12.013914108276367
1 done 12.013946056365967


In [6]:
print("hello word")

hello word


In [7]:
# Fetch the current event loop so a task can be scheduled on it directly.
loop = asyncio.get_event_loop()
# loop = asyncio.new_event_loop()
# asyncio.run()


# Schedule the coroutine as a task without awaiting it: the cell returns
# immediately and displays the still-pending Task.
loop.create_task(asleeper(123))

<Task pending name='Task-7' coro=<asleeper() running at <ipython-input-5-f7aa28347698>:1>>

In [8]:
print("hello word")

hello word


In [9]:
done, pending = await asyncio.wait([asleeper(1), asleeper(123)], timeout=2)
done, pending

({<Task finished name='Task-10' coro=<asleeper() done, defined at <ipython-input-5-f7aa28347698>:1> result=None>},
 {<Task pending name='Task-9' coro=<asleeper() running at <ipython-input-5-f7aa28347698>:5> wait_for=<Future pending cb=[<TaskWakeupMethWrapper object at 0x7ff590386eb0>()]>>})

In [10]:
done

{<Task finished name='Task-10' coro=<asleeper() done, defined at <ipython-input-5-f7aa28347698>:1> result=None>}

In [11]:
pending

{<Task pending name='Task-9' coro=<asleeper() running at <ipython-input-5-f7aa28347698>:5> wait_for=<Future pending cb=[<TaskWakeupMethWrapper object at 0x7ff590386eb0>()]>>}

In [12]:
# await asyncio.wait(pending)

In [13]:
await asyncio.wait_for(asleeper(5), timeout=3)

TimeoutError: 

In [None]:
# asleeper(5) needs 5 seconds but only 3 are allowed, so wait_for raises
# asyncio.TimeoutError; catch it here instead of letting the traceback surface.
try:
    await asyncio.wait_for(asleeper(5), timeout=3)
except asyncio.TimeoutError:
    print("Task failed")

In [None]:
async def asleeper_timeout(seconds, i=-1, timeout=4):
    """Sleep for ``seconds`` seconds, giving up after ``timeout`` seconds.

    Prints ``a<i><TAB><seconds>s`` first when ``i`` is not ``-1``. The sleep
    is wrapped in ``asyncio.wait_for``, so ``asyncio.TimeoutError`` is raised
    when it does not finish within ``timeout``.
    """
    wants_label = i != -1
    if wants_label:
        print(f"a{i}\t{seconds}s")
    nap = asyncio.sleep(seconds)
    await asyncio.wait_for(nap, timeout=timeout)
    
await asleeper_timeout(12, timeout=1)

## Scraping with Selenium - Synchronous
New to Selenium and web scraping? Watch [this series](https://kirr.co/dwy90n).

In [20]:
url = 'https://www.spoonflower.com/en/shop?on=fabric'

In [29]:
import re
import requests
from requests_html import HTML
import pandas as pd

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

In [19]:
def scraper(url):
    """Load ``url`` in headless Chrome and return the page source.

    Args:
        url: Address of the page to open.

    Returns:
        The ``page_source`` reported by the driver after ``get(url)``.
    """
    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        return driver.page_source
    finally:
        # Always shut the browser down, even if loading the page fails;
        # otherwise every call leaves a headless Chrome process behind.
        driver.quit()


# /en/fabric/7137786-genevieve-floral-by-crystal_walen
def extract_id_slug(url_path):
    """Split a URL path into the numeric id and slug of its last segment.

    The path must contain at least one ``/`` and end with a segment shaped
    like ``<digits>-<slug>``, where the slug consists of word characters,
    underscores and hyphens.

    Returns:
        ``(id, slug)`` as strings on a match, otherwise ``(None, None)``.
    """
    pattern = r"^[^\s]+/(?P<id>\d+)-(?P<slug>[\w_-]+)$"
    match = re.match(pattern, url_path)
    if match:
        return match.group("id"), match.group("slug")
    return None, None

In [21]:
content = scraper(url)


In [23]:
html_r = HTML(html=content)

In [28]:
# Keep only the links that point at fabric detail pages.
fabric_links = [link for link in html_r.links if link.startswith("/en/fabric")]

# Build one record per fabric link; the DataFrame is created from this list.
datas = []
for path in fabric_links:
    id_, slug_ = extract_id_slug(path)
    print(id_, slug_)
    # "scraped" is a 1 / 0 flag (True / False); every link starts at 0.
    datas.append({"id": id_, "slug": slug_, "path": path, "scraped": 0})

7137786 genevieve-floral-by-crystal_walen
8286001 hanging-out-by-sarah_knight
4893900 half-scale-m81-woodland-camo-by-ricraynor
8056679 ruth-bader-ginsgurg-rbg-bust-black-by-katerhees
8197261 night-sky-stars-midnight-blue-by-at_the_cottage
8692520 bees-lemons-large-blue-by-fernlesliestudio
5048115 mexican-blanket-by-anchored_by_love
2920223 m81-woodland-camo-by-ricraynor
9060289 saints-fleur-de-lis-new-orleans-saints-football-football-fabric-fleur-de-lis-fabric-black-gold-gold-f-by-charlottewinter
7137898 sierra-floral-by-crystal_walen
6178734 fable-floral-blush-med-by-nouveau_bohemian
1096407 skull-wall-by-ben_goetting
6852245 cute-nurse-love-black-no-gradient-by-jannasalak
5469666 galaxy-far-far-away-gray-by-studiofibonacci
6327300 call-mountains-evergreen-med-by-nouveau_bohemian
2330040 maryland-flags-by-elramsay
5700186 puzzle-hearts-by-designedbygeeks
5880084 mod-triangles-gold-indigo-by-crystal_walen
7216659 rainbow-stars-watercolor-abstract-small-by-crystal_walen
5588706 black-l

In [30]:
df = pd.DataFrame(datas)
df.head()

Unnamed: 0,id,slug,path,scraped
0,7137786,genevieve-floral-by-crystal_walen,/en/fabric/7137786-genevieve-floral-by-crystal...,0
1,8286001,hanging-out-by-sarah_knight,/en/fabric/8286001-hanging-out-by-sarah_knight,0
2,4893900,half-scale-m81-woodland-camo-by-ricraynor,/en/fabric/4893900-half-scale-m81-woodland-cam...,0
3,8056679,ruth-bader-ginsgurg-rbg-bust-black-by-katerhees,/en/fabric/8056679-ruth-bader-ginsgurg-rbg-bus...,0
4,8197261,night-sky-stars-midnight-blue-by-at_the_cottage,/en/fabric/8197261-night-sky-stars-midnight-bl...,0


In [33]:
df.to_csv("local.csv", index=False)

In [34]:
pd.read_csv("local.csv")

Unnamed: 0,id,slug,path,scraped
0,7137786,genevieve-floral-by-crystal_walen,/en/fabric/7137786-genevieve-floral-by-crystal...,0
1,8286001,hanging-out-by-sarah_knight,/en/fabric/8286001-hanging-out-by-sarah_knight,0
2,4893900,half-scale-m81-woodland-camo-by-ricraynor,/en/fabric/4893900-half-scale-m81-woodland-cam...,0
3,8056679,ruth-bader-ginsgurg-rbg-bust-black-by-katerhees,/en/fabric/8056679-ruth-bader-ginsgurg-rbg-bus...,0
4,8197261,night-sky-stars-midnight-blue-by-at_the_cottage,/en/fabric/8197261-night-sky-stars-midnight-bl...,0
...,...,...,...,...
79,509390,spoonflower-color-map-by-spoonflower_help,/en/fabric/509390-spoonflower-color-map-by-spo...,0
80,1112778,rosie-riveter-by-spacefem,/en/fabric/1112778-rosie-riveter-by-spacefem,0
81,9453318,african-american-girls-retro-pop-art-by-whimsi...,/en/fabric/9453318-african-american-girls-retr...,0
82,5247883,hexo-blue-med-by-nouveau_bohemian,/en/fabric/5247883-hexo-blue-med-by-nouveau_bo...,0


## Asynchronous Scraping with `chromedriver` and `arsenic`

[arsenic Docs](https://arsenic.readthedocs.io/en/latest/)

In [37]:
# !pip install arsenic