In [None]:
from multiprocessing import Pool
from lxml import etree
import aiohttp
import asyncio
import time

In [None]:
!pip install nest_asyncio --quiet
import nest_asyncio
nest_asyncio.apply()

Exercise: Use asyncio for the HTTP requests and multiprocessing for parsing the HTML. Compare the performance with the approaches used in the lecture notes.



In [None]:
# Abstract pages of the ten arXiv papers 2201.00001 through 2201.00010.
urls = [
    'https://arxiv.org/abs/2201.000%02d' % paper_no
    for paper_no in range(1, 11)
]

In [None]:
# Shared accumulator for downloaded page bodies. It is bound to a real list
# here because a bare `htmls: list` annotation does not create the name, which
# would make `get_html` fail with NameError unless some other cell had already
# assigned it.
htmls: list = []


async def get_html(url):
    """Download one page and record its body.

    Args:
        url: Address to fetch with an HTTP GET request.

    Returns:
        The body read from the response. The same object is also appended to
        the module-level ``htmls`` list; when several downloads run
        concurrently, entries land there in completion order, so use the
        return value when the result has to be matched to its URL.
    """
    async with aiohttp.ClientSession() as session:
        async with session.request('GET', url) as resp:
            html = await resp.read()
    # `htmls` is looked up at call time, so a cell that rebinds it is honoured.
    htmls.append(html)
    return html

def parse_html(html, cnt):
    """Extract the paper title from one HTML document and print it.

    Args:
        html: Page markup handed to ``etree.HTML``.
        cnt: Index shown in the printed ``Title <cnt>: ...`` line.

    The line and its trailing newline are emitted with a single ``write``
    call. ``print`` writes the text and the newline separately, which allows
    lines produced by several pool workers to run together on one row.
    """
    import sys  # local import: the notebook's import cell does not provide sys

    title = etree.HTML(html).xpath('//h1[contains(@class, "title")]/text()')
    sys.stdout.write('Title %d: %s\n' % (cnt, ''.join(title)))

Complete the following routines:

In [None]:
def main_get_html():
    """Download every address in ``urls`` concurrently with asyncio.

    One ``get_html`` task is created per URL and all of them are driven to
    completion on the current event loop before this function returns.
    Does nothing when ``urls`` is empty.

    Raises:
        BaseException: The first error raised by any download, re-raised only
            after every download has finished.
    """
    if not urls:
        return
    loop = asyncio.get_event_loop()
    # Wrap each coroutine in a task bound to `loop`: passing bare coroutines to
    # asyncio.wait() is deprecated since Python 3.8 and rejected from 3.11.
    tasks = [loop.create_task(get_html(url)) for url in urls]
    outcomes = loop.run_until_complete(
        asyncio.gather(*tasks, return_exceptions=True)
    )
    # Every task has finished at this point; surface the first failure rather
    # than leaving it unnoticed inside a task object.
    for outcome in outcomes:
        if isinstance(outcome, BaseException):
            raise outcome

def main_parse_html():
    """Parse every page in ``htmls`` with a pool of four worker processes.

    Each page is handed to ``parse_html`` together with its position in
    ``htmls``. The call blocks until all pages have been handled.

    Raises:
        Exception: Whatever a worker raised while parsing. The async results
            are read back so that such a failure is re-raised here instead of
            being lost.
    """
    # The context manager tears the pool down even if submitting a job fails.
    with Pool(4) as pool:
        pending = [
            pool.apply_async(parse_html, args=(html, i))
            for i, html in enumerate(htmls)
        ]
        pool.close()
        pool.join()
    # All jobs are finished after join(); get() re-raises a worker's exception.
    for job in pending:
        job.get()

In [None]:
%%time
# Start from an empty accumulator so re-running this cell does not pile up
# pages from a previous run.
htmls = []
# Stage 1 downloads the pages with asyncio; stage 2 parses them in a
# multiprocessing pool.
main_get_html()
main_parse_html()

Title 0: Modeling Advection on Directed Graphs using Matérn Gaussian Processes for Traffic Flow
Title 3: Simulating local fields in carbon nanotube reinforced composites for infinite strip with voidsTitle 2: Improving Deep Neural Network Classification Confidence using Heatmap-based eXplainable AITitle 1: Robust reliability-based topology optimization under random-field material modelTitle 4: Time-Dependent Duhamel Renormalization method with Multiple Conservation and Dissipation Laws


Title 6: Locally finite free space as limiting case of PT-symmetric medium
Title 5: A Lightweight and Accurate Spatial-Temporal Transformer for Traffic Forecasting

Title 8: A Literature Review on Length of Stay Prediction for Stroke Patients using Machine Learning and Statistical ApproachesTitle 9: Confidence-Aware Multi-Teacher Knowledge Distillation

Title 7: AttentionLight: Rethinking queue length and attention mechanism for traffic signal control
CPU times: user 89.3 ms, sys: 42.3 ms, total: 132 ms

How about using multiple threads for parsing the HTML? Try it out.

In [None]:
import threading

class ParseHtmlThread(threading.Thread):
    """Thread that extracts and prints the title of one HTML document.

    ``title`` is ``None`` until ``run`` has parsed the document; afterwards it
    holds the joined title text.
    """

    def __init__(self, cnt, html):
        """Store the job; nothing is parsed until the thread runs.

        Args:
            cnt: Index shown in the printed ``Title <cnt>: ...`` line.
            html: Page markup handed to ``etree.HTML``.
        """
        super().__init__()
        self.cnt = cnt
        self.html = html
        # Defined up front so that reading it before run() yields None instead
        # of raising AttributeError.
        self.title = None

    def run(self):
        """Parse the document, remember its title and print it."""
        import sys  # local import: the notebook's import cell does not provide sys

        fragments = etree.HTML(self.html).xpath('//h1[contains(@class, "title")]/text()')
        self.title = ''.join(fragments)
        # A single write for text plus newline keeps lines from concurrently
        # running threads from being spliced together.
        sys.stdout.write('Title %d: %s\n' % (self.cnt, self.title))


# Seconds to sleep between two successive liveness polls in process_requests.
UPDATE_INTERVAL = 0.01


def process_requests(threads):
    """Block until none of the given threads reports itself alive.

    Args:
        threads: Objects exposing ``is_alive()``. All of them are polled on
            every round, with a pause of ``UPDATE_INTERVAL`` seconds between
            rounds.
    """
    while True:
        still_running = sum(1 for worker in threads if worker.is_alive())
        if still_running == 0:
            return
        time.sleep(UPDATE_INTERVAL)

In [None]:
%%time
# Fresh accumulator, then download all pages.
htmls = []
main_get_html()

# One parser thread per downloaded page, numbered by its position in htmls.
threads = [ParseHtmlThread(*job) for job in enumerate(htmls)]
for thread in threads:
    thread.start()
# Poll until no parser thread is alive any more.
process_requests(threads)

Title 0: AttentionLight: Rethinking queue length and attention mechanism for traffic signal control
Title 1: Modeling Advection on Directed Graphs using Matérn Gaussian Processes for Traffic Flow
Title 2: Confidence-Aware Multi-Teacher Knowledge Distillation
Title 3: A Lightweight and Accurate Spatial-Temporal Transformer for Traffic Forecasting
Title 4: Locally finite free space as limiting case of PT-symmetric medium
Title 5: Robust reliability-based topology optimization under random-field material model
Title 6: Improving Deep Neural Network Classification Confidence using Heatmap-based eXplainable AI
Title 7: Time-Dependent Duhamel Renormalization method with Multiple Conservation and Dissipation Laws
Title 8: Simulating local fields in carbon nanotube reinforced composites for infinite strip with voids
Title 9: A Literature Review on Length of Stay Prediction for Stroke Patients using Machine Learning and Statistical Approaches
CPU times: user 87.8 ms, sys: 7.39 ms, total: 95.2 m