In [1]:
# !pip install selenium webdriver-manager
# !apt-get update  # refresh Ubuntu's package lists so the apt install below succeeds
# !apt install chromium-chromedriver
# !cp /usr/lib/chromium-browser/chromedriver /usr/bin

In [3]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import os
import requests
from tqdm import tqdm
import numpy as np

In [4]:
def driver_setup():
    """Build and return a headless Chrome WebDriver.

    The chromedriver location is whatever ``ChromeDriverManager().install()``
    returns; it is wrapped in a ``Service`` and handed to ``webdriver.Chrome``
    together with the command-line flags configured below.

    Returns:
        The ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``quit()`` on it when finished.
    """
    chrome_flags = ('--headless', '--no-sandbox', '--disable-dev-shm-usage')

    options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        options.add_argument(flag)

    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=options)

def get_soup(URL, driver, timeout=30):
    """Fetch ``URL`` and return its browser-rendered HTML as a BeautifulSoup tree.

    A plain ``requests.get`` is issued first as a status probe; only when that
    response is OK is the page loaded in the Selenium ``driver`` and its
    ``page_source`` parsed with the ``html.parser`` backend.

    Args:
        URL: Address of the page to fetch.
        driver: Selenium WebDriver used to render the page.
        timeout: Seconds to wait for the probe request before giving up, so a
            stalled connection cannot block the calling thread indefinitely.

    Returns:
        The parsed ``BeautifulSoup`` object, or ``None`` when the probe request
        fails (network error, timeout, or an error status such as 404).
    """
    try:
        response = requests.get(URL, timeout=timeout)
    except requests.RequestException:
        # Connection errors and timeouts take the same "no soup" path as an
        # HTTP error status instead of crashing the worker thread.
        print("Error Fetching Soup, Scraping Ended")
        return None

    if not response.ok:
        # Error status such as 404.
        print("Error Fetching Soup, Scraping Ended")
        return None

    driver.get(URL)
    return BeautifulSoup(driver.page_source, 'html.parser')

def get_link_from_soup(soup):
    """Collect article links from one parsed listing page.

    Looks for the first ``<div id="tile-2">`` in ``soup``; inside it, every
    ``<div>`` whose class attribute is ``"toc grid clearfix"`` contributes the
    ``href`` of its first ``<a>`` that has one.

    Args:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        A set of href strings. The set is empty when the ``tile-2`` container
        is missing (a message is printed) or when no entry holds a link.
    """
    wanted_div = soup.find("div", {"id": "tile-2"})
    if wanted_div is None:
        print("HTML element not found, returning empty set")
        return set()

    link_list = set()
    for div in wanted_div.find_all("div", {"class": "toc grid clearfix"}):
        anchor = div.find("a", href=True)
        if anchor is None:
            # An entry without a link is skipped rather than subscripting None.
            continue
        link_list.add(anchor["href"])
    return link_list

def crawler(chunk, driver):
    """Scrape the listing pages whose numbers are in ``chunk``.

    Pages are visited in order at ``https://creative311.com/?paged=<n>``.
    The walk ends at the first page that either cannot be fetched or yields
    no links; everything gathered up to that point is returned.

    Args:
        chunk: Iterable of page numbers to visit.
        driver: Selenium WebDriver passed through to ``get_soup``.

    Returns:
        Set of links collected from the pages visited.
    """
    collected = set()
    for page_no in tqdm(chunk):
        page_soup = get_soup(f"https://creative311.com/?paged={page_no}", driver)
        if not page_soup:
            # Fetch failed (e.g. HTTP error such as 404): stop this chunk.
            break

        page_links = get_link_from_soup(page_soup)
        if len(page_links) == 0:
            # The tags we want were not found: stop this chunk.
            break

        collected.update(page_links)
    return collected

def save_link_as_txt(link_list, file_path="links/creative311.txt"):
    """Persist links to a text file, one per line, merging with existing content.

    If ``file_path`` already exists, its non-empty lines are read (with the
    trailing newline stripped so they compare equal to the in-memory links)
    and unioned with ``link_list``; the file is then rewritten with the merged
    set. Otherwise the parent directory is created if needed and a new file is
    written. Links are written in sorted order so the output is deterministic.

    Args:
        link_list: Iterable of link strings to save.
        file_path: Destination file; defaults to ``links/creative311.txt``.

    Returns:
        ``True`` once the file has been written.
    """
    merged = set(link_list)

    if os.path.isfile(file_path):
        with open(file_path, "r", encoding="utf-8") as file:
            for line in file:
                stored = line.rstrip("\n")
                if stored:
                    merged.add(stored)
    else:
        parent = os.path.dirname(file_path)
        if parent:
            # dirname is "" for a bare filename, which os.makedirs rejects.
            os.makedirs(parent, exist_ok=True)

    with open(file_path, "w", encoding="utf-8") as file:
        file.writelines(f"{link}\n" for link in sorted(merged))

    print(f"Links Saved in {file_path}")
    return True

In [21]:
# [driver.quit() for driver in drivers]

In [6]:
from concurrent.futures import ThreadPoolExecutor

# Crawl configuration: one Chrome driver per worker thread, each thread
# walking its own contiguous slice of page numbers.
thread = 8
last_page = 2080  # highest "?paged=" index to request

drivers = []
all_links = set()
try:
    # Drivers are created one at a time so that, if a later one fails to
    # start, the ones already running are still shut down in `finally`.
    for _ in range(thread):
        drivers.append(driver_setup())
    chunks = np.array_split(np.arange(1, last_page + 1), thread)

    with ThreadPoolExecutor(max_workers=thread) as executor:
        # executor.map pairs chunk i with driver i; list() collects every
        # chunk's result (re-raising any worker exception) before cleanup.
        bucket = list(executor.map(crawler, chunks, drivers))
finally:
    # Always release the browser processes, even when the crawl raised.
    for driver in drivers:
        driver.quit()

for i in bucket:
    all_links.update(i)

save_link_as_txt(all_links)

  0%|          | 0/260 [00:00<?, ?it/s]
[A


[A[A[A

[A[A



[A[A[A[A





[A[A[A[A[A[A




[A[A[A[A[A





  0%|          | 1/260 [00:07<31:36,  7.32s/it]


[A[A[A

[A[A




[A[A[A[A[A
[A



[A[A[A[A





[A[A[A[A[A[A




[A[A[A[A[A
  1%|          | 2/260 [00:11<23:43,  5.52s/it]

[A[A



[A[A[A[A


[A[A[A





[A[A[A[A[A[A




[A[A[A[A[A

[A[A
[A



[A[A[A[A


[A[A[A





  1%|          | 3/260 [00:16<21:37,  5.05s/it]




[A[A[A[A[A



[A[A[A[A

  2%|▏         | 4/260 [00:19<19:21,  4.54s/it]





[A[A[A[A[A[A


[A[A[A
[A




[A[A[A[A[A


[A[A[A

[A[A



[A[A[A[A





[A[A[A[A[A[A




  2%|▏         | 5/260 [00:24<19:32,  4.60s/it]
[A





[A[A[A[A[A[A




[A[A[A[A[A


[A[A[A



  2%|▏         | 6/260 [00:28<18:37,  4.40s/it]

[A[A
[A





[A[A[A[A[A[A




[A[A[A[A[A



[A[A[A[A

[A[A
[A





  3%|▎         | 7/260 [00:33<19:15,  4.5

Error Fetching Soup, Scraping Ended








[A[A[A[A[A


[A[A[A
[A





[A[A[A[A[A[A



[A[A[A[A

[A[A




[A[A[A[A[A


[A[A[A



[A[A[A[A
[A

[A[A





[A[A[A[A[A[A




[A[A[A[A[A


[A[A[A



[A[A[A[A





[A[A[A[A[A[A


[A[A[A




[A[A[A[A[A

[A[A



[A[A[A[A





[A[A[A[A[A[A
[A


[A[A[A




[A[A[A[A[A

[A[A



[A[A[A[A





[A[A[A[A[A[A
[A




[A[A[A[A[A


[A[A[A



[A[A[A[A





[A[A[A[A[A[A

[A[A




[A[A[A[A[A
[A



[A[A[A[A


[A[A[A





[A[A[A[A[A[A




[A[A[A[A[A

[A[A





[A[A[A[A[A[A
[A


[A[A[A

[A[A



[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A
[A


[A[A[A

[A[A



[A[A[A[A





[A[A[A[A[A[A




[A[A[A[A[A


[A[A[A

[A[A



[A[A[A[A
[A




[A[A[A[A[A





[A[A[A[A[A[A

[A[A


[A[A[A



[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A

[A[A
[A



[A[A[A[A




[A[A[A[A[A



Error Fetching Soup, Scraping Ended





[A[A





[A[A[A[A[A[A

[A[A



[A[A[A[A





[A[A[A[A[A[A




[A[A[A[A[A
[A





[A[A[A[A[A[A




[A[A[A[A[A

[A[A





[A[A[A[A[A[A



[A[A[A[A





[A[A[A[A[A[A




[A[A[A[A[A
[A

[A[A



[A[A[A[A
[A




[A[A[A[A[A

[A[A





[A[A[A[A[A[A




[A[A[A[A[A
[A



[A[A[A[A





[A[A[A[A[A[A

[A[A




[A[A[A[A[A



[A[A[A[A





[A[A[A[A[A[A
[A




[A[A[A[A[A

[A[A



[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A

[A[A
[A



[A[A[A[A





[A[A[A[A[A[A




[A[A[A[A[A

[A[A





[A[A[A[A[A[A
[A




[A[A[A[A[A





[A[A[A[A[A[A



[A[A[A[A
[A




[A[A[A[A[A

[A[A





[A[A[A[A[A[A




[A[A[A[A[A

[A[A



[A[A[A[A





[A[A[A[A[A[A
[A




[A[A[A[A[A

[A[A





[A[A[A[A[A[A




[A[A[A[A[A



[A[A[A[A
[A

[A[A





[A[A[A[A[A[A




[A[A[A[A[A



[A[A

HTML element not found, returning empty set









[A[A[A[A[A[A
[A



[A[A[A[A

[A[A





[A[A[A[A[A[A



[A[A[A[A
[A

[A[A





[A[A[A[A[A[A
[A



[A[A[A[A





[A[A[A[A[A[A

[A[A
[A





[A[A[A[A[A[A



[A[A[A[A

[A[A





[A[A[A[A[A[A



[A[A[A[A
[A

[A[A





[A[A[A[A[A[A



[A[A[A[A
[A

[A[A





[A[A[A[A[A[A



[A[A[A[A
[A





[A[A[A[A[A[A



[A[A[A[A

[A[A





[A[A[A[A[A[A
[A



[A[A[A[A





[A[A[A[A[A[A

[A[A





[A[A[A[A[A[A



[A[A[A[A
[A

[A[A





[A[A[A[A[A[A
[A



[A[A[A[A

[A[A





[A[A[A[A[A[A



[A[A[A[A
[A





[A[A[A[A[A[A

[A[A





[A[A[A[A[A[A



[A[A[A[A
[A

[A[A





[A[A[A[A[A[A



[A[A[A[A
[A

[A[A





[A[A[A[A[A[A



[A[A[A[A

[A[A
[A





[A[A[A[A[A[A



[A[A[A[A

[A[A
[A





 69%|██████▉   | 180/260 [10:29<04:39,  3.50s/it]
 88%|████████▊ | 230/260 [10:29<01:22,  2.74s/it]

Error Fetching Soup, Scraping Ended
Error Fetching Soup, Scraping Ended





[A[A



[A[A[A[A



[A[A[A[A

[A[A

[A[A



[A[A[A[A



[A[A[A[A

[A[A



[A[A[A[A

[A[A



[A[A[A[A

[A[A



[A[A[A[A

[A[A



[A[A[A[A

[A[A



[A[A[A[A

[A[A



[A[A[A[A

[A[A

[A[A



[A[A[A[A

[A[A



[A[A[A[A

[A[A



[A[A[A[A

[A[A



[A[A[A[A



[A[A[A[A

[A[A



[A[A[A[A

[A[A



[A[A[A[A

[A[A



[A[A[A[A

[A[A



[A[A[A[A

[A[A



[A[A[A[A

[A[A



[A[A[A[A

[A[A



[A[A[A[A



[A[A[A[A

[A[A



[A[A[A[A

[A[A



[A[A[A[A

[A[A



[A[A[A[A

[A[A



[A[A[A[A



[A[A[A[A

[A[A



[A[A[A[A

[A[A



[A[A[A[A

[A[A



[A[A[A[A

[A[A



[A[A[A[A

[A[A



[A[A[A[A

[A[A



[A[A[A[A



[A[A[A[A

[A[A



[A[A[A[A

[A[A



[A[A[A[A

[A[A



[A[A[A[A



[A[A[A[A

[A[A



[A[A[A[A



[A[A[A[A

[A[A



[A[A[A[A



[A[A[A[A

[A[A



[A[A[A[

Links Saved in links/creative311.txt


True