In [4]:
# !pip install selenium webdriver-manager
# !apt-get update # refresh the apt package index so the apt install below can find the package
# !apt install chromium-chromedriver
# !cp /usr/lib/chromium-browser/chromedriver /usr/bin

In [5]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import os
import requests
from tqdm import tqdm
import numpy as np

In [10]:
def driver_setup():
    """Build and return a headless Chrome WebDriver.

    The browser is started with the ``--headless``, ``--no-sandbox`` and
    ``--disable-dev-shm-usage`` switches. The value returned by
    ``ChromeDriverManager().install()`` is handed to ``Service`` so the
    driver executable does not have to be configured by hand.

    Returns:
        The ``webdriver.Chrome`` instance. The caller owns it and is
        responsible for calling ``quit()`` when done.
    """
    chrome_options = webdriver.ChromeOptions()
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
        chrome_options.add_argument(flag)

    chrome_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=chrome_options)

def get_soup(URL, driver):
    """Fetch ``URL`` and return its rendered HTML as a BeautifulSoup object.

    A plain ``requests.get`` is issued first and its status is checked; only
    when that succeeds is the page loaded in the Selenium ``driver`` and its
    ``page_source`` parsed.

    Args:
        URL: Address of the page to load.
        driver: Selenium WebDriver used to load the page.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when the status
        check fails (HTTP error status, timeout or any other
        ``requests.RequestException``). A message is printed in that case.
    """
    try:
        # Always pass a timeout: without one a stalled connection would block
        # this worker thread forever.
        r = requests.get(URL, timeout=30)
    except requests.RequestException as error:
        # Network failures follow the same "return None" contract as HTTP
        # errors instead of propagating out of the worker thread.
        print(f"Error Fetching Soup, Scraping Ended ({error})")
        return None

    if not r.ok:
        # when return HTML error such as 404
        print("Error Fetching Soup, Scraping Ended")
        return None

    driver.get(URL)
    return BeautifulSoup(driver.page_source, 'html.parser')

def get_link_from_soup(soup):
    """Collect the article URLs listed on one parsed index page.

    Looks for the ``div`` with id ``list`` and class
    ``list ect-entry-card front-page-type-index`` and gathers the ``href`` of
    every ``a`` element with class ``column_indexItem_thumb`` inside it.

    Args:
        soup: Parsed page (a BeautifulSoup document).

    Returns:
        set: The unique href values found. Empty when the container ``div``
        is missing (a message is printed in that case) or when it holds no
        matching anchors.
    """
    # find()/find_all() are the current Beautiful Soup names; findAll is a
    # deprecated alias. find() returns None when nothing matches.
    wanted_div = soup.find("div", {"id":"list",
                                   "class":"list ect-entry-card front-page-type-index"})
    if wanted_div is None:
        print("HTML element not found, returning empty set")
        return set()

    link_list = set()
    for anchor in wanted_div.find_all("a", {"class":"column_indexItem_thumb"}):
        # .get() rather than ["href"]: an anchor without an href is skipped
        # instead of raising KeyError and killing the crawl.
        wanted_article = anchor.get("href")
        if wanted_article:
            link_list.add(wanted_article)
    return link_list

def crawler(chunk, driver):
    """Crawl a run of listing pages and gather the article links on them.

    Each page number in ``chunk`` is turned into a listing-page URL, fetched
    with ``get_soup`` and scanned with ``get_link_from_soup``. Crawling stops
    at the first page that cannot be fetched or that yields no links; the
    links gathered up to that point are returned.

    Args:
        chunk: Iterable of page numbers to visit, in order.
        driver: Selenium WebDriver passed through to ``get_soup``.

    Returns:
        set: All links collected from the visited pages.
    """
    collected = set()
    for i in tqdm(chunk):
        URL = f"https://www.goo-net.com/magazine/new/page/{i}/"
        page_soup = get_soup(URL, driver)
        if not page_soup:
            # fetch failed (e.g. a 404): stop this chunk
            break

        page_links = get_link_from_soup(page_soup)
        if not page_links:
            # page loaded but the wanted tags are absent: stop this chunk
            break

        collected.update(page_links)
    return collected

def save_link_as_txt(link_list, file_path="links/goo-net.txt"):
    """Merge ``link_list`` into the link file, one URL per line.

    Links already stored in the file are kept: the file content is read,
    combined with ``link_list`` and written back without duplicates, in
    sorted order so repeated runs produce a stable file. Missing parent
    directories are created.

    Args:
        link_list: Iterable of link strings to store.
        file_path: Destination text file. Defaults to ``links/goo-net.txt``.

    Returns:
        bool: Always ``True`` once the file has been written.
    """
    merged_links = {link.strip() for link in link_list}

    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding="utf-8") as file:
            # Strip the trailing newline of every stored line; otherwise a
            # stored "url\n" never compares equal to an incoming "url".
            merged_links.update(line.strip() for line in file)
    else:
        parent_dir = os.path.dirname(file_path)
        if parent_dir:  # os.makedirs("") raises, so skip for bare file names
            os.makedirs(parent_dir, exist_ok=True)

    merged_links.discard("")  # ignore blank lines / empty entries

    with open(file_path, "w", encoding="utf-8") as file:
        file.writelines(link + "\n" for link in sorted(merged_links))
    print(f"Links Saved in {file_path}")
    return True

In [11]:
from concurrent.futures import ThreadPoolExecutor

# One headless Chrome driver per worker thread; each worker crawls its own
# contiguous slice of the listing pages 1..1760.
thread = 8
chunks = np.array_split(np.arange(1,1761), thread)
all_links = set()
drivers = []

try:
    # Drivers are created inside the try block so that a failure part-way
    # through setup still shuts down the browsers already started.
    for _ in range(thread):
        drivers.append(driver_setup())

    with ThreadPoolExecutor(max_workers=thread) as executor:
        # list() drains the results here, so an exception raised in a worker
        # surfaces inside the try block and the finally clause still runs.
        bucket = list(executor.map(crawler, chunks, drivers))
finally:
    # Always release the Chrome processes, even on error or interrupt.
    for driver in drivers:
        driver.quit()

for chunk_links in bucket:
    all_links.update(chunk_links)

save_link_as_txt(all_links)

  0%|          | 0/220 [00:00<?, ?it/s]
[A

[A[A


[A[A[A



[A[A[A[A





[A[A[A[A[A[A




  0%|          | 1/220 [00:08<30:12,  8.28s/it]





[A[A[A[A[A[A




[A[A[A[A[A


[A[A[A

[A[A
[A



  1%|          | 2/220 [00:14<25:17,  6.96s/it]





[A[A[A[A[A[A
  1%|▏         | 3/220 [00:23<28:56,  8.00s/it]

[A[A



[A[A[A[A


[A[A[A




[A[A[A[A[A





  2%|▏         | 4/220 [00:32<30:09,  8.38s/it]
[A

[A[A





[A[A[A[A[A[A




[A[A[A[A[A


[A[A[A



  3%|▎         | 6/220 [00:46<26:27,  7.42s/it]





[A[A[A[A[A[A

[A[A
[A




[A[A[A[A[A


  3%|▎         | 7/220 [00:56<29:22,  8.27s/it]





[A[A[A[A[A[A



[A[A[A[A
[A




[A[A[A[A[A


[A[A[A

[A[A





  4%|▎         | 8/220 [01:04<29:48,  8.44s/it]



[A[A[A[A
[A




[A[A[A[A[A





[A[A[A[A[A[A


  4%|▍         | 9/220 [01:14<30:43,  8.74s/it]

[A[A



[A[A[A[A





  5%|▍         | 10/220 [01:22<29:40,  

Error Fetching Soup, Scraping Ended






[A[A[A



[A[A[A[A
[A




[A[A[A[A[A

[A[A


[A[A[A



[A[A[A[A




[A[A[A[A[A
[A

[A[A


[A[A[A




[A[A[A[A[A



[A[A[A[A

[A[A
[A



[A[A[A[A




100%|██████████| 220/220 [49:12<00:00, 13.42s/it]


[A[A
[A


100%|██████████| 220/220 [49:16<00:00, 13.44s/it]


100%|██████████| 220/220 [49:20<00:00, 13.46s/it]




[A[A[A[A
100%|██████████| 220/220 [49:24<00:00, 13.47s/it]




[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



100%|██████████| 220/220 [50:05<00:00, 13.66s/it]


Links Saved in links/goo-net.txt


True