In [1]:
import os
import csv
import time
from datetime import datetime, timezone
from io import BytesIO

import requests
from googletrans import Translator
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
import logging

class LogCustomFormatter(logging.Formatter):
    """Formatter that wraps each log line in an ANSI colour chosen by level.

    DEBUG and INFO are grey, WARNING yellow, ERROR red and CRITICAL bold red.
    Records with a level that is not listed in ``FORMATS`` (custom levels) are
    rendered with the same layout but without any colour codes.
    """

    # ANSI escape sequences (some are unused by FORMATS but kept as a palette).
    grey = "\x1b[0;37m"
    green = "\x1b[1;32m"
    bold_red = "\x1b[31;1m"
    yellow = "\x1b[1;33m"
    red = "\x1b[1;31m"
    purple = "\x1b[1;35m"
    blue = "\x1b[1;34m"
    light_blue = "\x1b[1;36m"
    reset = "\x1b[0m"
    blink_red = "\x1b[5m\x1b[1;31m"

    # Layout shared by every level. Stored under its own name so it is not
    # shadowed by the format() method defined further down in the class body.
    BASE_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s (%(filename)s:%(lineno)d)'

    # Level number -> colourised format string.
    FORMATS = {
        logging.DEBUG: grey + BASE_FORMAT + reset,
        logging.INFO: grey + BASE_FORMAT + reset,
        logging.WARNING: yellow + BASE_FORMAT + reset,
        logging.ERROR: red + BASE_FORMAT + reset,
        logging.CRITICAL: bold_red + BASE_FORMAT + reset
    }

    def __init__(self, *args, **kwargs):
        """Accept the same arguments as ``logging.Formatter``.

        One delegate formatter per level is built here, once, instead of on
        every call to format().
        """
        super().__init__(*args, **kwargs)
        self._formatters = {
            level: logging.Formatter(fmt) for level, fmt in self.FORMATS.items()
        }
        # Used for levels missing from FORMATS: same layout, no colour.
        self._fallback_formatter = logging.Formatter(self.BASE_FORMAT)

    def format(self, record):
        """Return ``record`` rendered with the formatter matching its level."""
        formatter = self._formatters.get(record.levelno, self._fallback_formatter)
        return formatter.format(record)
    
def setup_logger(logger_name, output_dir = None):
    """Create (or reset) a named logger that writes colourised lines.

    Args:
        logger_name: Name passed to ``logging.getLogger``.
        output_dir: Optional directory. When given, the logger also appends to
            ``<output_dir>/crawl_log.log``; the directory is created if missing.

    Returns:
        The configured ``logging.Logger`` with level DEBUG.
    """
    logger = logging.getLogger(logger_name)

    # Re-running this function (e.g. re-executing a notebook cell) must not
    # stack handlers. Close the old ones too, otherwise a previous FileHandler
    # keeps its log file open.
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
        handler.close()

    logger.setLevel(logging.DEBUG)

    formatter = LogCustomFormatter()

    # Console handler.
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    if output_dir:
        # FileHandler opens the file immediately and fails if the directory
        # does not exist yet.
        os.makedirs(output_dir, exist_ok=True)
        # Explicit encoding so non-ASCII log messages do not depend on the
        # platform's default locale.
        # NOTE: the file receives the same ANSI colour codes as the console.
        fh = logging.FileHandler(
            os.path.join(output_dir, 'crawl_log.log'), encoding='utf-8')
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(formatter)
        logger.addHandler(fh)
    return logger


## Crawl data

In [3]:
import glob

def is_today(date_str):
    """Tell whether ``date_str`` is today's local date written as YYYY-MM-DD."""
    return datetime.today().date().isoformat() == date_str

def get_newest_folder_name(directory, pattern='*-*-*'):
    """Return the folder path to use for today's crawl output.

    Sub-folders of ``directory`` whose name matches ``pattern`` are inspected.
    If the most recently created one is named after today's date
    (YYYY-MM-DD) its path is returned, so a second run on the same day reuses
    it. In every other case the path ``<directory>/<today>`` is returned. The
    folder itself is not created here.

    Args:
        directory: Parent directory that holds the dated folders.
        pattern: Glob pattern used to find candidate folders.

    Returns:
        Path (str) of the folder for today's data.
    """
    today = datetime.today().strftime('%Y-%m-%d')
    todays_folder = f'{directory}/{today}'

    # Only real directories are candidates; a stray file such as "a-b-c.txt"
    # also matches the default pattern.
    folders = [
        path
        for path in glob.glob(os.path.join(directory, pattern), recursive=True)
        if os.path.isdir(path)
    ]
    if not folders:
        return todays_folder

    current_newest_folder = max(folders, key=os.path.getctime)
    # basename() instead of split('/') so the date is extracted correctly
    # regardless of the platform's path separator.
    current_newest_folder_date = os.path.basename(
        os.path.normpath(current_newest_folder))

    # Compare against the date computed above, so a single "today" value is
    # used throughout the call.
    if current_newest_folder_date == today:
        return current_newest_folder
    return todays_folder


In [4]:
# All crawl output lives under <current working directory>/data.
root_dir = f'{os.getcwd()}/data'

# Dated sub-folder for this run (today's folder is reused when it exists).
folder_name = get_newest_folder_name(root_dir)

# Where the downloaded images and the caption table are stored.
images_dir = os.path.join(folder_name, 'images')
captions_path = os.path.join(folder_name, 'captions.csv')

# Prefix written in front of each image file name in the captions CSV.
img_path = 'images/'

os.makedirs(root_dir, exist_ok=True)
os.makedirs(images_dir, exist_ok=True)

# Seconds to pause after browser actions; also the explicit-wait timeout.
WEBDRIVER_DELAY_TIME_INT = 2

In [5]:
class CrawlData:
    """Selenium crawler that saves captioned images from kienthuckhoahoc.org.

    For every category in ``CATEGORIES`` and every page index in
    ``[page_start, page_end)`` the listing page is opened, each linked article
    is visited, and every ``div.img-box > img`` element with a usable ``src``
    and ``alt`` is downloaded. The ``alt`` text is translated to English and
    stored next to the image file name in the captions CSV.

    Relies on the notebook-level names ``WEBDRIVER_DELAY_TIME_INT``,
    ``images_dir``, ``captions_path`` and ``img_path``.
    """

    # Site sections that are crawled, in this order.
    CATEGORIES = (
        'kh-vu-tru',
        'anh-dep',
        'moi-truong',
        'moi-truong/thien-nhien',
        'con-trung-vi-khuan',
        'cong-nghe/phat-minh-kh',
    )
    # Upper bound in seconds for one image download; without a timeout a
    # stalled connection would block the crawl forever.
    REQUEST_TIMEOUT_SECONDS = 30
    # Image modes that can be written as JPEG without conversion.
    JPEG_SAFE_MODES = ('RGB', 'L', 'CMYK')

    def __init__(self, name, logger, page_start, page_end, lock):
        """Start a headless Chrome session.

        Args:
            name: Label of this crawler (stored, used by callers for logging).
            logger: ``logging.Logger`` used for progress and error messages.
            page_start: First listing page index to crawl (inclusive).
            page_end: Listing page index at which to stop (exclusive).
            lock: Lock object held while one image is processed and saved.
        """
        chrome_options = webdriver.ChromeOptions()

        # Headless mode is requested through the '--headless' argument only;
        # the former `chrome_options.headless = True` assignment was redundant.
        for argument in (
            '--disable-gpu',
            '--headless',
            '--no-sandbox',
            '--allow-running-insecure-content',
            '--accept-insecure-certs',
            '--incognito',
        ):
            chrome_options.add_argument(argument)

        self.translator = Translator()

        self.name = name
        self.logger = logger
        self.page_start = page_start
        self.page_end = page_end
        self.driver = webdriver.Chrome(options=chrome_options)
        self.driver.implicitly_wait(4)
        self.wait = WebDriverWait(self.driver, WEBDRIVER_DELAY_TIME_INT)
        self.page_idx = page_start
        # Set by run(); defined here so log messages never hit a missing
        # attribute.
        self.category = None
        self.lock = lock

    def run(self):
        """Crawl every category/page assigned to this instance.

        The browser is shut down when the crawl finishes or fails, so the
        instance cannot be run a second time.
        """
        try:
            for category in self.CATEGORIES:
                self.category = category
                for page_idx in range(self.page_start, self.page_end):
                    self.page_idx = page_idx
                    news_page_urls = self._get_main_page_information(category, page_idx)
                    for news_page_url in news_page_urls:
                        self._get_news_page_information(news_page_url)
                        time.sleep(WEBDRIVER_DELAY_TIME_INT)
                        self.driver.back()
                        time.sleep(WEBDRIVER_DELAY_TIME_INT)
        finally:
            # Without this every crawler leaves a Chrome process behind.
            self.close()

    def close(self):
        """Quit the browser session; safe to call more than once."""
        if self.driver is not None:
            try:
                self.driver.quit()
            finally:
                self.driver = None

    def _get_main_page_information(self, category, page_idx):
        """Open one listing page and return the article URLs found on it."""
        main_url = f'https://kienthuckhoahoc.org/{category}/page{page_idx}'
        self.driver.get(main_url)
        time.sleep(WEBDRIVER_DELAY_TIME_INT)
        news_tags = self.driver.find_elements(By.XPATH, './/a[@class="title"]')

        # Anchors without an href yield None; those cannot be visited.
        hrefs = (news_tag.get_attribute('href') for news_tag in news_tags)
        return [href for href in hrefs if href]

    def _get_news_page_information(self, url):
        """Open one article and process each image inside an ``img-box``."""
        self.driver.get(url)
        time.sleep(WEBDRIVER_DELAY_TIME_INT)
        img_box_tags = self.driver.find_elements(
            By.XPATH, './/div[@class="img-box"]/img')

        # Captions already handled on this article, to skip repeated images.
        img_box_captions_set = set()
        for img_box_tag in img_box_tags:
            # Pass the element itself; looking up "the first <img> on the
            # page" again would process the same image on every iteration.
            self._get_image_information(img_box_captions_set, img_box_tag)

    def _get_image_information(self, img_box_captions_set, img_tag=None):
        """Save one image unless its caption was already seen on this article.

        Args:
            img_box_captions_set: Captions processed so far; updated in place.
            img_tag: Image element to process. When omitted, the first visible
                ``<img>`` on the current page is used.

        Errors are logged and never propagate to the caller.
        """
        # `with` guarantees the lock is released on any exit path.
        with self.lock:
            try:
                if img_tag is None:
                    img_tag = self.wait.until(
                        EC.visibility_of_element_located((By.TAG_NAME, 'img')))

                img_caption = img_tag.get_attribute('alt')

                if img_caption not in img_box_captions_set:
                    img_box_captions_set.add(img_caption)
                    img_url = img_tag.get_attribute('src')

                    if self._is_downloadable(img_url, img_caption):
                        self._save_image_information(img_url, img_caption)
                    else:
                        self.logger.error(
                            f'{self.category} - Page: {self.page_idx} / {(self.page_end - 1)} - Failed to save image: Image not found')

            except Exception as e:
                self.logger.error(
                    f'{self.category} - Page: {self.page_idx} / {(self.page_end - 1)} - Failed to save image: {e}')

        # Pause between images after the lock has been released.
        time.sleep(WEBDRIVER_DELAY_TIME_INT)

    @staticmethod
    def _is_downloadable(img_url, img_caption):
        """Return True when both values are non-empty and the URL is not a GIF."""
        # Emptiness is checked first so a missing src never gets sliced.
        if not img_url or not img_caption:
            return False
        return not img_url.lower().endswith('gif')

    def _save_image_information(self, img_url, img_caption):
        """Download the image, translate its caption and record both.

        The image is written to ``images_dir`` as JPEG and one row
        ``<img_path + file name>|<English caption>`` is appended to
        ``captions_path``.

        Raises:
            Exception: download, decoding, translation or file errors are
                passed on to the caller.
        """
        img_url_resp = requests.get(img_url, timeout=self.REQUEST_TIMEOUT_SECONDS)
        # An HTTP error page is not an image; fail with a clear error instead
        # of a confusing decode failure.
        img_url_resp.raise_for_status()

        img = Image.open(BytesIO(img_url_resp.content))
        # Modes such as P, RGBA or LA cannot be saved as JPEG.
        if img.mode not in self.JPEG_SAFE_MODES:
            img = img.convert('RGB')

        translation = self.translator.translate(img_caption, dest='en')
        img_caption = translation.text

        img_name = f'IMG_{int(datetime.now(timezone.utc).timestamp() * 1000)}.jpg'
        img_save_path = os.path.join(images_dir, img_name)

        # Write the image first, so the CSV never lists a file that failed to
        # save.
        img.save(img_save_path)

        write_header = (
            not os.path.exists(captions_path)
            or os.path.getsize(captions_path) == 0
        )
        with open(captions_path, 'a', encoding='utf8', newline='') as f:
            csv_writer = csv.writer(f, delimiter='|')
            if write_header:
                csv_writer.writerow(['image', 'caption'])
            csv_writer.writerow([img_path + img_name, img_caption])

        self.logger.info(
            f'{self.category} - Page: {self.page_idx} / {(self.page_end - 1)} - Success saved image: {img_name} - {img_caption}')
    

In [6]:
import logging

from threading import Thread, Lock

class CrawlThreadGroup:
    """Starts several CrawlThread workers on consecutive page ranges."""

    def __init__(self, num_thread = 1, page_size = 1):
        """
        Args:
            num_thread: Number of worker threads to start.
            page_size: Number of listing pages assigned to each thread.
        """
        self.num_thread = num_thread
        self.page_size = page_size

    def _page_ranges(self, first_page=1):
        """Return one half-open ``(page_start, page_end)`` pair per thread."""
        return [
            (first_page + i * self.page_size, first_page + (i + 1) * self.page_size)
            for i in range(self.num_thread)
        ]

    def run(self):
        """Start all workers and block until every one of them has finished.

        A worker that cannot be created or started is logged and skipped; its
        page range is not reassigned.
        """
        # One lock for the whole group: all workers append to the same
        # captions file, so a lock per thread would not serialise anything.
        shared_lock = Lock()
        running_threads = []
        for i, (page_start, page_end) in enumerate(self._page_ranges()):
            try:
                thread = CrawlThread(f'thread {i}', page_start, page_end, lock=shared_lock)
                thread.start()
                running_threads.append(thread)
            except Exception:
                # Keep the traceback instead of printing only the message.
                logging.getLogger(__name__).exception(
                    'Could not start thread %d (pages %d-%d)', i, page_start, page_end - 1)

        for running_thread in running_threads:
            running_thread.join()
            running_thread.stop()

class CrawlThread(Thread):
    """Worker thread that runs one CrawlData crawl over a page range."""

    def __init__(self, name, page_start, page_end, lock=None):
        """
        Args:
            name: Thread name, also used as the logger name.
            page_start: First listing page index (inclusive).
            page_end: Listing page index at which to stop (exclusive).
            lock: Lock handed to CrawlData. Pass the same object to every
                thread to serialise them; a private lock is created when
                omitted.
        """
        super().__init__(name=name)
        self.logger = setup_logger(name)
        self.logger.setLevel(logging.INFO)
        self.page_start = page_start
        self.page_end = page_end
        self.lock = lock if lock is not None else Lock()
        self.should_run = True

    def run(self):
        """Run the crawl once, unless stop() was called before the start."""
        self.logger.info(self.name + " is starting...")
        try:
            if self.should_run:
                process = CrawlData(self.name, self.logger, self.page_start, self.page_end, self.lock)
                process.run()
                self.logger.info(self.name + " DONE!")
        except Exception:
            # Log through the thread's own logger so the failure is attributed
            # to this worker.
            self.logger.exception(self.name + " crashed")
        finally:
            # Also reset on failure, otherwise is_running() would report True
            # for a dead thread.
            self.should_run = False

    def stop(self):
        """Mark the thread as stopped; a crawl already in progress is not interrupted."""
        self.logger.info(self.name + " is leaving...")
        self.should_run = False  # Method to stop the thread externally

    def is_running(self):
        """Return the current value of the ``should_run`` flag."""
        return self.should_run


## Start Crawl Data

In [7]:
# Crawl settings: NUM_THREAD workers, each given PAGE_SIZE consecutive pages.
NUM_THREAD = 3
PAGE_SIZE = 15

# Blocks until every worker thread has finished.
crawl_thread_group = CrawlThreadGroup(num_thread=NUM_THREAD, page_size=PAGE_SIZE)
crawl_thread_group.run()

[0;37m2024-02-06 21:29:47,516 - thread 0 - INFO - thread 0 is starting... (213778126.py:40)[0m
[0;37m2024-02-06 21:29:47,517 - thread 1 - INFO - thread 1 is starting... (213778126.py:40)[0m
[0;37m2024-02-06 21:29:47,518 - thread 2 - INFO - thread 2 is starting... (213778126.py:40)[0m
[0;37m2024-02-06 21:29:54,405 - thread 0 - INFO - thread 0 - kh-vu-tru - Page: 1 / 15 - Success saved image: IMG_1707229794401.jpg - The giant black hole is aggressive, preventing its host galaxy from giving birth to new stars (94257464.py:118)[0m
[0;37m2024-02-06 21:29:54,626 - thread 2 - INFO - thread 2 - kh-vu-tru - Page: 31 / 45 - Success saved image: IMG_1707229794614.jpg - Simulation of Australia's first robotic explorer on the surface of the Moon. (94257464.py:118)[0m
[0;37m2024-02-06 21:30:00,153 - thread 1 - INFO - thread 1 - kh-vu-tru - Page: 16 / 30 - Success saved image: IMG_1707229800138.jpg - Scientists discovered there is atomic oxygen in the atmosphere of Venus. (94257464.py:118)