## **1. Install dependencies and import necessary libraries**

In [None]:
%%shell
# Install Chromium, chromedriver and Selenium for the crawler below.
#
# Ubuntu no longer distributes chromium-browser outside of snap, so the
# packages are pulled from the Debian buster repositories instead.
#
# Proposed solution: https://askubuntu.com/questions/1204571/how-to-install-chromium-without-snap

# Add debian buster
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# Add keys
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

# Export each key into the keyring file referenced by "signed-by" above.
# --yes lets gpg overwrite the keyring files when this cell is re-run
# instead of stopping at an "Overwrite?" prompt.
apt-key export 77E11517 | gpg --yes --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --yes --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --yes --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# Prefer debian repo for chromium* packages only
# Note the double-blank lines between entries
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500


Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300


Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF

# Install chromium and chromium-driver
# -y answers the confirmation prompt so the cell can run unattended.
apt-get update
apt-get install -y chromium chromium-driver

# Install selenium
pip install selenium

Executing: /tmp/apt-key-gpghome.M4JcDNYym6/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
gpg: key DCC9EFBF77E11517: public key "Debian Stable Release Key (10/buster) <debian-release@lists.debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Executing: /tmp/apt-key-gpghome.g8ECIVI3xB/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
gpg: key DC30D7C23CBBABEE: public key "Debian Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Executing: /tmp/apt-key-gpghome.0kecAiLh5s/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A
gpg: key 4DFAB270CAA96DFA: public key "Debian Security Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Get:1 http://deb.debian.org/debian buster InRelease [122 kB]
Get:2 http://deb.debian.org/debian bust



In [11]:
import os
import random
import shutil
import time
from io import BytesIO

import pandas as pd
import requests
from PIL import Image
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm

## **2. Crawl**

In [None]:
# Initialize google chrome browser
def initialize_browser():
    """Start a headless Chrome session and return its driver.

    Returns:
        The ``webdriver.Chrome`` instance, configured with the
        ``--headless=new`` and ``--no-sandbox`` command-line switches.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless=new')
    chrome_options.add_argument('--no-sandbox')
    # Pass the options by keyword only. Selenium 4.10+ removed the positional
    # ``executable_path`` parameter, so a leading 'chromedriver' string now
    # collides with ``options`` and raises TypeError. Older releases default
    # the executable to "chromedriver" on PATH, so omitting it is equivalent.
    driver = webdriver.Chrome(options=chrome_options)
    return driver

# Create a folder for storing articles
def create_folder(folder_path):
    """Reset ``folder_path`` to an empty directory.

    Whatever currently lives at ``folder_path`` is removed first (errors
    during removal are ignored), then the directory is created again,
    including any missing parent directories.

    Args:
        folder_path: Location of the directory to (re)create.
    """
    # Step 1: drop the previous contents, tolerating a missing folder.
    shutil.rmtree(folder_path, ignore_errors=True)
    # Step 2: make sure the directory exists afterwards.
    os.makedirs(folder_path, exist_ok=True)

# Access the table page and get article URLs
def get_article_urls(driver, page_url):
    """Open a listing page and collect the ``href`` of each article link.

    Args:
        driver: Selenium driver used to load the page.
        page_url: Address of the listing page to open.

    Returns:
        list: One ``href`` attribute value per anchor matched by the
        article XPath, in the order Selenium returned the anchors.
    """
    driver.get(page_url)

    link_xpath = '//div[@id="automation_TV0"]/div/article/div/a'
    hrefs = []
    for anchor in driver.find_elements(By.XPATH, link_xpath):
        hrefs.append(anchor.get_attribute('href'))
    return hrefs

# Access the article page and get article content
def get_article_content(driver, url):
    """Load one article page and extract its text fields.

    Args:
        driver: Selenium driver used to load the page.
        url: Address of the article; it is also printed as progress output.

    Returns:
        ``(title, description, content, author)`` as stripped strings, or
        ``None`` when the page has no element matching ``//section[5]``.
        ``content`` is the text of all ``p.Normal`` paragraphs joined with a
        single space. ``author`` is ``"Can't find author"`` when none of the
        known author locations yields text.

    Raises:
        NoSuchElementException: If the title or description element is
            missing on a page that does have ``//section[5]``.
    """
    driver.get(url)
    print(url)
    time.sleep(0.1)

    main_content_path = '//section[5]'
    try:
        main_content_tag = driver.find_element(By.XPATH, main_content_path)
    except NoSuchElementException:
        # Only a missing element means "skip this page". Interrupts and
        # other driver failures must propagate instead of being swallowed.
        return None

    # NOTE: every XPath below starts with `//`, so it is evaluated from the
    # document root even though it is called on `main_content_tag`. The
    # section element acts as a presence check, not as a search scope.
    title_path = '//div[@class="sidebar-1"]/h1'
    title = main_content_tag.find_element(By.XPATH, title_path).text.strip()

    description_path = '//div[@class="sidebar-1"]/p'
    description = main_content_tag.find_element(By.XPATH, description_path).text.strip()

    # The author can appear in either of two places depending on the layout.
    author_path = ['//div[@class="sidebar-1"]/article/p[@style="text-align:right;"]/strong',
                   '//div[@class="sidebar-1"]/article/p[@class="author_mail"]/strong']

    author = ''
    for path in author_path:
        try:
            candidate = main_content_tag.find_element(By.XPATH, path).text.strip()
        except NoSuchElementException:
            continue
        # Keep the last location that has text. An empty match must not
        # erase an author already found at an earlier location.
        if candidate:
            author = candidate
    if author == '':
        author = "Can't find author"

    content_path = '//div[@class="sidebar-1"]/article/p[@class="Normal"]'
    content_tag = main_content_tag.find_elements(By.XPATH, content_path)
    content_list = [content.text.strip() for content in content_tag]
    content = ' '.join(content_list)
    return title, description, content, author

# Save article to .txt file
def save_article(article_path, content):
    """Write ``content`` to ``article_path``, replacing any existing file.

    Args:
        article_path: Destination file path.
        content: Text to store.
    """
    # Fix the encoding explicitly: the articles are Vietnamese, and the
    # platform default encoding may be unable to encode them.
    with open(article_path, 'w', encoding='utf-8') as f:
        f.write(content)

def crawl_website_data(folder_content='./vnexpress_news_corpus', number_of_pages=10):
    """Crawl the listing pages and save every article as a text file.

    The output folder is wiped and recreated first. Listing pages
    ``1..number_of_pages`` are then visited in order, and each article whose
    content can be extracted is written to ``article_NNNNN.txt``, numbered
    consecutively from 1. Articles for which ``get_article_content`` returns
    ``None`` are skipped and do not consume a number.

    Args:
        folder_content: Directory that receives the article files. Any
            existing content in it is deleted.
        number_of_pages: How many listing pages to crawl, starting at page 1.
    """
    article_id = 0

    driver = initialize_browser()
    try:
        create_folder(folder_content)

        for page in range(1, number_of_pages + 1):
            page_url = f'https://vnexpress.net/kinh-doanh/chung-khoan-p{page}'
            article_urls = get_article_urls(driver, page_url)

            for url in article_urls:
                article_content = get_article_content(driver, url)
                if article_content is None:
                    continue

                # Fields arrive as (title, description, content, author) and
                # are stored in that order, separated by blank lines.
                final_content = '\n\n'.join(article_content)

                article_id += 1
                article_filename = f'article_{article_id:05d}.txt'
                article_path = os.path.join(folder_content, article_filename)

                save_article(article_path, final_content)
    finally:
        # Always shut the browser down, even if the crawl fails part-way,
        # so no Chrome/chromedriver process is left running.
        driver.quit()

    

# Run the crawl. The output folder is wiped and recreated at the start, so
# re-running this cell discards the previously saved articles.
crawl_website_data()


In [16]:
# Recursively archive the vnexpress_news_corpus folder into vnexpress_news_corpus.zip.
!zip -r vnexpress_news_corpus.zip vnexpress_news_corpus

  adding: vnexpress_news_corpus/ (stored 0%)
  adding: vnexpress_news_corpus/article_00097.txt (deflated 54%)
  adding: vnexpress_news_corpus/article_00127.txt (deflated 54%)
  adding: vnexpress_news_corpus/article_00029.txt (deflated 54%)
  adding: vnexpress_news_corpus/article_00136.txt (deflated 60%)
  adding: vnexpress_news_corpus/article_00062.txt (deflated 56%)
  adding: vnexpress_news_corpus/article_00081.txt (deflated 54%)
  adding: vnexpress_news_corpus/article_00035.txt (deflated 54%)
  adding: vnexpress_news_corpus/article_00021.txt (deflated 55%)
  adding: vnexpress_news_corpus/article_00028.txt (deflated 57%)
  adding: vnexpress_news_corpus/article_00053.txt (deflated 53%)
  adding: vnexpress_news_corpus/article_00143.txt (deflated 51%)
  adding: vnexpress_news_corpus/article_00025.txt (deflated 56%)
  adding: vnexpress_news_corpus/article_00019.txt (deflated 54%)
  adding: vnexpress_news_corpus/article_00044.txt (deflated 56%)
  adding: vnexpress_news_corpus/article_00128