In [1]:
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from flask import Flask
from flask_sqlalchemy import SQLAlchemy
from newspaper import Article

In [2]:
# Flask application whose context backs the SQLAlchemy session; crawled
# articles are persisted to the SQLite database "vnexpress2.db".
app = Flask(__name__)
app.config.update(SQLALCHEMY_DATABASE_URI='sqlite:///vnexpress2.db')
db = SQLAlchemy(app)

In [3]:
class ArticleModel(db.Model):
    """ORM model for one crawled article, stored in the ``articles`` table."""
    __tablename__ = 'articles'
    # Surrogate primary key.
    id = db.Column(db.Integer, primary_key=True)
    # Article address; required and unique, and used by save_to_db to detect
    # articles that are already stored.
    url = db.Column(db.String, unique=True, nullable=False)
    # Headline, summary and body text as extracted by get_article_info.
    title = db.Column(db.String)
    description = db.Column(db.String)
    page_contents = db.Column(db.Text)
    # Section label supplied by the caller when saving (e.g. 'Khoa học').
    category = db.Column(db.String)
    # Kept as the raw text scraped from the page; it is not parsed into a date.
    publish_date = db.Column(db.String)
    # Author name scraped from the page, or NULL when none was found.
    author = db.Column(db.String)

# Flask-SQLAlchemy needs an active application context to reach the database.
# create_all() issues CREATE TABLE for the models defined on `db`.
with app.app_context():
    db.create_all()

In [4]:
def get_article_links(url, timeout=10):
    """Collect article URLs from a listing page.

    Fetches ``url``, looks at every ``<h3 class="title-news">`` heading and
    takes the ``href`` of the first link found inside each one.

    Args:
        url: Address of the listing page to scan (the notebook passes
            VnExpress category pages).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        A list of absolute article URLs in page order, without duplicates.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of scraping an error page and returning [].
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')

    links = []
    seen = set()
    for heading in soup.find_all('h3', class_='title-news'):
        a_tag = heading.find('a', href=True)
        if not a_tag:
            continue
        # urljoin resolves root-relative ("/x"), page-relative ("x") and
        # protocol-relative ("//host/x") hrefs against the listing page;
        # hrefs that are already absolute keep their own host.
        full_url = urljoin(url, a_tag['href'].strip())
        # The same article can be linked more than once on a page; keep the
        # first occurrence so it is not downloaded twice.
        if full_url not in seen:
            seen.add(full_url)
            links.append(full_url)
    return links

In [5]:
def get_article_info(url):
    """Download one article and extract the fields that get stored.

    The title, meta description and body text come from newspaper's
    ``Article``; the publish date and author are scraped from the downloaded
    HTML with BeautifulSoup.

    Args:
        url: Address of the article page.

    Returns:
        A dict with the keys ``url``, ``title``, ``description``,
        ``page_contents``, ``publish_date`` and ``author``. ``publish_date``
        and ``author`` are ``None`` when the corresponding tag is not found.

    Raises:
        Whatever ``Article.download()`` / ``Article.parse()`` raise on
        failure; nothing is caught here.
    """
    article = Article(url, language='vi')
    article.download()
    article.parse()

    soup = BeautifulSoup(article.html, 'html.parser')

    # Publish date: stripped text of the first <span class="date">.
    date_tag = soup.find('span', class_='date')
    publish_date = date_tag.text.strip() if date_tag else None

    def _is_right_aligned(style):
        # Inline CSS may be written "text-align:right" or "text-align: right";
        # drop whitespace and case before comparing so both spellings match.
        # `style` is None for tags that have no style attribute.
        return bool(style) and 'text-align:right' in ''.join(style.split()).lower()

    # Author: the <strong> inside the first right-aligned <p class="Normal">.
    author_tag = soup.find('p', class_='Normal', style=_is_right_aligned)
    strong_tag = author_tag.find('strong') if author_tag else None
    author = strong_tag.text.strip() if strong_tag else None

    return {
        'url': url,
        'title': article.title,
        'description': article.meta_description,
        'page_contents': article.text,
        'publish_date': publish_date,
        'author': author
    }


In [6]:
def save_to_db(article_data, category='Khoa học'):
    """Insert an article row unless one with the same URL is already stored.

    Args:
        article_data: Mapping with the required keys ``url``, ``title``,
            ``description`` and ``page_contents``; ``publish_date`` and
            ``author`` are optional and default to ``None``.
        category: Section label written to the ``category`` column.
    """
    with app.app_context():
        already_stored = ArticleModel.query.filter_by(url=article_data['url']).first()
        if already_stored:
            # Nothing to do: this URL was saved by an earlier run.
            return

        # Required fields are read with [] so a missing key raises KeyError.
        required = {
            key: article_data[key]
            for key in ('url', 'title', 'description', 'page_contents')
        }
        record = ArticleModel(
            **required,
            category=category,
            publish_date=article_data.get('publish_date'),
            author=article_data.get('author'),
        )
        db.session.add(record)
        db.session.commit()

In [7]:
def crawl_and_save_articles(article_links, category='Khoa học', delay=1):
    """Download every article in ``article_links`` and store it in the database.

    Each URL is processed independently: if fetching or saving one article
    fails, the error is printed and the crawl continues with the next URL.

    Args:
        article_links: Iterable of article URLs.
        category: Section label passed on to ``save_to_db``.
        delay: Seconds to pause after each article, whether it succeeded or
            failed, so the site is not hit back-to-back. A falsy value
            disables the pause.
    """
    # Materialise once so generators work and the total is computed one time.
    urls = list(article_links)
    total = len(urls)
    for idx, url in enumerate(urls, start=1):
        try:
            print(f"[{idx}/{total}] Đang xử lý: {url}")
            article_data = get_article_info(url)
            save_to_db(article_data, category=category)
        except Exception as e:
            # Best-effort crawl: report the failure and move on.
            print(f"Lỗi khi xử lý {url}: {e}")
        # Throttle outside the try block so failed requests are rate-limited
        # too; otherwise a run of errors would hammer the server.
        if delay:
            time.sleep(delay)

In [None]:
# Example run: collect the links on the "Khoa học" (Science) listing page,
# then download and store each article under that category.
khoa_hoc_url = 'https://vnexpress.net/khoa-hoc'
article_links = get_article_links(khoa_hoc_url)
crawl_and_save_articles(article_links, category='Khoa học')

In [None]:
# Add the "Giáo dục" (Education) section: same crawl, different listing page
# and category label.
giao_duc_url = 'https://vnexpress.net/giao-duc'
edu_links = get_article_links(giao_duc_url)
crawl_and_save_articles(edu_links, category='Giáo dục')

In [None]:
# Crawl the "Pháp luật" (Law) section and store its articles.
phap_luat_url = 'https://vnexpress.net/phap-luat'
law_links = get_article_links(phap_luat_url)
crawl_and_save_articles(law_links, category='Pháp luật')

In [None]:
# Crawl the "Giải trí" (Entertainment) section and store its articles.
giai_tri_url = 'https://vnexpress.net/giai-tri'
ent_links = get_article_links(giai_tri_url)
crawl_and_save_articles(ent_links, category='Giải trí')