In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
def scrape_vnexpress_health(url):
    """Scrape every article linked from one listing page.

    Fetches ``url``, locates each ``<h3 class="title-news">`` headline,
    follows the headline's first link and collects the dict produced by
    ``scrape_article`` for it.

    Args:
        url: Address of the listing page to scrape.

    Returns:
        A list of article dicts (possibly empty), or ``None`` when the
        listing page itself could not be retrieved.
    """
    # Function-scope import so this block does not depend on other edits.
    from urllib.parse import urljoin

    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=30)
    except requests.exceptions.RequestException as exc:
        print(f"Failed to retrieve page {url}. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve page. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    articles = []

    for heading in soup.find_all('h3', class_='title-news'):
        anchor = heading.a
        href = anchor.get('href') if anchor is not None else None
        if not href:
            # Headline without a usable link: nothing to follow.
            continue

        # Resolve relative links against the listing page; absolute links
        # pass through unchanged.
        article_url = urljoin(url, href)
        try:
            article = scrape_article(article_url)
        except requests.exceptions.RequestException as exc:
            # One unreachable article must not abort the rest of the page.
            print(f"Failed to retrieve article {article_url}. Error: {exc}")
            continue

        if article:
            articles.append(article)

    return articles

def _meta_content(soup, name):
    """Return the ``content`` attribute of ``<meta name=...>``, or ``""``."""
    tag = soup.find('meta', attrs={'name': name})
    if tag is None:
        return ""
    return tag.get('content', "")


def _stripped_text(tag):
    """Return the whitespace-stripped text of ``tag``, or ``""`` if it is None."""
    return tag.text.strip() if tag is not None else ""


def scrape_article(url):
    """Download one article page and extract its main fields.

    Every field falls back to an empty string when the corresponding element
    is absent from the page, so a partially matching page still yields a
    record.

    Args:
        url: Address of the article page.

    Returns:
        A dict with the keys ``url``, ``title``, ``keywords``,
        ``description``, ``content``, ``len`` (character count of
        ``content``) and ``author``; or ``None`` when the page could not be
        retrieved.
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=30)
    except requests.exceptions.RequestException as exc:
        print(f"Failed to retrieve article {url}. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve article {url}. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Explicit missing-element checks replace the former bare ``except:``
    # clauses, which also swallowed unrelated errors and KeyboardInterrupt.
    title = _stripped_text(soup.find('h1', class_='title-detail'))
    keywords = _meta_content(soup, 'keywords')
    description = _meta_content(soup, 'description')

    # Body text is the concatenation of all <p class="Normal"> paragraphs.
    content = ' '.join(
        p.text.strip() for p in soup.find_all('p', class_='Normal')
    )
    author = _stripped_text(soup.find('p', class_='author'))

    return {
        'url': url,
        'title': title,
        'keywords': keywords,
        'description': description,
        'content': content,
        'len': len(content),
        'author': author,
    }
def main():
    """Crawl listing pages 1 through 20 and save the articles to a CSV file.

    Each page's articles are gathered via ``scrape_vnexpress_health``. When
    at least one article was collected, the records are written to
    ``vnexpress_health_articles.csv``; otherwise a warning is printed and no
    file is written.
    """
    url_prefix = 'https://vnexpress.net/suc-khoe-p'
    listing_urls = (f'{url_prefix}{number}' for number in range(1, 21))

    collected = []
    for page_url in listing_urls:
        print(f"Scraping: {page_url}")
        # A failed or empty page contributes nothing.
        collected += scrape_vnexpress_health(page_url) or []

    if not collected:
        print('⚠️ Không có bài viết nào được lấy.')
        return

    frame = pd.DataFrame(collected)
    frame.to_csv('vnexpress_health_articles.csv', index=False, encoding='utf-8-sig')
    print('✅ Đã lưu thành công vào file CSV!')


# Run the scraper only when this file is executed directly, not when it is
# imported as a module.
if __name__ == '__main__':
    main()
    


Scraping: https://vnexpress.net/suc-khoe-p1
Scraping: https://vnexpress.net/suc-khoe-p2
Scraping: https://vnexpress.net/suc-khoe-p3
Scraping: https://vnexpress.net/suc-khoe-p4
Scraping: https://vnexpress.net/suc-khoe-p5
Scraping: https://vnexpress.net/suc-khoe-p6
Scraping: https://vnexpress.net/suc-khoe-p7
Scraping: https://vnexpress.net/suc-khoe-p8
Scraping: https://vnexpress.net/suc-khoe-p9
Scraping: https://vnexpress.net/suc-khoe-p10
Scraping: https://vnexpress.net/suc-khoe-p11
Scraping: https://vnexpress.net/suc-khoe-p12
Scraping: https://vnexpress.net/suc-khoe-p13
Scraping: https://vnexpress.net/suc-khoe-p14
Scraping: https://vnexpress.net/suc-khoe-p15
Scraping: https://vnexpress.net/suc-khoe-p16
Scraping: https://vnexpress.net/suc-khoe-p17
Scraping: https://vnexpress.net/suc-khoe-p18
Scraping: https://vnexpress.net/suc-khoe-p19
Scraping: https://vnexpress.net/suc-khoe-p20
✅ Đã lưu thành công vào file CSV!
