In [2]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

def get_chinadaily_news():
    """Scrape headline links from China Daily's business sections.

    Fetches each section page under
    ``https://www.chinadaily.com.cn/business``, takes the first heading link
    (``h3``/``h4``/``h2``) found inside every candidate ``div``, removes
    duplicate links and writes the result to ``Chinadaily_All_News.csv`` in
    the current working directory. Progress messages are printed to stdout.

    A section whose HTTP request fails is reported and skipped; it does not
    abort the remaining sections.

    Returns:
        pandas.DataFrame: One row per unique link, with the columns
        ``title``, ``link`` and ``section``. If nothing was scraped the
        frame is empty (it still carries those columns) and no CSV file is
        written.
    """
    # Imported locally so this function is self-contained with respect to the
    # one standard-library helper it adds.
    from urllib.parse import urljoin

    base_url = 'https://www.chinadaily.com.cn/business'
    sections = [
        '/economy',
        '/companies',
        '/biz_industries',
        '/tech',
    ]
    columns = ['title', 'link', 'section']
    # Substrings that mark a <div> class as a potential news container.
    class_markers = ('mb10', 'box', 'Block', 'listBox', 'leftBox')

    def is_news_container(css_class):
        return bool(css_class) and any(
            marker in css_class for marker in class_markers
        )

    all_news = []
    for section in sections:
        url = base_url + section
        print(f"Scraping section: {section}")
        try:
            # requests applies no timeout by default; without one a stalled
            # connection would block the whole scrape indefinitely.
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching {url}: {e}")
            continue

        parsed_content = bs(response.content, 'html.parser')
        news_items = parsed_content.find_all('div', class_=is_news_container)
        print(f"Found {len(news_items)} potential news items in {section}")

        for item in news_items:
            title_elem = item.find(['h3', 'h4', 'h2'])
            if not title_elem:
                continue
            title_link = title_elem.find('a', href=True)
            if not title_link:
                continue

            href = title_link['href'].strip()
            if not href:
                # An empty href would resolve to the section page itself.
                continue
            # Resolve against the page that was fetched: this handles
            # protocol-relative ('//host/..'), root-relative ('/a/..') and
            # document-relative links, none of which can be built correctly
            # by string-concatenating onto base_url.
            news_link = urljoin(url, href)
            if not news_link.startswith(('http://', 'https://')):
                # Skip javascript:, mailto: and similar non-article links.
                continue

            all_news.append({
                'title': title_link.text.strip(),
                'link': news_link,
                'section': section.replace('/', ''),
            })
        print(f"Added {len(all_news)} total items so far")

    if not all_news:
        print("No news found")
        # Keep the same schema as the success path so callers can rely on
        # the column names even when the frame is empty.
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(all_news, columns=columns)
    df = df.drop_duplicates(subset=['link'])
    print(f"Total unique news items found: {len(df)}")
    df.to_csv('Chinadaily_All_News.csv', index=False)
    return df
# Entry point: run the scraper when this code is executed as the main module
# (which is also the case when the cell is run inside a notebook).
if __name__ == '__main__':
    # news_df is bound at module level, so it stays available after the run.
    news_df = get_chinadaily_news()
    # Print the success summary only when at least one row came back.
    if not news_df.empty:
        print(f"Successfully scraped {len(news_df)} news items.")

Scraping section: /economy
Found 20 potential news items in /economy
Added 20 total items so far
Scraping section: /companies
Found 20 potential news items in /companies
Added 40 total items so far
Scraping section: /biz_industries
Found 20 potential news items in /biz_industries
Added 60 total items so far
Scraping section: /tech
Found 20 potential news items in /tech
Added 80 total items so far
Total unique news items found: 80
Successfully scraped 80 news items.
