https://china.huanqiu.com/


 
### Request URL:
https://china.huanqiu.com/api/list?node=%22/e3pmh1nnq/e3pmh1obd%22,%22/e3pmh1nnq/e3pn61c2g%22,%22/e3pmh1nnq/e3pn6eiep%22,%22/e3pmh1nnq/e3pra70uk%22,%22/e3pmh1nnq/e5anm31jb%22,%22/e3pmh1nnq/e7tl4e309%22&offset=24&limit=24
### Request Method:
GET


### Payload
node=%22/e3pmh1nnq/e3pmh1obd%22,%22/e3pmh1nnq/e3pn61c2g%22,%22/e3pmh1nnq/e3pn6eiep%22,%22/e3pmh1nnq/e3pra70uk%22,%22/e3pmh1nnq/e5anm31jb%22,%22/e3pmh1nnq/e7tl4e309%22&offset=24&limit=24


### 응답 데이터 일부 예시입니다.

{
    "list": [{
    	"aid": "4IiO9qGa9i5",
    	"title": "最新版肉制品生产监督检查操作指南来了",
    	"summary": "指导基层市场监管人员熟练掌握肉制品生产企业检查要点和检查方法，切实提升监督检查水平，守稳筑牢食品安全底线。",
    	"addltype": "normal",
    	"typedata":{"audio":{"members":[]},"gallery":{"members":[]},"video":{"members":[]}},
    	"source" :{"name":"央视新闻客户端","url":"https:\/\/content-static.cctvnews.cctv.com\/snow-book\/index.html?item_id=12991606494210524639&toc_style_id=feeds_default&track_id=026217CF-94F2-433D-8F1C-91F327CDA784_743338808688&share_to=wechat"},
    	"ext_displaytime": "",
    	"ext_defertime":"",
    	"ctime": "1721647769902",
    	"xtime": "1721647769902",
    	"cover" : "",
    	"host" : "china.huanqiu.com",
		"ext-serious" : "1",
		"ext-weight" : "50"
    },{
    	"aid": "4IiNXLfI4MG",
    	"title": "人民观察｜经典与创新碰撞出吉林文旅融合“新火花”",
    	"summary": "盛夏傍晚时分，还未走到长影世纪城“山海奇妙夜”的大门，记者就被景区门前的一排花灯所吸引。",
    	"addltype": "normal",
    	"typedata":{"audio":{"members":[]},"gallery":{"members":[{"desc":null,"height":566,"id":"a1i9vr_759728","mime":"image\/jpg","size":159.71,"



* 판다스 데이터프레임으로 여러 페이지의 뉴스 기사를 수집할 수 있도록 작성합니다.

In [None]:
import requests
import pandas as pd
import time
import logging
from requests.exceptions import RequestException
from typing import Dict, List, Optional

# Endpoint and query settings for the article-list API.
BASE_URL = "https://china.huanqiu.com/api/list"
# Comma-separated, double-quoted node paths sent as the ``node`` query value.
NODES = (
    '"/e3pmh1nnq/e3pmh1obd",'
    '"/e3pmh1nnq/e3pn61c2g",'
    '"/e3pmh1nnq/e3pn6eiep",'
    '"/e3pmh1nnq/e3pra70uk",'
    '"/e3pmh1nnq/e5anm31jb",'
    '"/e3pmh1nnq/e7tl4e309"'
)
LIMIT = 24  # page size: sent as ``limit`` and used as the offset step
MAX_RETRIES = 3  # attempts made per page before the page is skipped
RETRY_DELAY = 5  # seconds slept between failed attempts

# Module-wide logging: timestamp, level and message at INFO and above.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

class HuanqiuScraper:
    """Collect article records from the Huanqiu article-list endpoint.

    Pages are requested from ``BASE_URL`` with the ``NODES`` filter, ``LIMIT``
    articles at a time, and each article is flattened into a plain dict.
    """

    def __init__(self):
        # A single Session is shared by every request made by this scraper.
        self.session = requests.Session()

    def fetch_data(self, offset: int) -> Optional[Dict]:
        """Request one page of the article list.

        Args:
            offset: Value sent as the ``offset`` query parameter.

        Returns:
            The decoded JSON payload, or ``None`` when all ``MAX_RETRIES``
            attempts failed.
        """
        params = {
            'node': NODES,
            'offset': offset,
            'limit': LIMIT,
        }
        for attempt in range(1, MAX_RETRIES + 1):
            try:
                response = self.session.get(BASE_URL, params=params, timeout=10)
                response.raise_for_status()
                return response.json()
            except (RequestException, ValueError) as e:
                # ValueError covers a body that is not valid JSON: depending on
                # the installed requests version the decode error may not be a
                # RequestException, and it must not escape the retry loop.
                logger.warning("Attempt %d failed: %s", attempt, e)
                if attempt < MAX_RETRIES:
                    time.sleep(RETRY_DELAY)
                else:
                    logger.error("Max retries reached. Skipping offset %s", offset)
        return None

    @staticmethod
    def parse_articles(data: Dict) -> List[Dict]:
        """Flatten the ``list`` entries of an API payload into row dicts.

        Args:
            data: Decoded payload as returned by ``fetch_data``.

        Returns:
            One dict per article with the keys ``aid``, ``title``, ``summary``,
            ``source_name``, ``source_url``, ``ctime``, ``xtime``, ``host``,
            ``ext_serious`` and ``ext_weight``. Missing fields default to
            ``''``. An empty list is returned when the payload has no usable
            ``list`` entry.
        """
        if not isinstance(data, dict):
            return []
        records = data.get('list')
        if not isinstance(records, (list, tuple)):
            # Covers both a missing key and an explicit JSON null.
            return []

        rows = []
        for article in records:
            if not isinstance(article, dict):
                continue
            # ``source`` may be present but null, in which case the default
            # passed to ``get`` would not apply.
            source = article.get('source')
            if not isinstance(source, dict):
                source = {}
            rows.append({
                'aid': article.get('aid', ''),
                'title': article.get('title', ''),
                'summary': article.get('summary', ''),
                'source_name': source.get('name', ''),
                'source_url': source.get('url', ''),
                'ctime': article.get('ctime', ''),
                'xtime': article.get('xtime', ''),
                'host': article.get('host', ''),
                'ext_serious': article.get('ext-serious', ''),
                'ext_weight': article.get('ext-weight', ''),
            })
        return rows

    def collect_articles(self, pages: int) -> List[Dict]:
        """Fetch ``pages`` consecutive pages and return all parsed articles.

        Pages that could not be fetched are logged and skipped, so the result
        may contain fewer than ``pages * LIMIT`` rows.
        """
        all_articles = []
        for page_index in range(pages):
            data = self.fetch_data(page_index * LIMIT)
            if data:
                all_articles.extend(self.parse_articles(data))
            else:
                logger.warning("No data retrieved for page %d", page_index + 1)
        return all_articles

def save_to_csv(df: pd.DataFrame) -> str:
    """Write *df* to a timestamped CSV file and return the file name.

    The name is ``huanqiu_articles_<YYYYmmdd_HHMMSS>.csv`` (local time),
    relative to the current working directory. The file is written without
    the index, encoded as ``utf-8-sig``.
    """
    stamp = time.strftime('%Y%m%d_%H%M%S')
    filename = "huanqiu_articles_" + stamp + ".csv"
    df.to_csv(filename, encoding='utf-8-sig', index=False)
    return filename

def main() -> None:
    """Prompt for a page count, scrape that many pages and save them to CSV.

    The page count is read from standard input. Anything that is not a
    positive integer is reported as invalid input and nothing is fetched.
    On success the collected articles are written with ``save_to_csv`` and a
    preview of the first rows is logged. Unexpected errors are logged with a
    traceback rather than propagated.
    """
    try:
        # Only the integer conversion is guarded for ValueError, so that a
        # ValueError raised later (while fetching or building the DataFrame)
        # is not misreported as bad user input.
        try:
            pages_to_collect = int(input("Enter the number of pages to collect: "))
        except ValueError:
            logger.error("Invalid input. Please enter a valid number of pages.")
            return
        if pages_to_collect <= 0:
            logger.error("Invalid input. Please enter a valid number of pages.")
            return

        scraper = HuanqiuScraper()
        articles_data = scraper.collect_articles(pages_to_collect)

        if not articles_data:
            logger.error("No articles collected. Exiting.")
            return

        df = pd.DataFrame(articles_data)
        logger.info(f"Collected {len(df)} articles.")

        filename = save_to_csv(df)
        logger.info(f"Data saved to {filename}")

        # A bare ``df.head()`` inside a function displays nothing, so the
        # preview is rendered into the log message itself.
        logger.info(
            "\nFirst few rows of the collected data:\n%s",
            df.head().to_string(),
        )
    except Exception as e:
        logger.exception(f"An unexpected error occurred: {e}")


if __name__ == "__main__":
    main()


In [None]:
import pandas as pd
import glob
import os
from datetime import datetime

def load_csv_files(directory='.'):
    """Load every ``huanqiu_articles_*.csv`` in *directory* into one DataFrame.

    Args:
        directory: Folder to search (not recursive). Defaults to the current
            working directory.

    Returns:
        A DataFrame with the rows of all matching files concatenated under a
        fresh index, plus a ``file_name`` column holding the base name of the
        file each row came from. Returns ``None`` (after printing a message)
        when no file matches.
    """
    # Find every CSV file in the directory whose name starts with
    # "huanqiu_articles_". glob returns matches in arbitrary order, so sort
    # them to make the combined row order reproducible.
    pattern = os.path.join(directory, 'huanqiu_articles_*.csv')
    csv_files = sorted(glob.glob(pattern))

    if not csv_files:
        print("No CSV files found.")
        return None

    # Load all CSV files and tag each row with its originating file.
    frames = []
    for path in csv_files:
        frame = pd.read_csv(path)
        frame['file_name'] = os.path.basename(path)
        frames.append(frame)

    return pd.concat(frames, ignore_index=True)

In [None]:
# Load every scraper CSV found in the current directory into one DataFrame;
# load_csv_files returns None when no file matches.
df = load_csv_files()
if df is not None:
    print("Data loaded successfully.")
    print(f"Shape of the combined dataframe: {df.shape}")
    
    print("\nAnalysis Results:")
    # NOTE(review): analyze_data is not defined in any cell visible here;
    # unless it is defined elsewhere this call raises NameError - confirm.
    analyze_data(df)
    
    # Save the results: the combined data is written to a timestamped CSV.
    output_file = f"huanqiu_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    df.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"\nCombined data saved to {output_file}")


In [None]:
# Summary statistics over the combined DataFrame.
# NOTE(review): assumes `df` was set by an earlier cell; load_csv_files
# returns None when no CSV is found, in which case len(df) fails - confirm.
print(f"Total number of articles: {len(df)}")

# Number of unique articles after dropping duplicates (by article id)
unique_articles = df.drop_duplicates(subset=['aid'])
print(f"Number of unique articles: {len(unique_articles)}")

# Top 5 sources with the most articles
# (counted on `df`, so duplicated articles are still included here)
top_sources = df['source_name'].value_counts().head()
print("\nTop 5 sources:")
print(top_sources)

# Number of articles per date
# `ctime` is interpreted as epoch milliseconds.
# NOTE(review): the converted timestamps carry no timezone (presumably UTC),
# so the date buckets may differ from China local dates - confirm.
df['datetime'] = pd.to_datetime(df['ctime'].astype(float), unit='ms')
df['date'] = df['datetime'].dt.date
articles_by_date = df['date'].value_counts().sort_index()
print("\nArticles by date:")
print(articles_by_date)

In [None]:
# Most frequent keywords (based on titles)
# NOTE(review): titles are tokenised with str.split(), i.e. on whitespace
# only. The sample titles in the API response contain no whitespace, so each
# such title becomes a single "word"; real keyword counts would presumably
# need a Chinese word segmenter - confirm the intent.
df['title'] = df['title'].fillna('')  # replace NaN values with empty strings
df['title_words'] = df['title'].apply(lambda x: x.split() if isinstance(x, str) else [])
# Flatten the per-title token lists into one list before counting.
all_words = [word for words in df['title_words'] for word in words]
word_counts = pd.Series(all_words).value_counts()
print("\nTop 10 keywords in titles:")
print(word_counts.head(10))

In [None]:
# Bare expression: shows the flattened list of title tokens as the cell output.
all_words