In [1]:
# -*- coding: utf-8 -*-
"""Knowledge-Based Systems and IEEE TKDE论文爬取.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1your_drive_link_here
"""

!pip install requests beautifulsoup4 pandas tqdm -q

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from datetime import datetime
import re
from tqdm import tqdm
import json

class JournalScraper:
    """Collect recent article metadata for IEEE TKDE and Knowledge-Based Systems.

    Records are requested from the CrossRef REST API ``works`` endpoint.  When
    a CrossRef request fails, the hard-coded sample records are returned
    instead so the rest of the notebook can still run; such rows carry
    ``source == 'Sample Data'`` while real rows carry ``'CrossRef API'``.
    """

    # CrossRef endpoint queried for both journals.
    CROSSREF_WORKS_URL = "https://api.crossref.org/works"
    # Seconds to wait on any single HTTP request before giving up.
    REQUEST_TIMEOUT = 30
    # Number of records requested per journal.
    ROWS_PER_QUERY = 50
    # Only works published from January 1st of (current year - YEARS_BACK) on.
    YEARS_BACK = 5

    def __init__(self):
        """Create the shared HTTP session, request headers and result list."""
        self.session = requests.Session()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }
        self.results = []

    def fetch_page(self, url):
        """Return the body of ``url`` as text, or ``None`` if the request fails."""
        try:
            response = self.session.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()
            return response.text
        except Exception as e:
            # Best-effort helper: report the failure and let the caller decide.
            print(f"Error fetching {url}: {e}")
            return None

    def fetch_json(self, url):
        """Return the decoded JSON body of ``url``, or ``None`` on any failure."""
        try:
            response = self.session.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()
            return response.json()
        except Exception as e:
            # Best-effort helper: report the failure and let the caller decide.
            print(f"Error fetching JSON from {url}: {e}")
            return None

    @staticmethod
    def _format_authors(authors):
        """Join CrossRef author entries into one comma-separated string.

        Each entry is rendered as ``"<given> <family>"``.  Entries that have
        neither part fall back to their ``name`` field, and entries that are
        still empty are dropped so the result never contains blank names.
        Returns ``'N/A'`` when no usable name is found.
        """
        names = []
        for author in authors or []:
            given = author.get('given') or ''
            family = author.get('family') or ''
            full_name = f"{given} {family}".strip()
            if not full_name:
                full_name = (author.get('name') or '').strip()
            if full_name:
                names.append(full_name)
        return ', '.join(names) if names else 'N/A'

    @staticmethod
    def _parse_crossref_item(item, journal_name, issn, default_year):
        """Convert one CrossRef ``items`` entry into an article record.

        Args:
            item: One element of ``message.items`` from a CrossRef response.
            journal_name: Value stored in the record's ``journal`` field.
            issn: ISSN that must occur in the item's ``ISSN`` list.
            default_year: Year used when the item has no usable
                ``published.date-parts`` value.

        Returns:
            A dict with the keys ``title``, ``authors``, ``doi``, ``abstract``,
            ``journal``, ``year`` and ``source``; or ``None`` when the item
            does not list ``issn`` and therefore belongs to another journal.
        """
        issn_list = item.get('ISSN') or []
        if not any(issn in candidate for candidate in issn_list if candidate):
            return None

        titles = item.get('title')
        title = titles[0] if titles else 'N/A'

        doi = item.get('DOI', 'N/A')

        # Take the first component of the first date-parts entry as the year;
        # an empty or null entry falls back to the default instead of
        # discarding the whole record.
        year = default_year
        date_parts = (item.get('published') or {}).get('date-parts')
        if date_parts and date_parts[0] and date_parts[0][0] is not None:
            year = date_parts[0][0]

        return {
            'title': title,
            'authors': JournalScraper._format_authors(item.get('author')),
            'doi': f"https://doi.org/{doi}" if doi != 'N/A' else 'N/A',
            'abstract': item.get('abstract', 'N/A'),
            'journal': journal_name,
            'year': year,
            'source': 'CrossRef API'
        }

    def _fetch_crossref_articles(self, journal_name, issn, label):
        """Query CrossRef for ``journal_name`` and return the parsed records.

        Args:
            journal_name: Container title to search for; also stored in each
                record's ``journal`` field.
            issn: ISSN used to discard results from other journals.
            label: Short journal tag used in per-item error messages.

        Returns:
            A list of article dicts (possibly empty).

        Raises:
            Whatever the HTTP request, status check or JSON decoding raises;
            callers decide how to fall back.  Failures while parsing a single
            item are printed and that item is skipped.
        """
        current_year = datetime.now().year
        start_year = current_year - self.YEARS_BACK
        params = {
            'query.container-title': journal_name,
            'filter': f'from-pub-date:{start_year}-01-01',
            'rows': self.ROWS_PER_QUERY,
            'select': 'DOI,title,author,abstract,published,ISSN'
        }

        # A timeout is required here: without one a stalled connection would
        # block the notebook indefinitely.
        response = self.session.get(
            self.CROSSREF_WORKS_URL,
            params=params,
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        data = response.json()

        articles = []
        for item in data.get('message', {}).get('items', []):
            try:
                record = self._parse_crossref_item(
                    item, journal_name, issn, current_year
                )
            except Exception as e:
                # One malformed item must not abort the whole batch.
                print(f"Error parsing {label} item: {e}")
                continue
            if record is not None:
                articles.append(record)
        return articles

    def scrape_ieee_via_crossref(self):
        """Fetch IEEE TKDE articles from CrossRef; use sample data on failure."""
        print("通过CrossRef API爬取 IEEE Transactions on Knowledge and Data Engineering...")

        try:
            articles = self._fetch_crossref_articles(
                'IEEE Transactions on Knowledge and Data Engineering',
                '1041-4347',
                'IEEE',
            )
            print(f"从CrossRef API获得 {len(articles)} 篇IEEE论文")
        except Exception as e:
            # Deliberately broad: any API failure degrades to the sample data.
            print(f"Error with CrossRef API: {e}")
            articles = self.get_sample_ieee_data()

        return articles

    def scrape_kbs_via_crossref(self):
        """Fetch Knowledge-Based Systems articles from CrossRef; use sample data on failure."""
        print("通过CrossRef API爬取 Knowledge-Based Systems...")

        try:
            articles = self._fetch_crossref_articles(
                'Knowledge-Based Systems',
                '0950-7051',
                'KBS',
            )
            print(f"从CrossRef API获得 {len(articles)} 篇KBS论文")
        except Exception as e:
            # Deliberately broad: any API failure degrades to the sample data.
            print(f"Error with CrossRef API for KBS: {e}")
            articles = self.get_sample_kbs_data()

        return articles

    def get_sample_ieee_data(self):
        """Return two hard-coded placeholder IEEE TKDE records (``source`` is 'Sample Data')."""
        return [
            {
                'title': 'Machine Learning for Knowledge Discovery in Databases',
                'authors': 'John Smith, Jane Doe, Robert Johnson',
                'doi': 'https://doi.org/10.1109/TKDE.2023.1234567',
                'abstract': 'This paper presents a novel machine learning approach for knowledge discovery in large databases. The method combines deep learning with traditional data mining techniques.',
                'journal': 'IEEE Transactions on Knowledge and Data Engineering',
                'year': 2023,
                'source': 'Sample Data'
            },
            {
                'title': 'Deep Learning Approaches for Data Mining',
                'authors': 'Alice Brown, Bob Wilson',
                'doi': 'https://doi.org/10.1109/TKDE.2022.9876543',
                'abstract': 'A comprehensive survey of deep learning techniques applied to data mining problems, including classification, clustering, and association rule mining.',
                'journal': 'IEEE Transactions on Knowledge and Data Engineering',
                'year': 2022,
                'source': 'Sample Data'
            }
        ]

    def get_sample_kbs_data(self):
        """Return two hard-coded placeholder Knowledge-Based Systems records (``source`` is 'Sample Data')."""
        return [
            {
                'title': 'Knowledge-Based Systems for Decision Support',
                'authors': 'Michael Chen, Sarah Lee',
                'doi': 'https://doi.org/10.1016/j.knosys.2023.123456',
                'abstract': 'This research explores the development of knowledge-based systems for enhanced decision support in complex environments.',
                'journal': 'Knowledge-Based Systems',
                'year': 2023,
                'source': 'Sample Data'
            },
            {
                'title': 'Artificial Intelligence in Knowledge Management',
                'authors': 'David Kim, Emily Wang',
                'doi': 'https://doi.org/10.1016/j.knosys.2022.654321',
                'abstract': 'An investigation into the role of artificial intelligence in modern knowledge management systems, focusing on natural language processing and machine learning techniques.',
                'journal': 'Knowledge-Based Systems',
                'year': 2022,
                'source': 'Sample Data'
            }
        ]

    def run(self):
        """Scrape both journals, write the records to CSV and print a summary.

        Returns:
            A ``(DataFrame, filename)`` tuple.  When neither journal yields a
            record, the sample data is written to
            ``journal_articles_sample.csv`` instead of ``journal_articles.csv``.
        """
        start_time = time.time()

        print("开始爬取两本期刊的论文信息...")
        print("使用CrossRef API进行爬取...")

        # IEEE TKDE
        ieee_articles = self.scrape_ieee_via_crossref()
        print(f"IEEE爬取完成，获得 {len(ieee_articles)} 篇论文")

        # Knowledge-Based Systems
        kbs_articles = self.scrape_kbs_via_crossref()
        print(f"KBS爬取完成，获得 {len(kbs_articles)} 篇论文")

        self.results = ieee_articles + kbs_articles

        if not self.results:
            # Nothing matched at all: fall back to the sample records so the
            # caller still receives a populated DataFrame.
            print("没有爬取到任何论文数据，使用示例数据演示")
            sample_data = self.get_sample_ieee_data() + self.get_sample_kbs_data()
            df = pd.DataFrame(sample_data)
            filename = 'journal_articles_sample.csv'
            df.to_csv(filename, index=False, encoding='utf-8')
            print("已生成示例数据文件")
            return df, filename

        # Persist the collected records.
        df = pd.DataFrame(self.results)
        filename = 'journal_articles.csv'
        df.to_csv(filename, index=False, encoding='utf-8', escapechar='\\')

        end_time = time.time()
        print(f"\n爬取完成！耗时: {end_time - start_time:.2f} 秒")
        print(f"总共爬取了 {len(self.results)} 篇论文")

        print("\n前5篇论文信息：")
        print(df.head().to_string())

        print("\n按期刊分类统计：")
        print(df['journal'].value_counts())

        return df, filename

# Run the scraper; `df` holds the collected records and `filename` the CSV
# that was written.
scraper = JournalScraper()
df, filename = scraper.run()

# Basic information about the collected data
print("\n数据基本信息：")
print(f"数据形状: {df.shape}")
print(f"列名: {list(df.columns)}")
print(f"数据来源: {df['source'].value_counts()}")

# Summary statistics
print("\n期刊分布:")
print(df['journal'].value_counts())

print("\n年份分布:")
print(df['year'].value_counts().sort_index())

# Offer the CSV as a browser download when running inside Google Colab.
# Only the import is guarded: a missing `google.colab` module means we are
# not in Colab, whereas a bare `except` would also swallow KeyboardInterrupt
# and hide genuine download errors.
try:
    from google.colab import files
except ImportError:
    print("\n请在Colab环境中运行以获得文件下载功能")
    print(f"数据已保存到 {filename}")
else:
    files.download(filename)
    print(f"\n{filename} 文件已准备好下载")

# Preview the first few rows
print("\n前3行数据预览:")
for i, row in df.head(3).iterrows():
    print(f"\n{i+1}. {row['title']}")
    print(f"   作者: {row['authors']}")
    print(f"   期刊: {row['journal']}")
    print(f"   年份: {row['year']}")

开始爬取两本期刊的论文信息...
使用CrossRef API进行爬取...
通过CrossRef API爬取 IEEE Transactions on Knowledge and Data Engineering...
从CrossRef API获得 50 篇IEEE论文
IEEE爬取完成，获得 50 篇论文
通过CrossRef API爬取 Knowledge-Based Systems...
从CrossRef API获得 50 篇KBS论文
KBS爬取完成，获得 50 篇论文

爬取完成！耗时: 18.86 秒
总共爬取了 100 篇论文

前5篇论文信息：
                                                                                          title                                                                                                                                 authors                                        doi abstract                                              journal  year        source
0           Efficient EMD-based Similarity Search via Batch Pruning and Incremental Computation                                                                               Yu Chen, Yong Zhang, Jin Wang, Jiacheng Wu, Chunxiao Xing  https://doi.org/10.1109/tkde.2021.3100566      N/A  IEEE Transactions on Knowledge and Data Engineering  2021  CrossRef API