# 关键词表

In [None]:
import requests, json, re   # 建立关键词表
import pandas as pd

# Maximum number of abstract characters sent to the model per paper.
_ABSTRACT_CHAR_LIMIT = 600
# Chat-completions endpoint used for keyword extraction.
_SPARK_CHAT_URL = "https://spark-api-open.xf-yun.com/v2/chat/completions"
# One "[ ... ]" group per paper is expected in the model reply.
_BRACKET_LIST_RE = re.compile(r'\[([^\]]+)\]')
# Keyword separators: ASCII comma, full-width comma, ideographic comma.
_KEYWORD_SEPARATOR_RE = re.compile(r'[,\uff0c\u3001]')
# Straight and curly quote characters stripped from around each keyword.
_QUOTE_CHARS = '"\'\u201c\u201d\u2018\u2019'


def _text_or_empty(value):
    """Return ``str(value)``, or ``''`` when the cell is missing (None/NaN/NA)."""
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return ''
    return str(value)


def _build_batch_prompt(batch, n):
    """Build the single prompt asking for ``n`` keywords for every row of ``batch``."""
    parts = []
    for _, row in batch.iterrows():
        title = _text_or_empty(row.get('title', ''))
        abstract = _text_or_empty(row.get('abstract', ''))[:_ABSTRACT_CHAR_LIMIT]
        parts.append(f"标题：{title}\n摘要：{abstract}")

    answer_slots = "\n".join(
        f"篇{i + 1}：[词1, 词2, ...]" for i in range(len(batch))
    )
    return (
        f"请给以下 {len(batch)} 篇论文各提取 {n} 个关键词，按篇号顺序返回，格式：\n"
        + answer_slots
        + "\n\n-----\n" + "\n-----\n".join(parts)
    )


def _parse_keyword_lists(text, n):
    """Return one keyword list (at most ``n`` items) per ``[...]`` group in ``text``."""
    keyword_lists = []
    for inner in _BRACKET_LIST_RE.findall(text):
        words = []
        for piece in _KEYWORD_SEPARATOR_RE.split(inner):
            # Strip surrounding quotes only, so apostrophes inside a keyword
            # (e.g. "Alzheimer's disease") are preserved.
            word = piece.strip().strip(_QUOTE_CHARS).strip()
            if word:
                words.append(word)
        keyword_lists.append(words[:n])
    return keyword_lists


def extract_all_keywords(df, n=5, batch_size=10, timeout=360,
                         api_password=None, session=None):
    """
    Extract keywords for every row of ``df`` in batches via the chat API.

    Rows are sent ``batch_size`` at a time in one prompt. The reply is
    expected to contain one ``[kw1, kw2, ...]`` group per paper, in order.
    A batch whose request fails, or whose reply does not contain exactly
    one group per paper, keeps empty keyword lists; a progress line is
    printed for every batch.

    Args:
        df: DataFrame with a ``wos_id`` column; ``title`` and ``abstract``
            are read when present (missing values are sent as empty text).
        n: Number of keywords requested and kept per paper.
        batch_size: Number of papers per API request; must be >= 1.
        timeout: Per-request timeout in seconds, passed to the HTTP call.
        api_password: Bearer credential. When omitted, the
            ``SPARK_API_PASSWORD`` environment variable is used.
        session: Optional object exposing a ``requests``-compatible
            ``post(url, json=..., headers=..., timeout=...)`` (for example
            a ``requests.Session``). Defaults to the ``requests`` module.

    Returns:
        DataFrame with columns ``wos_id`` and ``keywords`` (a list per
        row), positionally aligned with ``df`` and using a fresh 0..N-1
        index.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If ``df`` has no ``wos_id`` column.
    """
    import os

    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    total = len(df)
    wos_ids = df['wos_id'].values
    # Every paper starts with an empty list; successful batches overwrite it.
    keywords_by_row = [[] for _ in range(total)]

    if api_password is None:
        # SECURITY NOTE(review): the literal below is the credential that
        # used to be hard-coded in this function. It is kept only as a
        # last-resort fallback so existing calls keep working; rotate it and
        # supply the new one through SPARK_API_PASSWORD or `api_password`.
        api_password = os.environ.get(
            "SPARK_API_PASSWORD",
            "cyjdtVYXSGWgwiUdnLMs:DvKIMQbkHgKlYljNcbhN",
        )
    headers = {
        'Authorization': f'Bearer {api_password}',
        'content-type': "application/json"
    }
    http = requests if session is None else session

    for start in range(0, total, batch_size):
        end = min(start + batch_size, total)
        batch = df.iloc[start:end]

        body = {
            "model": "x1",
            "user": "keyword_extractor",
            "messages": [{"role": "user", "content": _build_batch_prompt(batch, n)}],
            "stream": False
        }

        try:
            resp = http.post(_SPARK_CHAT_URL, json=body, headers=headers, timeout=timeout)
            resp.raise_for_status()
            text = resp.json()['choices'][0]['message']['content']
            keyword_lists = _parse_keyword_lists(text, n)
        except Exception as e:
            # Deliberate best-effort: one failing batch must not abort the run.
            print(f"批次 {start}-{end-1} 失败：{e}（已置空）")
            continue

        if len(keyword_lists) != len(batch):
            # Cannot align the reply with the papers; leave the batch empty.
            print(f"批次 {start}-{end-1} 解析数量不符（期望{len(batch)}，实际{len(keyword_lists)}），已置空")
            continue

        keywords_by_row[start:end] = keyword_lists
        print(f"批次 {start}-{end-1} 处理成功，提取了 {len(batch)} 篇论文的关键词")

    return pd.DataFrame({'wos_id': wos_ids, 'keywords': keywords_by_row})

In [None]:
# Run keyword extraction over the full table.
# `df` is expected to provide the wos_id, title and abstract columns.
results_df = extract_all_keywords(df, batch_size=10, n=5)

# Preview the result.
print("处理完成！结果DataFrame：")
print(results_df.head())

# Basic shape information.
row_count = len(results_df)
column_names = results_df.columns.tolist()
print("\nDataFrame信息：")
print(f"总行数：{row_count}")
print(f"列名：{column_names}")

# Papers for which at least one keyword came back.
has_keywords = results_df['keywords'].apply(len) > 0
non_empty_keywords = results_df[has_keywords]
print(f"成功提取关键词的论文数量：{len(non_empty_keywords)}")

# Persist the result (UTF-8 with BOM, no index column).
results_df.to_csv('results_df.csv', encoding='utf-8-sig', index=False)
print("结果已保存到 'results_df.csv'")

In [None]:
# Smoke test on the first ten rows: extract keywords for an independent
# copy of the slice and attach them as an `ai_keywords` column.
test_10 = df.iloc[:10].copy()
kw_results_df = extract_all_keywords(test_10, timeout=360, batch_size=10, n=5)
# Assign by position (raw array), ignoring the two frames' index labels.
test_10['ai_keywords'] = kw_results_df['keywords'].to_numpy()

# 年龄表

In [None]:
# Attach an `average_age` column to results_df.
# NOTE(review): this cell reads the `year`, `ai_keywords` and `journal`
# columns of results_df — confirm they exist before running.

# Lookup table: keyword -> year of its first occurrence.
keyword_age_dict = dict(zip(words_age_df['keyword'], words_age_df['first_occurrence_year']))

# Mean keyword age per paper, in row order.
average_ages = []

for index, row in results_df.iterrows():
    year = row['year']
    # Age of every keyword that has a known first-occurrence year.
    keyword_ages = [
        year - keyword_age_dict[keyword]
        for keyword in row['ai_keywords']
        if keyword in keyword_age_dict
    ]
    # Papers without any known keyword get a missing value.
    average_ages.append(
        sum(keyword_ages) / len(keyword_ages) if keyword_ages else None
    )

# Store the per-paper averages on results_df.
results_df['average_age'] = average_ages

# Summary output.
age_column = results_df['average_age']
print(f"平均年龄范围: {age_column.min():.2f} - {age_column.max():.2f} 年")
print(f"整体平均年龄: {age_column.mean():.2f} 年")
print("\n前5篇论文的平均年龄:")
results_df[['journal', 'year', 'average_age']].head()

In [None]:
# Step 4: compute a per-paper novelty score and aggregate it by journal.
import os

# First year of the corpus; the oldest possible keyword age for a paper
# published in `year` is therefore year - NOVELTY_BASE_YEAR + 1.
NOVELTY_BASE_YEAR = 2010

novelty_scores = []

for index, row in results_df.iterrows():
    year = row['year']
    avg_age = row['average_age']

    # Novelty formula: N_p = 1 - average_age / (year_p - base_year + 1)
    if pd.notna(avg_age):
        max_possible_age = year - NOVELTY_BASE_YEAR + 1  # normalising denominator
        if max_possible_age > 0:
            novelty_score = 1 - (avg_age / max_possible_age)
            # Clamp the score into [0, 1].
            novelty_score = max(0, min(1, novelty_score))
        else:
            # Papers dated before the base year (or with a missing year) have
            # no valid denominator: the old code divided by zero for
            # base_year - 1 and silently clamped earlier years to 1.
            novelty_score = None
    else:
        novelty_score = None

    novelty_scores.append(novelty_score)

# Store the per-paper score on results_df.
results_df['novelty_score'] = novelty_scores

# Group by journal. The journal name is assumed to live in 'source'; fall
# back to 'journal' when 'source' is absent.
journal_col = 'source' if 'source' in results_df.columns else 'journal'
journal_novelty = results_df.groupby(journal_col)['novelty_score'].agg(
    mean_novelty='mean',
    median_novelty='median',
    std_novelty='std',
    paper_count='count',
).reset_index()

# Most novel journals first.
journal_novelty = journal_novelty.sort_values('mean_novelty', ascending=False)

# Report.
print("各期刊新颖性统计:")
print(journal_novelty)

print("\n整体新颖性统计:")
print(f"所有论文平均新颖性: {results_df['novelty_score'].mean():.3f}")
print(f"新颖性得分范围: {results_df['novelty_score'].min():.3f} - {results_df['novelty_score'].max():.3f}")

# Save the per-journal table, creating the output directory when needed.
output_path = 'data/analysis/journal_novelty_scores.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
journal_novelty.to_csv(output_path, index=False, encoding='utf-8')

# Top three journals by mean novelty.
print("\n新颖性最高的前3个期刊:")
print(journal_novelty.head(3))