In [11]:
pip install pandas jieba

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.



In [12]:
import pandas as pd
import json
import re

# 1. 读取 JSON 文件
# 注意：你的文件看起来是多个对象的数组，但格式可能不标准。我们逐行读取。
with open('唐诗三百首.json', 'r', encoding='utf-8') as f:
    # 如果文件是标准的 JSON 数组，可以直接用 json.load(f)
    # 如果不标准，可以尝试逐行读取
    data = []
    for line in f:
        # 使用更健壮的方式清理引号
        line_clean = line.strip().replace('“', '"').replace('”', '"').replace('‘', "'").replace('’', "'")
        if line_clean: # 跳过空行
            try:
                data.append(json.loads(line_clean))
            except json.JSONDecodeError:
                print(f"跳过无法解析的行: {line_clean[:50]}...")
                continue

# 2. 转换为 DataFrame
df = pd.DataFrame(data)
print(f"原始数据量: {len(df)}")
print(df.head())

# 3. 数据清洗
# 清理正文：将段落列表合并成一个字符串
df['content'] = df['paragraphs'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')

# 简单的文本清理，去除空格和特殊字符
df['content'] = df['content'].str.replace(r'\s+', ' ', regex=True).str.strip()

# 4. 手动或半自动标注时期和题材
# 这是最耗时但最关键的一步。这里提供一个半自动化的思路。

# 时期映射字典 (需要你根据历史知识扩充)
period_mapping = {
    '骆宾王': '初唐',
    '陳子昂': '初唐',
    '明皇帝': '盛唐',
    # ... 继续添加其他诗人
}
df['period'] = df['author'].map(period_mapping)

# 题材映射 (可以从 tags 里提取，或者根据关键词)
# 例如，如果 tags 包含 "咏物"，则题材可以是 "咏物诗"
def infer_genre(tags_list):
    genre_keywords = {
        '山水': '山水田园诗',
        '田园': '山水田园诗',
        '咏物': '咏物诗',
        '边塞': '边塞诗',
        '送别': '送别诗',
        '怀古': '怀古诗',
        '爱情': '爱情诗',
        '政治': '政治诗'
    }
    for tag in tags_list:
        for keyword, genre in genre_keywords.items():
            if keyword in tag:
                return genre
    return '其他' # 默认值

df['genre'] = df['tags'].apply(infer_genre)

# 检查缺失值
print(f"时期缺失: {df['period'].isnull().sum()}")
print(f"题材缺失: {df['genre'].isnull().sum()}")

# 5. 保存清洗后的数据
df.to_csv('cleaned_tang_poems.csv', index=False, encoding='utf-8-sig')
print("数据清洗完成，已保存为 'cleaned_tang_poems.csv'")

跳过无法解析的行: [...
跳过无法解析的行: {...
跳过无法解析的行: "author": "駱賓王",...
跳过无法解析的行: "paragraphs": [...
跳过无法解析的行: "西陸蟬聲唱，南冠客思侵。",...
跳过无法解析的行: "那堪玄鬢影，來對白頭吟。",...
跳过无法解析的行: "露重飛難進，風多響易沈。",...
跳过无法解析的行: ],...
跳过无法解析的行: "tags": [...
跳过无法解析的行: "唐诗三百首",...
跳过无法解析的行: "咏物",...
跳过无法解析的行: "咏物诗",...
跳过无法解析的行: ],...
跳过无法解析的行: "title": "在嶽詠蟬",...
跳过无法解析的行: "id": "c65539db-4e2b-4ce4-a22b-563b6ef3f4f1"...
跳过无法解析的行: },...
跳过无法解析的行: {...
跳过无法解析的行: "author": "陳子昂",...
跳过无法解析的行: "paragraphs": [...
跳过无法解析的行: "前不見古人，後不見來者。",...
跳过无法解析的行: ],...
跳过无法解析的行: "tags": [...
跳过无法解析的行: "唐诗三百首",...
跳过无法解析的行: "隋・唐・五代",...
跳过无法解析的行: "八年级下册(课外)",...
跳过无法解析的行: "伤怀",...
跳过无法解析的行: "初中古诗",...
跳过无法解析的行: ],...
跳过无法解析的行: "title": "登幽州臺歌",...
跳过无法解析的行: "id": "c244a5b4-0ed0-48fe-8694-95309acac184"...
跳过无法解析的行: },...
跳过无法解析的行: {...
跳过无法解析的行: "author": "明皇帝",...
跳过无法解析的行: "paragraphs": [...
跳过无法解析的行: "夫子何爲者？栖栖一代中。",...
跳过无法解析的行: "地猶鄹氏邑，宅即魯王宮。",...
跳过无法解析的行: "歎鳳嗟身否，傷麟怨道窮。",...
跳过无法解析的行: ],...
跳过无法解析的行: "tags": [...
跳过无法解析的行: "唐诗三百首",...
跳过无法解析的行

跳过无法解析的行: "隋・唐・五代",...
跳过无法解析的行: "七言律诗",...
跳过无法解析的行: ],...
跳过无法解析的行: "title": "西塞山懷古",...
跳过无法解析的行: "id": "80beb8cc-8338-44d4-8b96-7f9d32133c97"...
跳过无法解析的行: },...
跳过无法解析的行: {...
跳过无法解析的行: "author": "劉禹錫",...
跳过无法解析的行: "paragraphs": [...
跳过无法解析的行: "朱雀橋邊野草花，烏衣巷口夕陽斜。",...
跳过无法解析的行: ],...
跳过无法解析的行: "tags": [...
跳过无法解析的行: "唐诗三百首",...
跳过无法解析的行: "怀古",...
跳过无法解析的行: "七言绝句",...
跳过无法解析的行: "带有地名",...
跳过无法解析的行: ],...
跳过无法解析的行: "title": "金陵五題 烏衣巷",...
跳过无法解析的行: "id": "4fa92777-9bd6-45ca-9272-bac30459cf5e"...
跳过无法解析的行: },...
跳过无法解析的行: {...
跳过无法解析的行: "author": "劉禹錫",...
跳过无法解析的行: "paragraphs": [...
跳过无法解析的行: "新妝面面下朱樓，深鎖春光一院愁。",...
跳过无法解析的行: ],...
跳过无法解析的行: "tags": [...
跳过无法解析的行: "唐诗三百首",...
跳过无法解析的行: "孤独",...
跳过无法解析的行: "春天",...
跳过无法解析的行: "女子",...
跳过无法解析的行: ],...
跳过无法解析的行: "title": "和樂天春詞",...
跳过无法解析的行: "id": "297dd100-6aab-4685-a52d-983fb0999c7a"...
跳过无法解析的行: },...
跳过无法解析的行: {...
跳过无法解析的行: "author": "孟郊",...
跳过无法解析的行: "paragraphs": [...
跳过无法解析的行: "梧桐相待老，鴛鴦會雙死。",...
跳过无法解析的行: "貞女貴狥夫，捨生亦如此。",...
跳过无法解

跳过无法解析的行: "唐诗三百首",...
跳过无法解析的行: "五言绝句",...
跳过无法解析的行: "写人",...
跳过无法解析的行: ],...
跳过无法解析的行: "title": "訪羊尊師",...
跳过无法解析的行: "id": "162eb552-a496-4979-87e2-e441847d4e6f"...
跳过无法解析的行: },...
跳过无法解析的行: {...
跳过无法解析的行: "author": "張喬",...
跳过无法解析的行: "paragraphs": [...
跳过无法解析的行: "調角斷清秋，征人倚戍樓。",...
跳过无法解析的行: "春風對青塚，白日落梁州。",...
跳过无法解析的行: "大漢無兵阻，窮邊有客遊。",...
跳过无法解析的行: ],...
跳过无法解析的行: "tags": [...
跳过无法解析的行: "唐诗三百首",...
跳过无法解析的行: "生活",...
跳过无法解析的行: "五言律诗",...
跳过无法解析的行: ],...
跳过无法解析的行: "title": "書邊事",...
跳过无法解析的行: "id": "8324db66-b75a-40ff-9469-cba276d2f88f"...
跳过无法解析的行: },...
跳过无法解析的行: {...
跳过无法解析的行: "author": "李白",...
跳过无法解析的行: "paragraphs": [...
跳过无法解析的行: "金樽清酒斗十千，玉盤珍羞直萬錢。",...
跳过无法解析的行: "停杯投筯不能食，拔劒四顧心茫然。",...
跳过无法解析的行: "欲渡黃河冰塞川，將登太行雪滿山。",...
跳过无法解析的行: "閑來垂釣碧溪上，忽復乘舟夢日邊。",...
跳过无法解析的行: "行路難，行路難，多岐路，今安在。",...
跳过无法解析的行: ],...
跳过无法解析的行: "tags": [...
跳过无法解析的行: "唐诗三百首",...
跳过无法解析的行: "黄河",...
跳过无法解析的行: "励志",...
跳过无法解析的行: "友情",...
跳过无法解析的行: "八年级下册(课内)",...
跳过无法解析的行: "初中古诗",...
跳过无法解析的行: "乐府",...
跳过无法解析的行: "宴饮",.

KeyError: 'paragraphs'

In [None]:
import pandas as pd
from collections import Counter
import jieba

# 读取清洗后的数据
df = pd.read_csv('cleaned_tang_poems.csv')

# 定义停用词列表 (可以从网上下载中文停用词表)
stopwords = set(['，', '。', '？', '！', '的', '了', '在', '是', '我', '你', '他', '有', '和', '就', '不', '人', '都', '一', '上', '也', '很', '到', '说', '要', '去', '出', '会', '可', '以', '而', '之', '其', '为', '于', '与', '此', '何', '如', '为', '云', '曰', '兮', '矣', '也', '乎', '乃', '呜呼', '噫', '哉'])

def process_text(text):
    words = jieba.lcut(text)
    # 过滤掉单字和停用词
    words = [w for w in words if len(w) > 1 and w not in stopwords and not w.isspace()]
    return words

# 为所有诗分词
df['words'] = df['content'].apply(process_text)

# 按时期分组统计词频
for period, group in df.groupby('period'):
    all_words = [word for sublist in group['words'] for word in sublist]
    word_freq = Counter(all_words).most_common(20) # 取前20
    print(f"\n【{period}】高频词:")
    for word, freq in word_freq:
        print(f"{word}: {freq}")

In [None]:
import pandas as pd
from collections import Counter
import jieba

# 读取清洗后的数据
df = pd.read_csv('cleaned_tang_poems.csv')

# 定义停用词列表 (可以从网上下载中文停用词表)
stopwords = set(['，', '。', '？', '！', '的', '了', '在', '是', '我', '你', '他', '有', '和', '就', '不', '人', '都', '一', '上', '也', '很', '到', '说', '要', '去', '出', '会', '可', '以', '而', '之', '其', '为', '于', '与', '此', '何', '如', '为', '云', '曰', '兮', '矣', '也', '乎', '乃', '呜呼', '噫', '哉'])

def process_text(text):
    words = jieba.lcut(text)
    # 过滤掉单字和停用词
    words = [w for w in words if len(w) > 1 and w not in stopwords and not w.isspace()]
    return words

# 为所有诗分词
df['words'] = df['content'].apply(process_text)

# 按时期分组统计词频
for period, group in df.groupby('period'):
    all_words = [word for sublist in group['words'] for word in sublist]
    word_freq = Counter(all_words).most_common(20) # 取前20
    print(f"\n【{period}】高频词:")
    for word, freq in word_freq:
        print(f"{word}: {freq}")

In [None]:
from gensim import corpora, models

# 准备 LDA 需要的数据
dictionary = corpora.Dictionary(df['words'])
corpus = [dictionary.doc2bow(text) for text in df['words']]

# 训练 LDA 模型，假设我们找 5 个主题
lda_model = models.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

# 打印每个主题的关键词
for idx, topic in lda_model.print_topics(-1):
    print(f"主题 {idx}: {topic}")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

# 1. 将分词结果合并为字符串
df['processed_content'] = df['words'].apply(' '.join)

# 2. 按诗人分组，合并他所有的诗
poet_corpus = df.groupby('author')['processed_content'].apply(' '.join).reset_index()

# 3. 计算 TF-IDF 向量
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(poet_corpus['processed_content'])

# 4. 计算诗人之间的余弦相似度
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# 5. 构建网络图
G = nx.Graph()
poets = poet_corpus['author'].tolist()

# 添加节点 (诗人)
for poet in poets:
    G.add_node(poet)

# 添加边 (相似度大于阈值)
threshold = 0.1 # 这个阈值可以调整
for i in range(len(poets)):
    for j in range(i+1, len(poets)):
        if cosine_sim[i, j] > threshold:
            G.add_edge(poets[i], poets[j], weight=cosine_sim[i, j])

# 保存为 Gephi 可读的格式
nx.write_gexf(G, 'poet_similarity_network.gexf')
print("诗人关系网络已保存为 'poet_similarity_network.gexf'，可用 Gephi 打开进行可视化。")

In [None]:
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>唐诗文风变化图谱</title>
    <script src="https://cdn.jsdelivr.net/npm/echarts@5.4.3/dist/echarts.min.js"></script>
</head>
<body>
    <div id="timeline" style="width: 1000px;height:600px;"></div>
    <script>
        // 这里需要你将 Python 分析出的“每个时期各类题材的数量”转换成如下格式的 JavaScript 数据
        // 例如： periods = ['初唐', '盛唐', '中唐', '晚唐']
        // seriesData = [ {name: '山水田园诗', type: 'line', data: [10, 25, 15, 5]}, ... ]
        
        // 初始化图表
        var chart = echarts.init(document.getElementById('timeline'));
        var option = {
            title: { text: '唐诗题材流变' },
            tooltip: { trigger: 'axis' },
            legend: { data: ['山水田园诗', '边塞诗', '咏物诗', '送别诗'] }, // 你的题材列表
            xAxis: { type: 'category', data: ['初唐', '盛唐', '中唐', '晚唐'] }, // 你的时期列表
            yAxis: { type: 'value' },
            series: seriesData // 这里放入你从 Python 计算好的数据
        };
        chart.setOption(option);
    </script>
</body>
</html>

In [None]:
# 计算时期-题材交叉表
period_genre_count = pd.crosstab(df['period'], df['genre'])
print(period_genre_count.to_json(force_ascii=False))

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# 生成全唐诗词云
all_text = ' '.join(df['content'])
wordcloud = WordCloud(font_path='simhei.ttf', # 指定中文字体路径
                      background_color='white',
                      width=800, height=600).generate(all_text)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.savefig('wordcloud.png') # 保存图片
plt.show()

In [None]:
tang_poetry_project/
├── 唐诗三百首.json (原始数据)
├── data_cleaning.py
├── data_analysis.py
├── cleaned_tang_poems.csv
├── poet_similarity_network.gexf
├── wordcloud.png
├── timeline.html (单独的时间轴页面)
└── index.html (最终集大成的报告页面)