In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

### Constants

In [28]:
# Locations of the annotation file, the output folder and the raw texts,
# all relative to the notebook's working directory.
metadata_file = './annotations'
output_path = './out'
source_text_path = './古文原文'

# One list entry per character of the string below.
function_words = list('也哉乎兮夫焉耶者矣耳邪')

# NOTE(review): only the closing quote ” is listed, not the opening “ —
# confirm this is intended.
punctuations = [
    '，', '。', '”', '！', '？',
    '；', '：', '……', '——', '、',
]

### Read metadata file

In [29]:
# Parse the annotation file into one record per work.
# Layout implied by the rules below: a genre header line of the form
# "-<genre>-", followed by entries made of exactly three
# whitespace-separated fields: title, author, time.
data = []
current_genre = None  # stays None until the first genre header is read

# Explicit UTF-8: the file holds Chinese text and the platform default
# encoding is not UTF-8 everywhere. This also matches get_content().
with open(metadata_file, "r", encoding="utf-8") as f:
    for line_number, raw_line in enumerate(f, start=1):
        line = raw_line.strip()
        if not line:  # stop reading at the first blank line
            break
        if line.startswith("-") and line.endswith("-"):  # get genre
            current_genre = line.strip("-")
            continue
        if current_genre is None:
            # Compare against the None object. The earlier check against the
            # string 'None' was always true, so entries ahead of the first
            # header were stored with a missing genre.
            print(f'no genre header before line {line_number}; entry skipped')
            continue
        fields = line.split()  # any run of whitespace separates the fields
        if len(fields) != 3:  # detect invalid lines
            print(f'invalid input format at line {line_number}')
            continue
        title, author, period = fields
        data.append([title, current_genre, author, period])

df = pd.DataFrame(data, columns=["title", "genre", "author", "time"])

In [30]:
# Show the first five rows.
df.head()

Unnamed: 0,title,genre,author,time
0,尚书,史书,不详,先秦
1,左传,史书,左丘明,春秋
2,战国策,史书,刘向,汉
3,史记,史书,司马迁,汉
4,汉书,史书,班固,汉


### Read text

In [None]:
def get_content(title, base_path=None):
    """Concatenate every ``text.txt`` found under the folder of one work.

    Args:
        title: Name of the work's folder inside the source directory.
        base_path: Directory that holds the per-work folders. Defaults to
            the notebook-level ``source_text_path``.

    Returns:
        The contents of all ``text.txt`` files below the folder, joined with
        newlines and visited in sorted directory order. An empty string when
        the folder does not exist or holds no such file.
    """
    if base_path is None:
        base_path = source_text_path
    content_path = os.path.join(base_path, title)

    # isdir rather than exists: a plain file at this path is not a usable
    # folder either, and os.walk would silently yield nothing for it.
    if not os.path.isdir(content_path):
        print(f"folder '{content_path}' not found.")
        return ""

    all_text = []
    for root, dirs, files in os.walk(content_path):
        # os.walk lists entries in arbitrary order; sorting the directory
        # list in place makes the traversal (and the result) reproducible.
        dirs.sort()
        if "text.txt" not in files:
            continue
        file_path = os.path.join(root, "text.txt")
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                all_text.append(f.read())
        except (OSError, UnicodeDecodeError) as e:
            # Best effort: report the unreadable file and keep going.
            print(f"Error reading {file_path}: {e}")
    return "\n".join(all_text)

# Load the text of every title into a new "content" column.
df["content"] = df["title"].map(get_content)

In [32]:
# Show the first five rows, now including the loaded text.
df.head()

Unnamed: 0,title,genre,author,time,content
0,尚书,史书,不详,先秦,武王戎车三百两，虎贲三百人，与商战于牧野，作《牧誓》。\n时甲子昧爽，王朝至于商郊牧野，乃誓...
1,左传,史书,左丘明,春秋,【经】\n十有四年春，卫公叔戌来奔。\n卫赵阳出奔宋。\n二月辛巳，楚公子结、陈公孙佗人帅师...
2,战国策,史书,刘向,汉,齐明谓公叔曰： 齐逐几瑟，楚善之。今楚欲善齐甚，公何不令齐王谓楚王： 王为我逐几瑟以穷之。 ...
3,史记,史书,司马迁,汉,汉兴，接秦之坏，丈夫从军旅，老弱转粮饷，作业剧而财匮，自天子不能具钧驷，而将相或乘牛车，齐民...
4,汉书,史书,班固,汉,昔仲尼没而微言绝，七十子丧而大义乖。故《春秋》分为五，《诗》分为四，《易》有数家之传。战国从...
