In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

### Constants

In [59]:
# Input / output locations, relative to the notebook's working directory.
metadata_file = './annotations'    # title / author / period listing, grouped by genre
output_path = './out'              # folder for generated results
source_text_path = './古文原文'     # root folder of the source texts, one sub-folder per title

# Some classical Chinese modal particles (each entry is a single character).
function_words = [
    '也', '哉', '乎', '兮', '夫', '焉',
    '耶', '者', '矣', '耳', '邪',
]

# Modern punctuation marks that can end a clause.
# Note: the last two entries ('……' and '——') are two characters long.
punctuations = [
    '，', '。', '”', '！', '？', '；', '：',
    '……', '——',
]

### Read metadata file

In [60]:
# Parse the metadata file into rows of [title, genre, author, time].
#
# Expected layout, as handled below:
#   -<genre>-                  a line starting and ending with "-" opens a genre section
#   <title> <author> <time>    three fields separated by single spaces
# Reading stops at the first blank line.
data = []
current_genre = None
line_number = 0

# Explicit UTF-8: the file contains Chinese text and the platform default
# encoding is not guaranteed to be UTF-8.
with open(metadata_file, "r", encoding="utf-8") as f:
    for raw_line in f:
        line_number += 1
        entry = raw_line.strip()
        if not entry:  # stop reading at the first blank line
            break
        if entry.startswith("-") and entry.endswith("-"):  # get genre
            current_genre = entry.strip("-")
        elif current_genre is not None:
            fields = entry.split(' ')
            if len(fields) != 3:  # detect invalid lines
                print(f'invalid input format at line {line_number}')
            else:  # get info from line
                title, author, time = fields
                data.append([title, current_genre, author, time])
        else:
            # An entry that appears before any genre header has no genre to
            # attach to; report it instead of storing a row with genre None.
            print(f'no genre header before line {line_number}; entry skipped')

df = pd.DataFrame(data, columns=["title", "genre", "author", "time"])

In [61]:
# Preview the first five rows.
df.head()

Unnamed: 0,title,genre,author,time
0,尚书,史书,不详,先秦
1,左传,史书,左丘明,春秋
2,战国策,史书,刘向,汉
3,史记,史书,司马迁,汉
4,汉书,史书,班固,汉


### Read text

In [62]:
def get_content(title, base_path=None):
    """Return the concatenated text of every ``text.txt`` found under a title's folder.

    Parameters
    ----------
    title : str
        Name of the sub-folder (below ``base_path``) that holds the work.
    base_path : str, optional
        Root folder containing one sub-folder per title. Defaults to the
        module-level ``source_text_path``.

    Returns
    -------
    str
        Contents of all ``text.txt`` files below the title folder, joined
        with newlines. Sub-folders are visited in sorted name order so the
        result is reproducible. Returns ``""`` when the folder is missing or
        contains no ``text.txt``.
    """
    if base_path is None:
        base_path = source_text_path
    content_path = os.path.join(base_path, title)

    if not os.path.isdir(content_path):  # check folder exists
        print(f"folder '{content_path}' not found.")
        return ""

    all_text = []
    for root, dirs, files in os.walk(content_path):
        # Sorting in place fixes the order in which os.walk descends; without
        # it the traversal order depends on the filesystem.
        dirs.sort()
        if "text.txt" not in files:
            continue
        file_path = os.path.join(root, "text.txt")
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                all_text.append(f.read())
        except (OSError, UnicodeError) as e:
            # Best effort: report the unreadable file and keep going.
            print(f"Error reading {file_path}: {e}")
    return "\n".join(all_text)

# Load the full text of every title into a new "content" column.
df["content"] = df["title"].map(get_content)

In [63]:
# Preview the first five rows, now including the loaded text.
df.head()

Unnamed: 0,title,genre,author,time,content
0,尚书,史书,不详,先秦,武王戎车三百两，虎贲三百人，与商战于牧野，作《牧誓》。\n时甲子昧爽，王朝至于商郊牧野，乃誓...
1,左传,史书,左丘明,春秋,【经】\n十有四年春，卫公叔戌来奔。\n卫赵阳出奔宋。\n二月辛巳，楚公子结、陈公孙佗人帅师...
2,战国策,史书,刘向,汉,齐明谓公叔曰： 齐逐几瑟，楚善之。今楚欲善齐甚，公何不令齐王谓楚王： 王为我逐几瑟以穷之。 ...
3,史记,史书,司马迁,汉,汉兴，接秦之坏，丈夫从军旅，老弱转粮饷，作业剧而财匮，自天子不能具钧驷，而将相或乘牛车，齐民...
4,汉书,史书,班固,汉,昔仲尼没而微言绝，七十子丧而大义乖。故《春秋》分为五，《诗》分为四，《易》有数家之传。战国从...


### Get stats

In [64]:
def analyze_text(text, words=None, marks=None):
    """Count function words in ``text`` and how often they end a clause.

    A function word counts as clause-final when it is immediately followed
    by one of the punctuation marks. Marks may be longer than one character
    (e.g. '……'); they are matched as whole strings starting right after the
    function word.

    Parameters
    ----------
    text : str
        Text to analyse.
    words : sequence of str, optional
        Single-character function words to count. Defaults to the
        module-level ``function_words``.
    marks : sequence of str, optional
        Punctuation marks that end a clause. Defaults to the module-level
        ``punctuations``.

    Returns
    -------
    list
        ``[total_chars, total_function_words, frequency_all, fraction_at_end_all]``
        followed by, for each word in order: its count, then its frequency,
        then its clause-final fraction (three consecutive runs of
        ``len(words)`` values).

        Note that both "frequency" values use the clause-final count as the
        numerator (clause-final occurrences / ``total_chars``), and
        ``total_chars`` is ``len(text)``, so it includes punctuation and
        newlines. Every ratio is ``0.0`` when its denominator is zero.
    """
    words = function_words if words is None else words
    marks = punctuations if marks is None else marks
    # str.startswith accepts a tuple of candidates, which lets multi-character
    # marks match; a single-character lookup can never equal them.
    clause_end_marks = tuple(marks)

    total_chars = len(text)
    function_word_counts = {word: 0 for word in words}
    function_word_end_counts = {word: 0 for word in words}

    # iterate over text
    for i, char in enumerate(text):
        if char in function_word_counts:  # if is a function word
            function_word_counts[char] += 1
            # check if it's followed by a punctuation mark (i.e., at the end of a clause)
            if text.startswith(clause_end_marks, i + 1):
                function_word_end_counts[char] += 1

    total_function_words = sum(function_word_counts.values())
    total_function_words_at_end = sum(function_word_end_counts.values())

    # frequency of clause-final function words relative to text length
    function_words_frequency_all = (
        total_function_words_at_end / total_chars if total_chars > 0 else 0.0
    )

    # fraction of function words at clause endings
    fraction_function_words_at_end_all = (
        total_function_words_at_end / total_function_words if total_function_words > 0 else 0.0
    )

    # frequency of each function word (clause-final occurrences / text length)
    function_words_frequency_each = [
        (function_word_end_counts[word] / total_chars) if total_chars > 0 else 0.0
        for word in words
    ]

    # fraction of each function word at clause endings
    fraction_function_words_at_end_each = [
        (function_word_end_counts[word] / function_word_counts[word]) if function_word_counts[word] > 0 else 0.0
        for word in words
    ]

    result = [total_chars, total_function_words, function_words_frequency_all, fraction_function_words_at_end_all]
    result += [function_word_counts[word] for word in words]
    result += function_words_frequency_each
    result += fraction_function_words_at_end_each

    return result

# Column labels, listed in the same order as the values returned by
# analyze_text: four overall statistics, then one count, one frequency and one
# clause-final fraction per function word.
new_columns = (
    ["total_chars", "total_function_words", "total_function_words_frequency", "fraction_function_words_at_end"]
    + [f"{word}_count" for word in function_words]
    + [f"{word}_frequency" for word in function_words]
    + [f"{word}_fraction_at_end" for word in function_words]
)

# Analyse every text and spread the resulting values across the new columns.
df[new_columns] = df["content"].apply(
    lambda content: pd.Series(analyze_text(content))
)

In [65]:
# Show the column labels of df after the statistics columns were added.
df.columns

Index(['title', 'genre', 'author', 'time', 'content', 'total_chars',
       'total_function_words', 'total_function_words_frequency',
       'fraction_function_words_at_end', '也_count', '哉_count', '乎_count',
       '兮_count', '夫_count', '焉_count', '耶_count', '者_count', '矣_count',
       '耳_count', '邪_count', '也_frequency', '哉_frequency', '乎_frequency',
       '兮_frequency', '夫_frequency', '焉_frequency', '耶_frequency',
       '者_frequency', '矣_frequency', '耳_frequency', '邪_frequency',
       '也_fraction_at_end', '哉_fraction_at_end', '乎_fraction_at_end',
       '兮_fraction_at_end', '夫_fraction_at_end', '焉_fraction_at_end',
       '耶_fraction_at_end', '者_fraction_at_end', '矣_fraction_at_end',
       '耳_fraction_at_end', '邪_fraction_at_end'],
      dtype='object')

In [69]:
# Write every column except the raw text to <output_path>/stats.csv.
# The folder is created first so the export also works on a fresh checkout.
os.makedirs(output_path, exist_ok=True)
df.loc[:, df.columns != 'content'].to_csv(os.path.join(output_path, 'stats.csv'), index=False)