# Linguistic Annotation -- ZH (word) -- surprisal, frequency

In [1]:
import os
# Report the directory the kernel is running in.
print(f"📁 Current working directory: {os.getcwd()}")

📁 Current working directory: /swdata/yin/Cui/Re-Veil/minicons


In [3]:
import pandas as pd
from wordfreq import tokenize
from wordfreq import zipf_frequency
from wordfreq import word_frequency

In [4]:
# Tab-separated trial table to annotate.
file_path = '/swdata/yin/Cui/Re-Veil/Re-Veil/post_processing/data/zh/trials/onestop_zh.tsv'

print(f"📄 Reading file from: {file_path}")
# Identifier-like columns are read as strings instead of letting pandas infer
# a dtype for them.
df = pd.read_csv(
    file_path,
    sep='\t',
    dtype=dict.fromkeys(
        [
            "experiment", "experiment_id", "condition_id",
            "stimulus_id", "stimulus_name", "page_id",
            "page_name", "item_id", "question_id", "response_true",
        ],
        str,
    ),
)
print(f"✅ File loaded. Rows: {len(df)}")

# Drop unused columns (the question text and its three distractors).
df = df.drop(columns=['question', 'distractor_1', 'distractor_2', 'distractor_3'])
print("🧹 Dropped unused columns.")

# Preprocessing
df['experiment_id'] = 'zh1'
# One row per paragraph: paragraphs inside 'text' are separated by "@#@".
df = df.assign(text=df['text'].str.split("@#@")).explode('text')

# Number the paragraphs within each item. This runs *before* the filter
# below, so the ids keep a gap wherever an empty paragraph is removed.
df['para_id'] = df.groupby("item_id").cumcount()

# Remove empty/whitespace-only paragraphs. Missing text (NaN) is mapped to ''
# first: bool(NaN) is True, so without the fillna a NaN row would pass this
# filter and only fail later when it is handed to the tokenizer.
has_text = df['text'].str.strip().fillna('').astype(bool)
df = df[has_text]
print(f"📚 Cleaned and split text into paragraphs. Total: {len(df)}")

📄 Reading file from: /swdata/yin/Cui/Re-Veil/Re-Veil/post_processing/data/zh/trials/onestop_zh.tsv
✅ File loaded. Rows: 60
🧹 Dropped unused columns.
📚 Cleaned and split text into paragraphs. Total: 65


In [5]:
import spacy
# Load the "zh_core_web_sm" spaCy pipeline once; tokenize_text() below calls
# it for every paragraph.
nlp = spacy.load("zh_core_web_sm")

def tokenize_text(text):
    """Run ``text`` through the ``nlp`` pipeline and return the token texts as a list."""
    return [tok.text for tok in nlp(text)]

# Segment every paragraph, then reshape to one row per token; the raw
# paragraph text is dropped once the tokens exist.
df['tokens'] = df['text'].apply(tokenize_text)
df = df.explode('tokens').drop(columns=['text'])

# Running 0-based token counters within each page (item), paragraph,
# stimulus and experiment.
df = df.assign(
    page_token_id=lambda d: d.groupby('item_id').cumcount(),
    para_token_id=lambda d: d.groupby(["item_id", "para_id"]).cumcount(),
    stim_token_id=lambda d: d.groupby("stimulus_id").cumcount(),
    exp_token_id=lambda d: d.groupby("experiment_id").cumcount(),
)

# Split each token into a list of its characters; the list is stored in the
# 'word' column.
df['word'] = df['tokens'].apply(list)

In [6]:
# Unpack the character lists held in 'word': one row per character from here on.
df = df.explode(column='word')
# Quick look at the very first character.
print(df['word'].iat[0])

一


In [7]:
# Running 0-based counters over the character rows within each page (item),
# paragraph, stimulus and experiment.
df['page_word_id'] = df.groupby('item_id').cumcount()
df['para_word_id'] = df.groupby(["item_id", "para_id"]).cumcount()
df['stim_word_id'] = df.groupby("stimulus_id").cumcount()
df['exp_word_id'] = df.groupby("experiment_id").cumcount()

# "<para_id>-<para_word_id>" label. Built with vectorized string concatenation
# rather than a row-wise apply; the result is already a string column, so no
# further cast is needed.
df['word_nr'] = df['para_id'].astype(str) + '-' + df['para_word_id'].astype(str)

# Shorten the id column names.
df = df.rename(columns={'experiment_id': 'expr_id', 'condition_id': 'cond_id', 'stimulus_id': 'stim_id'})

In [8]:
# Preview the first five rows of the character-level table.
print(df.head(n=5))

  experiment expr_id condition cond_id stim_id   stimulus_name page_id  \
0  reveil-zh     zh1      real       1       1  Bottle-Message       1   
0  reveil-zh     zh1      real       1       1  Bottle-Message       1   
0  reveil-zh     zh1      real       1       1  Bottle-Message       1   
0  reveil-zh     zh1      real       1       1  Bottle-Message       1   
0  reveil-zh     zh1      real       1       1  Bottle-Message       1   

  item_id question_id response_true  ...  page_token_id para_token_id  \
0       1         NaN           NaN  ...              0             0   
0       1         NaN           NaN  ...              0             0   
0       1         NaN           NaN  ...              0             0   
0       1         NaN           NaN  ...              0             0   
0       1         NaN           NaN  ...              1             1   

   stim_token_id  exp_token_id  word  page_word_id para_word_id  stim_word_id  \
0              0             0     

In [10]:
def get_zipf_frequency(word):
    """Look up ``word`` with ``zipf_frequency`` for the language code 'zh'.

    For the placeholder '[UNK]', the frequency of the Chinese opening
    quotation mark “ is looked up instead.
    """
    lookup = '“' if word == '[UNK]' else word
    return zipf_frequency(lookup, 'zh')

def get_normal_frequency(word):
    """Look up ``word`` with ``word_frequency`` for the language code 'zh'.

    For the placeholder '[UNK]', the frequency of the Chinese opening
    quotation mark “ is looked up instead.
    """
    lookup = '“' if word == '[UNK]' else word
    return word_frequency(lookup, 'zh')
    

In [11]:
# Frequency features are looked up from the full token in 'tokens' (not the
# single character in 'word'), so all character rows of one token share the
# same values.
df['zipf_freq'] = df['tokens'].map(get_zipf_frequency)
df['normal_freq'] = df['tokens'].map(get_normal_frequency)
print("✅ Frequency features added.")

# Add token length: number of characters in the token, with '[UNK]' counted as 1.
df['token_len'] = df['tokens'].map(lambda tok: 1 if tok == '[UNK]' else len(tok))
print("🔍 Token length feature added.")

Building prefix dict from /swdata/yin/miniconda3/envs/annotation/lib/python3.10/site-packages/wordfreq/data/jieba_zh.txt ...
Loading model from cache /tmp/jieba.u009235f70d051c3f8b73eccee1f6adf1.cache
Loading model cost 0.059 seconds.
Prefix dict has been built successfully.


✅ Frequency features added.
🔍 Token length feature added.


In [12]:
# Character-level annotation table; identifier columns are read as strings.
annotation_chr = pd.read_csv(
    '/swdata/yin/Cui/Re-Veil/Re-Veil/post_processing/data/zh/trials/onestop_zh_annotation_chr.csv',
    dtype=dict.fromkeys(
        [
            "experiment", "expr_id", "cond_id", "condition",
            "stim_id", "stimulus_name", "page_id",
            "item_id", "question_id", "response_true",
        ],
        str,
    ),
)
# Drop the columns that are not needed: zipf_freq, normal_freq, word_len.
annotation_chr = annotation_chr.drop(columns=['zipf_freq', 'normal_freq', 'word_len'])

# Join keys.
shared_columns = [
    'experiment', 'expr_id', 'condition', 'cond_id', 'stim_id', 'stimulus_name',
    'page_id', 'item_id', 'question_id', 'response_true',
    'para_id', 'page_word_id', 'para_word_id', 'stim_word_id', 'exp_word_id', 'word_nr',
]

# Left join keeps every row of df. 'word' is not a join key, so the merge
# suffixes it as word_x (from df) and word_y (from the annotation table);
# only df's copy is kept, under its original name.
df = (
    df.merge(annotation_chr, on=shared_columns, how='left')
    .drop(columns=['word_y'])
    .rename(columns={'word_x': 'word'})
)

In [13]:
# Sum the four measures over all rows that share an exp_token_id, i.e. over
# the characters of one token (presumably the per-character values supplied
# by the annotation table merged earlier -- confirm).
# min_count=1 keeps the sum NaN when every character of a token is missing a
# value (e.g. no match in the left merge); the default min_count=0 would
# report such tokens as 0.0.
agg_df = (
    df.groupby('exp_token_id')[['surp', 'entropy', 'renyi', 'minsurp']]
    .sum(min_count=1)
    .reset_index()
)
# Replace the per-character values with the per-token sums.
# NOTE: this cell is not idempotent -- running it a second time sums the
# already aggregated values again.
df = df.drop(columns=['surp', 'entropy', 'renyi', 'minsurp'])  # Remove old values
df = df.merge(agg_df, on='exp_token_id', how='left')

In [14]:
# Write the annotated table to CSV without the DataFrame index.
out_path = '/swdata/yin/Cui/Re-Veil/Re-Veil/post_processing/data/zh/trials/onestop_zh_annotation_words.csv'
df.to_csv(path_or_buf=out_path, index=False)
print(f"💾 Saved to: {out_path}")

💾 Saved to: /swdata/yin/Cui/Re-Veil/Re-Veil/post_processing/data/zh/trials/onestop_zh_annotation_words.csv
