# Linguistic Annotation -- ZH (Chr) -- surprisal, frequency

In [1]:
import os

# Show where the kernel is running from.
print(f"📁 Current working directory: {os.getcwd()}")

📁 Current working directory: /swdata/yin/Cui/Re-Veil/minicons


In [2]:
import pandas as pd
from minicons import scorer
from wordfreq import tokenize
from wordfreq import zipf_frequency
from wordfreq import word_frequency

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from transformers import BertTokenizerFast, GPT2LMHeadModel

# Checkpoint and device are named up front so they are easy to find and change.
# Switch DEVICE to e.g. 'cuda:0' to run on a GPU.
MODEL = "uer/gpt2-xlarge-chinese-cluecorpussmall"
DEVICE = 'cpu'

# Loading is slow, so the scorer is only built when this kernel does not have one yet.
if 'scorer_model' not in globals():
    print("🔧 Loading model and tokenizer...")
    # `return_dict` is a model/config option, not a tokenizer option, so it is not passed here.
    tokenizer = BertTokenizerFast.from_pretrained(MODEL)
    model = GPT2LMHeadModel.from_pretrained(MODEL)
    scorer_model = scorer.IncrementalLMScorer(model, tokenizer=tokenizer, device=DEVICE, stride=200)
    print("✅ Model and tokenizer loaded.")
else:
    print("✅ Reusing existing model.")

🔧 Loading model and tokenizer...
✅ Model and tokenizer loaded.


In [13]:
file_path = '/swdata/yin/Cui/Re-Veil/Re-Veil/post_processing/data/zh/trials/onestop_zh.tsv'

# Columns that are read as strings instead of letting pandas infer a dtype.
string_columns = (
    "experiment", "experiment_id", "condition_id",
    "stimulus_id", "stimulus_name", "page_id",
    "page_name", "item_id", "question_id", "response_true",
)

print(f"📄 Reading file from: {file_path}")
df = pd.read_csv(file_path, sep='\t', dtype=dict.fromkeys(string_columns, str))
print(f"✅ File loaded. Rows: {len(df)}")

# Drop unused columns (the question text and its distractors).
df = df.drop(columns=['question', 'distractor_1', 'distractor_2', 'distractor_3'])
print("🧹 Dropped unused columns.")

📄 Reading file from: /swdata/yin/Cui/Re-Veil/Re-Veil/post_processing/data/zh/trials/onestop_zh.tsv
✅ File loaded. Rows: 60
🧹 Dropped unused columns.


In [14]:
# Preprocessing: every row gets the same experiment id.
df['experiment_id'] = 'zh1'

# Cut each text at the "@#@" marker and give every piece its own row.
paragraph_lists = df['text'].str.split("@#@")
df = df.assign(text=paragraph_lists).explode('text')

# 0-based paragraph counter within each item; assigned before empty pieces are removed.
df['para_id'] = df.groupby("item_id").cumcount()

# Remove empty/whitespace-only paragraphs
has_content = df['text'].str.strip().astype(bool)
df = df[has_content]
print(f"📚 Cleaned and split text into paragraphs. Total: {len(df)}")

📚 Cleaned and split text into paragraphs. Total: 65


In [15]:
def _paragraph_token_scores(paragraph):
    """Score one paragraph with the shared scorer (surprisal in base two, with rank)."""
    return scorer_model.token_score(paragraph, surprisal=True, base_two=True, rank=True)


print("⚙️ Calculating token-level surprisal scores (this may take a while)...")
# One scorer call per paragraph row; the raw result is stored as-is in 'surp'.
df['surp'] = df['text'].apply(_paragraph_token_scores)
print("✅ Surprisal scores computed.")

print(df['surp'])

⚙️ Calculating token-level surprisal scores (this may take a while)...
✅ Surprisal scores computed.
0     [[([CLS], 0.0, 0, 0.0, 0.0, 0.0), (一, 6.035219...
0     [[([CLS], 0.0, 0, 0.0, 0.0, 0.0), (杨, 10.61592...
1     [[([CLS], 0.0, 0, 0.0, 0.0, 0.0), (这, 5.078294...
2     [[([CLS], 0.0, 0, 0.0, 0.0, 0.0), ([UNK], 8.49...
3     [[([CLS], 0.0, 0, 0.0, 0.0, 0.0), (随, 11.19388...
                            ...                        
55    [[([CLS], 0.0, 0, 0.0, 0.0, 0.0), (谈, 12.49851...
56    [[([CLS], 0.0, 0, 0.0, 0.0, 0.0), (这, 5.078294...
57    [[([CLS], 0.0, 0, 0.0, 0.0, 0.0), (让, 10.37613...
58    [[([CLS], 0.0, 0, 0.0, 0.0, 0.0), (凯, 12.34901...
59    [[([CLS], 0.0, 0, 0.0, 0.0, 0.0), ([UNK], 8.49...
Name: surp, Length: 65, dtype: object


In [16]:
# Sanity check: what kind of object does a 'surp' cell hold before unpacking?
print("Before first explode:", type(df['surp'].iat[0]))

Before first explode: <class 'list'>


In [17]:
# Explode token list.
# Each 'surp' cell is a list wrapping a list of per-token tuples, so two
# explodes are needed: the first unwraps the outer list, the second yields
# one row per token tuple.
df = df.explode('surp').explode('surp')

# This check runs after both explodes, so each cell is a single tuple here.
print("After first explode:", type(df['surp'].iloc[0]))

# Split tuple into separate columns ('surp' is overwritten with the scalar value).
token_fields = pd.DataFrame(df['surp'].tolist(), index=df.index)
df[['word', 'surp', 'rank', 'entropy', 'renyi', 'minsurp']] = token_fields

# Remove [CLS] and [SEP]
is_special_token = df['word'].isin(['[CLS]', '[SEP]'])
df = df[~is_special_token]
print(f"🚫 Filtered special tokens. Remaining tokens: {len(df)}")

After first explode: <class 'tuple'>
🚫 Filtered special tokens. Remaining tokens: 5238


In [18]:
# Drop temporary columns (reassigned instead of mutated in place).
df = df.drop(columns=['text'])

# Split every token into single characters so that each row holds one character;
# '[UNK]' is kept whole. Exploding repeats the token-level scores on every
# character row of that token.
# NOTE(review): if the tokenizer emits sub-word pieces with a '##' prefix, those
# '#' characters would become rows as well — TODO confirm against the data.
df['word'] = df['word'].apply(lambda x: list(x) if x != '[UNK]' else x)
df = df.explode('word')

# 0-based running positions of each character within page (item), paragraph,
# stimulus and experiment.
df['page_word_id'] = df.groupby('item_id').cumcount()
df['para_word_id'] = df.groupby(["item_id", "para_id"]).cumcount()
df['stim_word_id'] = df.groupby("stimulus_id").cumcount()
df['exp_word_id'] = df.groupby("experiment_id").cumcount()

# "<para_id>-<para_word_id>" label, built column-wise rather than with a
# row-by-row apply; the result is already a string column.
df['word_nr'] = df['para_id'].astype(str) + '-' + df['para_word_id'].astype(str)
df = df.rename(columns={'experiment_id': 'expr_id', 'condition_id': 'cond_id', 'stimulus_id': 'stim_id'})

In [19]:
# For each word in column 'word', get the frequency.
# If the word is [UNK], use the frequency of the Chinese quotation mark “ instead.
def get_zipf_frequency(word):
    """Return the `zipf_frequency` value of `word` in the 'zh' word list.

    The placeholder token '[UNK]' is looked up as the quotation mark '“'.
    """
    lookup = '“' if word == '[UNK]' else word
    return zipf_frequency(lookup, 'zh')

def get_normal_frequency(word):
    """Return the `word_frequency` value of `word` in the 'zh' word list.

    The placeholder token '[UNK]' is looked up as the quotation mark '“'.
    """
    lookup = '“' if word == '[UNK]' else word
    return word_frequency(lookup, 'zh')
    

In [20]:
# Look up both frequency measures for every entry of the 'word' column.
df['zipf_freq'] = df['word'].map(get_zipf_frequency)
df['normal_freq'] = df['word'].map(get_normal_frequency)
print("✅ Frequency features added.")

# Add token length; the '[UNK]' placeholder counts as length 1.
df['word_len'] = df['word'].map(lambda token: 1 if token == '[UNK]' else len(token))
print("🔍 Token length feature added.")

✅ Frequency features added.
🔍 Token length feature added.


In [21]:
# Write the annotated table as CSV, without the row index.
out_path = '/swdata/yin/Cui/Re-Veil/Re-Veil/post_processing/data/zh/trials/onestop_zh_annotation_chr.csv'
df.to_csv(path_or_buf=out_path, index=False)
print(f"💾 Saved to: {out_path}")

💾 Saved to: /swdata/yin/Cui/Re-Veil/Re-Veil/post_processing/data/zh/trials/onestop_zh_annotation_chr.csv


### TEST FUNCTIONS

In [36]:
from wordfreq import word_frequency

# Two example sentences that differ only in their final character.
sentences = [
    "在床上赖了一天，我想起床贵",
    "在床上赖了一天，我想起床了",
]
# print("💬 Example sentence:", sentences)

# print("⚙️ Calculating token-level surprisal scores for example sentence...")
# surps = scorer_model.token_score(sentences, surprisal=True, base_two=True)

# print("✅ Surprisal scores computed for example sentence.")
# for ele in surps:
#     print(ele)
#     # for word, surp, entropy, renyi, minsurp in ele:
#     #     print(f"Word: {word}, Surprisal: {surp}, Entropy: {entropy}, Rényi: {renyi}, MinSurp: {minsurp}")
# print("🔚 Finished processing.")

In [None]:
# Spot check: frequency of a single word in the 'zh' word list.
print(word_frequency(word='您们', lang='zh'))

2.23e-05


In [None]:
# Tokenize the first example sentence with wordfreq's `tokenize` ('zh') and
# print each token followed by its Zipf frequency on the next line.
tokens = tokenize(sentences[0], 'zh')

for token in tokens:
    print(token, zipf_frequency(token, 'zh'), sep='\n')

在
7.16
床上
4.6
赖
4.17
了
7.14
一天
5.25
我
6.95
想
6.03
起床
4.23
贵
4.64
