# Linguistic Annotation -- EN -- surprisal, frequency

In [1]:
import os
# Report the directory this kernel is currently running in.
print("📁 Current working directory:", os.getcwd())

📁 Current working directory: /swdata/yin/Cui/Re-Veil/minicons


In [2]:
import pandas as pd
from minicons import scorer
from wordfreq import tokenize
from wordfreq import zipf_frequency
from wordfreq import word_frequency

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Set the device
# device = 'cuda:0'

# Build the scorer once per kernel session: if a `scorer_model` is already
# present in the notebook globals it is kept as-is.
if 'scorer_model' in globals():
    print("✅ Reusing existing model.")
else:
    print("🔧 Loading model and tokenizer...")
    MODEL = "gpt2"
    tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)
    model = AutoModelForCausalLM.from_pretrained(MODEL, return_dict=True)
    # The scorer wraps the already-loaded model/tokenizer pair and is placed on the CPU.
    scorer_model = scorer.IncrementalLMScorer(
        model,
        tokenizer=tokenizer,
        device='cpu',
        stride=200,
        bos_token=True,
    )
    print("✅ Model and tokenizer loaded.")

🔧 Loading model and tokenizer...
✅ Model and tokenizer loaded.




In [4]:
def whitespace_tokenizer(text):
    """Return ``text.split()``: the pieces of ``text`` separated by runs of whitespace.

    Leading and trailing whitespace produce no empty pieces, so an empty or
    all-whitespace string yields an empty list.
    """
    words = text.split()
    return words

In [5]:
# sentences = ["Thought to be the world’s oldest message in a bottle"]
# Two example sentences that differ only in their final word.
sentences = [
    "The sketch of those trucks hasn't",
    "The sketch of those trucks haven't",
]

# Score both sentences word by word, splitting on whitespace.
# Judging by the flag names, this requests base-two surprisal without added
# BOS/EOS tokens — confirm against the scorer implementation in use.
scores = scorer_model.word_score_tokenized(
    batch=sentences,
    tokenize_function=whitespace_tokenizer,
    surprisal=True,
    base_two=True,
    bow_correction=True,
    bos_token=False,
    eos_token=False,
)

In [6]:
# Display the raw per-word score tuples for the example sentences.
scores

[[('The', 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0),
  ('sketch',
   15.969964027404785,
   12.72680950164795,
   14.308034896850586,
   6.200345993041992,
   15.969964027404785,
   12.72680950164795,
   14.308034896850586,
   6.200345993041992),
  ('of',
   3.3498291969299316,
   6.965978622436523,
   11.075484275817871,
   2.9639241695404053,
   3.3498291969299316,
   6.965978622436523,
   11.075484275817871,
   2.9639241695404053),
  ('those',
   9.617044448852539,
   7.5691118240356445,
   12.541632652282715,
   1.8073477745056152,
   9.617044448852539,
   7.5691118240356445,
   12.541632652282715,
   1.8073477745056152),
  ('trucks',
   13.100157737731934,
   10.958047866821289,
   13.142509460449219,
   3.6612987518310547,
   13.100157737731934,
   10.958047866821289,
   13.142509460449219,
   3.6612987518310547),
  ("hasn't",
   12.35189151763916,
   7.586394309997559,
   12.198198318481445,
   3.5115060806274414,
   6.17594575881958,
   3.7931971549987793,
   6.099099159240723,
 

In [7]:
# sentences = ["Thought to be the world’s oldest message in a bottle", "The sketch of those trucks haven't"]

# ts = scorer_model.token_score(sentences, surprisal = True, base_two = True)
# print("Token scores:")
# for i, sentence in enumerate(sentences):
#     print(f"Sentence {i+1}:")
#     for token, surp, entropies, renyi_entropies, min_surprisals in ts[i]:
#         print(f"  Token: {token}, Surprisal: {surp:.4f}, Entropies: {entropies}, Renyi Entropies: {renyi_entropies}, Min Surprisals: {min_surprisals}")
#     print()

In [8]:
file_path = '/swdata/yin/Cui/Re-Veil/Re-Veil/post_processing/data/en/trials/onestop_en.tsv'

print(f"📄 Reading file from: {file_path}")
# The listed columns are forced to str instead of letting pandas infer a dtype.
df = pd.read_csv(
    file_path,
    sep='\t',
    dtype=dict.fromkeys(
        [
            "experiment", "experiment_id", "condition_id",
            "stimulus_id", "stimulus_name", "page_id",
            "page_name", "item_id", "question_id", "response_true",
        ],
        str,
    ),
)
print(f"✅ File loaded. Rows: {len(df)}")

# The question text and its three distractors are not used further down.
df = df.drop(columns=['question', 'distractor_1', 'distractor_2', 'distractor_3'])
print("🧹 Dropped unused columns.")

📄 Reading file from: /swdata/yin/Cui/Re-Veil/Re-Veil/post_processing/data/en/trials/onestop_en.tsv
✅ File loaded. Rows: 46
🧹 Dropped unused columns.


In [9]:
# Preprocessing: every row gets the same experiment id.
df['experiment_id'] = 'en1'

# Split the text on the "@#@" separator and give each piece its own row.
df = df.assign(text=df['text'].str.split("@#@"))
df = df.explode('text')

# Zero-based position of each piece within its item.
# NOTE(review): this is assigned before blank pieces are removed below, so the
# surviving para_id values can have gaps — confirm that is intended.
df['para_id'] = df.groupby("item_id").cumcount()

# Keep only pieces that are non-empty once surrounding whitespace is stripped.
df = df.loc[df['text'].str.strip().astype(bool)]
print(f"📚 Cleaned and split text into paragraphs. Total: {len(df)}")

📚 Cleaned and split text into paragraphs. Total: 49


In [10]:
print("⚙️ Calculating token-level surprisal scores (this may take a while)...")
# Each row's text is handed to the scorer as a single string, using the same
# options as the example call on the two test sentences.
df['surp'] = df['text'].apply(
    lambda paragraph: scorer_model.word_score_tokenized(
        batch=paragraph,
        tokenize_function=whitespace_tokenizer,
        surprisal=True,
        base_two=True,
        bow_correction=True,
        bos_token=False,
        eos_token=False,
    )
)
print("✅ Surprisal scores computed.")

⚙️ Calculating token-level surprisal scores (this may take a while)...
✅ Surprisal scores computed.


In [11]:
# Sanity check: show what kind of object the first 'surp' cell holds before unnesting.
print("Before first explode:", type(df['surp'].iloc[0]))

Before first explode: <class 'list'>


In [12]:
# Unnest the score column twice so that every row ends up holding one score
# tuple (presumably the scorer returns one inner list per input text, hence
# the extra nesting level — confirm against the scorer's return value).
df = df.explode('surp').explode('surp')

# The label below says "first", but this line runs after both explodes.
print("After first explode:", type(df['surp'].iloc[0]))

# Spread each tuple over nine named columns; 'surp' itself is overwritten with
# the tuple's second field. The helper frame reuses df's index so the rows
# line up one-to-one.
df[[
    'word', 'surp', 'entropy', 'renyi', 'minsurp',
    'msurp', 'mentropy', 'mrenyi', 'mminsurp',
]] = pd.DataFrame(df['surp'].tolist(), index=df.index)


# Remove [CLS] and [SEP]
# df = df[~df['word'].isin(['[CLS]', '[SEP]'])]
# print(f"🚫 Filtered special tokens. Remaining tokens: {len(df)}")

After first explode: <class 'tuple'>


In [13]:
# Drop temporary columns (non-inplace so the cell produces a fresh frame).
df = df.drop(columns=['text'])
# df['word'] = df['word'].apply(lambda x: list(x) if x != '[UNK]' else x)
# df = df.explode('word')

# Zero-based running word counters at each grouping level.
# NOTE(review): 'page_word_id' is grouped by item_id, not page_id — confirm intended.
df['page_word_id'] = df.groupby('item_id').cumcount()
df['para_word_id'] = df.groupby(["item_id", "para_id"]).cumcount()
df['stim_word_id'] = df.groupby("stimulus_id").cumcount()
df['exp_word_id'] = df.groupby("experiment_id").cumcount()

# "<para_id>-<para_word_id>" label, built column-wise rather than with a
# row-by-row apply; the result is already a string column, so no further cast
# is needed.
df['word_nr'] = df['para_id'].astype(str) + '-' + df['para_word_id'].astype(str)

# Shorten the id column names for the exported table.
df = df.rename(columns={'experiment_id': 'expr_id', 'condition_id': 'cond_id', 'stimulus_id': 'stim_id'})

In [14]:
# For each word in the 'word' column, look up its frequency.
# If the word is '[UNK]', use the frequency of the Chinese quotation mark “ instead.
def get_zipf_frequency(word):
    """Return ``zipf_frequency`` of ``word`` for the ``'en'`` word list.

    The placeholder ``'[UNK]'`` is looked up as the quotation mark ``“`` instead.
    """
    lookup = '“' if word == '[UNK]' else word
    return zipf_frequency(lookup, 'en')

def get_normal_frequency(word):
    """Return ``word_frequency`` of ``word`` for the ``'en'`` word list.

    The placeholder ``'[UNK]'`` is looked up as the quotation mark ``“`` instead.
    """
    lookup = '“' if word == '[UNK]' else word
    return word_frequency(lookup, 'en')
    

In [15]:
# Look up both frequency measures for every entry of the 'word' column.
df['zipf_freq'] = df['word'].map(get_zipf_frequency)
df['normal_freq'] = df['word'].map(get_normal_frequency)
print("✅ Frequency features added.")

# Character count per word; the '[UNK]' placeholder is counted as length 1.
df['word_len'] = df['word'].map(lambda token: 1 if token == '[UNK]' else len(token))
print("🔍 Token length feature added.")

✅ Frequency features added.
🔍 Token length feature added.


In [16]:
out_path = '/swdata/yin/Cui/Re-Veil/Re-Veil/post_processing/data/en/trials/onestop_en_annotation_m.csv'
# Write the annotated table as CSV; the frame's index is left out of the file.
df.to_csv(path_or_buf=out_path, index=False)
print(f"💾 Saved to: {out_path}")

💾 Saved to: /swdata/yin/Cui/Re-Veil/Re-Veil/post_processing/data/en/trials/onestop_en_annotation_m.csv
