## Extract stimulus texts and original labels
Set up `data_dir` and run the following blocks one-by-one to:
1. extract texts and labels from `.csv` files;
2. revise some known typos.


In [1]:
import numpy as np
import pandas as pd
data_dir = './raw_data'

### ZuCo1-task1 (Normal Reading)
- Copy all `...ZuCo1/task_materials/xxx.csv` files into a new created dir (e.g., `.../ZuCo1/revised_csv/`).
- There are several format errors in some original `.csv` files (e.g., absence of column headers), you may take a few minutes to *manually correct* them according to the ERROR messages.

In [2]:
%pwd

'c:\\MSc Files\\MSc Project\\E2T-w-VJEPA\\glim-way\\GLIM\\data'

In [2]:
data_dir_1 = './raw_data/ZuCo1/revised_csv/'

In [4]:
df11_raw = pd.read_csv('./raw_data/ZuCo1/revised_csv/sentiment_labels_task1.csv', 
                       sep=';', header=0,  skiprows=[1], encoding='utf-8',
                       dtype={'sentence': str, 'control': str, 'sentiment_label':str})
# print(df1_raw)
# n_row, n_column = df11_raw.shape
df11 = df11_raw.rename(columns={'sentence': 'raw text', 
                            'sentiment_label': 'raw label'})
df11 = df11.reindex(columns=['raw text', 'dataset', 'task', 'control', 'raw label',])
                      
df11['dataset'] =  ['ZuCo1'] * df11.shape[0]  # each item is init as a tuple with len==1 for easy extension
df11['task'] =  ['task1'] * df11.shape[0]
df11['control'] = df11['control'].apply(lambda x: x == 'CONTROL')
print(df11.shape, df11.columns)
print(df11['raw text'].nunique())

(400, 5) Index(['raw text', 'dataset', 'task', 'control', 'raw label'], dtype='object')
0


### ZuCo1-task2 (Normal Reading)  
Note: there are multiple relation labels in some of the sentences

In [5]:
def reformat_relation_types(text):
    '''
    `VISITED` --> tuple(`VISITED`,)
    `AWARD;JOB_TITLE;NATIONALITY` --> tuple(`AWARD`,`JOB_TITLE`,`NATIONALITY`)
    `NO-RELATION` --> np.nan
    '''
    assert isinstance(text, str)
    if text == 'NO-RELATION':
        text = np.nan
    else:
        text = tuple(text.split(';'))
    return text

df12_raw = pd.read_csv(data_dir_1 + 'relations_labels_task2.csv', 
                       sep=',', header=0, encoding='utf-8',
                       dtype={'sentence': str,'control': str,'relation_types':str})
# n_row, n_column = df12_raw.shape
df12 = df12_raw.rename(columns={'sentence': 'raw text', 
                                'relation_types': 'raw label'})
df12 = df12.reindex(columns=['raw text', 'dataset', 'task', 'control', 'raw label',])
df12['dataset'] =  ['ZuCo1'] * df12.shape[0]
df12['task'] =  ['task2'] * df12.shape[0]
df12['control'] = df12['control'].apply(lambda x: x == 'CONTROL')
df12['raw label'] = df12['raw label'].apply(reformat_relation_types)
print(df12.shape, df12.columns)
print(df12['raw text'].nunique())

(300, 5) Index(['raw text', 'dataset', 'task', 'control', 'raw label'], dtype='object')
300


### ZuCo1-task3 (Task-specific Reading)  
Note: there are repeated sentences yet with different relation labels

In [6]:
# def assign_control_with_label(label):
#     assert label in ['AWARD', 'EDUCATION', 'EMPLOYER', 
#                    'FOUNDER', 'JOB_TITLE', 'NATIONALITY', 
#                    'POLITICAL_AFFILIATION', 'VISITED', 'WIFE',
#                    'CONTROL']
#     return True if label == 'CONTROL' else False

# df13_raw = pd.read_csv(data_dir_1 + '/relations_labels_task3.csv', 
#                        sep=';', header=0, encoding='utf-8', 
#                        dtype={'sentence': str, 'relation-type':str})
# df13 = df13_raw.rename(columns={'sentence': 'raw text', 
#                             'relation-type': 'raw label'})
# df13 = df13.reindex(columns=['raw text', 'dataset', 'task', 'control', 'raw label',])
# df13['dataset'] =  ['ZuCo1'] * df13.shape[0]
# df13['task'] =  ['task3'] * df13.shape[0]
# df13['control'] = df13['raw label'].apply(assign_control_with_label)
# # df13['label'] = df13['label'].apply(lambda x: x if x!='CONTROL' else np.nan)
# for i in range(df13.shape[0]):
#     label = df13.loc[i, 'raw label']
#     if label == 'CONTROL':
#         left = df13.loc[i-1, 'raw label']
#         right = df13.loc[i+1, 'raw label']
#         assert left == right
#         df13.loc[i, 'raw label'] = left

# print(df13.shape, df13.columns)
# print(df13['raw text'].nunique())
def assign_control_with_label(label):
    assert label in ['AWARD', 'EDUCATION', 'EMPLOYER', 
                   'FOUNDER', 'JOB_TITLE', 'NATIONALITY', 
                   'POLITICAL_AFFILIATION', 'VISITED', 'WIFE',
                   'CONTROL']
    return True if label == 'CONTROL' else False

df13_raw = pd.read_csv(data_dir_1 + '/relations_labels_task3.csv', 
                       sep=';', header=0, encoding='utf-8', 
                       dtype={'sentence': str, 'relation-type':str})
df13 = df13_raw.rename(columns={'sentence': 'raw text', 
                            'relation-type': 'raw label'})
df13 = df13.reindex(columns=['raw text', 'dataset', 'task', 'control', 'raw label',])
df13['dataset'] =  ['ZuCo1'] * df13.shape[0]
df13['task'] =  ['task3'] * df13.shape[0]
df13['control'] = df13['raw label'].apply(assign_control_with_label)

# ✅ FIX: Reset the index to ensure 0, 1, 2, ... indexing
df13 = df13.reset_index(drop=True)

for i in range(df13.shape[0]):
    label = df13.loc[i, 'raw label']
    if label == 'CONTROL':
        left = df13.loc[i-1, 'raw label']
        right = df13.loc[i+1, 'raw label']
        assert left == right
        df13.loc[i, 'raw label'] = left

print(df13.shape, df13.columns)
print(df13['raw text'].nunique())

(407, 5) Index(['raw text', 'dataset', 'task', 'control', 'raw label'], dtype='object')
386


### ZuCo2-task2 (Normal Reading)  
Note: there repeated sentences with unkown labels, we will drop them at the next step

In [7]:
def extract_merge(file_dir, n=1):
    sentence_path = file_dir + f'/nr_{n}.csv'
    control_path = file_dir + f'/nr_{n}_control_questions.csv'
    df_raw = pd.read_csv(sentence_path, sep=';', encoding='utf-8', header=None,
                         names = ['paragraph_id', 'sentence_id','sentence','control'],
                         dtype={'paragraph_id':str, 'sentence_id': str, 'sentence': str, 'control': str})
    df_control = pd.read_csv(control_path, sep=';', encoding='utf-8', header=0,
                             dtype={'paragraph_id':str, 'sentence_id': str,'control_question': str, 'correct_answer':str})
    assert df_raw[df_raw['control']=='CONTROL'].shape[0] == df_control.shape[0]
    df = pd.merge(df_raw, df_control, how='left', on=['paragraph_id', 'sentence_id'])
    return df

def merge_QA(q,a):
    if pd.isna(q):
        label = np.nan
    else:
        if q.endswith('...'):
            label = q.replace('...', ' '+a)
        elif q.endswith('?'):
            label = q + ' ' + a
        else:
            raise ValueError
    return label

file_dir = './raw_data/ZuCo2/task_materials'
df22_list = []
for i in range(1,8):
    df = extract_merge(file_dir, i)
    df22_list.append(df)
df22 = pd.concat(df22_list, ignore_index=True,)

labels=[]
for i in range(df22.shape[0]):
    label = merge_QA(df22['control_question'][i], df22['correct_answer'][i])
    labels.append(label)
df22['raw label'] = labels
df22['control'] = df22['control'].apply(lambda x: x == 'CONTROL')

df22 = df22.rename(columns={'sentence': 'raw text'})
df22 = df22.reindex(columns=['raw text', 'dataset', 'task', 'control', 'raw label',])
df22['dataset'] =  ['ZuCo2'] * df22.shape[0]
df22['task'] =  ['task2'] * df22.shape[0]
print(df22.shape[0], df22.columns)
print(df22['raw text'].nunique())
# print(df22['raw text'].value_counts())

370 Index(['raw text', 'dataset', 'task', 'control', 'raw label'], dtype='object')
365


### ZuCo2-task3 (Task-specific Reading)

In [8]:
def extract_task3(file_dir, n=1):
    file_path = file_dir + f'/tsr_{n}.csv'
    df_raw = pd.read_csv(file_path, sep=';', encoding='utf-8', header=None,
                         names = ['paragraph_id', 'sentence_id', 'sentence', 'label'],
                         dtype={'paragraph_id':str, 'sentence_id': str, 'sentence': str, 'label': str})
    df = df_raw.rename(columns={'sentence': 'raw text', 
                                'label': 'raw label'})
    df = df.reindex(columns=['raw text', 'dataset', 'task', 'control', 'raw label',])
    df['control'] = df['raw label'].apply(assign_control_with_label)
    unique_labels = df['raw label'].unique().tolist()
    unique_labels.remove('CONTROL')
    assert len(unique_labels) == 1
    df['raw label'] =  unique_labels * df.shape[0]
    df['dataset'] =  ['ZuCo2'] * df.shape[0]
    df['task'] =  ['task3'] * df.shape[0]
    return df

def assign_control_with_label(label):
    assert label in ['AWARD', 'EDUCATION', 'EMPLOYER', 
                   'FOUNDER', 'JOB_TITLE', 'NATIONALITY', 
                   'POLITICAL_AFFILIATION', 'VISITED', 'WIFE',
                   'CONTROL']
    return True if label == 'CONTROL' else False

file_dir = data_dir + '/ZuCo2/task_materials'
df23_list = []
for i in range(1,8):
    df = extract_task3(file_dir,i)
    df23_list.append(df)
df23 = pd.concat(df23_list, ignore_index=True,)
print(df23.shape[0], df23.columns)
print(df23['raw text'].nunique())

411 Index(['raw text', 'dataset', 'task', 'control', 'raw label'], dtype='object')
392


## Concat sub-tables and revise typos 

In [9]:
df = pd.concat([df11, df12, df13, df22, df23], ignore_index=True,)
print(df.shape, df.columns)

(1888, 5) Index(['raw text', 'dataset', 'task', 'control', 'raw label'], dtype='object')


In [10]:
df = pd.concat([df11, df12, df13, df22], ignore_index=True,)
print(df.shape, df.columns)

(1477, 5) Index(['raw text', 'dataset', 'task', 'control', 'raw label'], dtype='object')


### Revise each `raw text` according the typos we identified.

In [None]:
# typobook = {"emp11111ty":   "empty",
#             "film.1":       "film.",
#             "–":            "-",
#             "’s":           "'s",
#             "�s":           "'s",
#             "`s":           "'s",
#             "Maria":        "Marić",
#             "1Universidad": "Universidad",
#             "1902—19":      "1902 - 19",
#             "Wuerttemberg": "Württemberg",
#             "long -time":   "long-time",
#             "Jose":         "José",
#             "Bucher":       "Bôcher",
#             "1839 ? May":   "1839 - May",
#             "G�n�ration":  "Generation",
#             "Bragança":     "Bragana",
#             "1837?October": "1837 - October",
#             "nVera-Ellen":  "Vera-Ellen",
#             "write Ethics": "wrote Ethics",
#             "Adams-Onis":   "Adams-Onís",
#             "(40 km?)":     "(40 km²)",
#             "(40 km˝)":     "(40 km²)",
#             " (IPA: /?g?nz?b?g/) ": " ",
#             '""Canes""':    '"Canes"',

#             }

# def revise_typo(text):
#     # the typo book 
#     book = typobook
#     for src, tgt in book.items():
#         if src in text:
#             text = text.replace(src, tgt)
#     return text

# df['input text'] = df['raw text'].apply(revise_typo)
# print(df.columns)
# print(df['raw text'].nunique(), df['input text'].nunique())

TypeError: argument of type 'float' is not iterable

In [11]:
import pandas as pd

typobook = {"emp11111ty":   "empty",
            "film.1":       "film.",
            "–":            "-",
            "'s":           "'s",
            "�s":           "'s",
            "`s":           "'s",
            "Maria":        "Marić",
            "1Universidad": "Universidad",
            "1902—19":      "1902 - 19",
            "Wuerttemberg": "Württemberg",
            "long -time":   "long-time",
            "Jose":         "José",
            "Bucher":       "Bôcher",
            "1839 ? May":   "1839 - May",
            "G�n�ration":  "Generation",
            "Bragança":     "Bragana",
            "1837?October": "1837 - October",
            "nVera-Ellen":  "Vera-Ellen",
            "write Ethics": "wrote Ethics",
            "Adams-Onis":   "Adams-Onís",
            "(40 km?)":     "(40 km²)",
            "(40 km˝)":     "(40 km²)",
            " (IPA: /?g?nz?b?g/) ": " ",
            '""Canes""':    '"Canes"',
            }

def revise_typo(text):
    # Handle NaN/None values
    if pd.isna(text) or not isinstance(text, str):
        return text
    
    book = typobook
    for src, tgt in book.items():
        if src in text:
            text = text.replace(src, tgt)
    return text

# Check for NaN values first
print(f"NaN values in 'raw text': {df['raw text'].isna().sum()}")

df['input text'] = df['raw text'].apply(revise_typo)
print(df.columns)
print(df['raw text'].nunique(), df['input text'].nunique())

NaN values in 'raw text': 400
Index(['raw text', 'dataset', 'task', 'control', 'raw label', 'input text'], dtype='object')
943 925


In [12]:
pd.to_pickle(df, './tmp/zuco_label_input_text2.df')

In [13]:
%pwd

'c:\\MSc Files\\MSc Project\\E2T-w-VJEPA\\gated-glim\\GLIM\\data'

In [14]:
import pandas as pd
import os

# Check if the final training file exists
data_path = './tmp/zuco_eeg_label_8variants.df'

if os.path.exists(data_path):
    df = pd.read_pickle(data_path)
    print("✅ File exists!")
    print(f"Shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    
    # Check for required columns
    required = ['eeg', 'mask', 'phase', 'input text']
    missing = [c for c in required if c not in df.columns]
    
    if missing:
        print(f"❌ Missing required columns: {missing}")
    else:
        print("✅ All required columns present!")
        print(f"\nTrain samples: {(df['phase']=='train').sum()}")
        print(f"Val samples: {(df['phase']=='val').sum()}")
        print(f"Test samples: {(df['phase']=='test').sum()}")
else:
    print(f"❌ File not found: {data_path}")

✅ File exists!
Shape: (16316, 22)
Columns: ['eeg', 'mask', 'subject', 'label id', 'raw text', 'dataset', 'task', 'control', 'raw label', 'input text', 'text uid', 'sentiment label', 'relation label', 'lexical simplification (v0)', 'lexical simplification (v1)', 'semantic clarity (v0)', 'semantic clarity (v1)', 'syntax simplification (v0)', 'syntax simplification (v1)', 'naive rewritten', 'naive simplified', 'phase']
✅ All required columns present!

Train samples: 13023
Val samples: 1631
Test samples: 1662
