In [None]:
# =============================================================================
# Step 1: Environment Setup - Mount Drive and Install Dependencies
# =============================================================================
# Colab-only: mount Google Drive so that the project code and data stored under
# /content/drive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

# Install required packages
# NOTE(review): none of the installs below pin a version, so a fresh runtime may
# resolve different releases than the ones used for earlier runs — consider pinning.
!pip install torch pandas tqdm scikit-learn
!pip install torch_geometric
# Editable install of the project's own package straight from Drive; this needs
# the mount above to have succeeded.
!pip install -e /content/drive/MyDrive/PhD/Study2_review/Published_Codes_Study_2/Production/codes
!pip install accelerate
!pip install -U bitsandbytes
!pip install peft

Mounted at /content/drive
Collecting torch_geometric
  Downloading torch_geometric-2.7.0-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.7.0-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m77.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.7.0
Obtaining file:///content/drive/MyDrive/PhD/Study2_review/Published_Codes_Study_2/Production/codes
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: codes
  Running setup.py develop for codes
Successfully installed codes-0.1
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# =============================================================================
# Step 2: Imports, Deterministic Setup, BERT Model & Tokenizer
# =============================================================================
import os
import warnings
warnings.filterwarnings("ignore")

import random
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer
import pandas as pd
from tqdm import tqdm
import sqlite3
import json

# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()

# -------- Deterministic setup --------
SEED = 9898
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes, not this kernel —
# confirm if hash ordering matters for the pipeline.
os.environ["PYTHONHASHSEED"] = str(SEED)
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  # helps determinism on CUDA

# Seed every RNG in use: Python's random, NumPy, PyTorch CPU and all CUDA devices.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# If some ops are not deterministic, warn_only=True avoids hard crashes.
# NOTE(review): warnings.filterwarnings("ignore") is set at the top of this cell,
# so these non-determinism warnings are likely never displayed — confirm.
torch.use_deterministic_algorithms(True, warn_only=True)

# -------- Device --------
# Use the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# -------- BERT model & tokenizer (FP32/FP16, NOT 8-bit) --------
# Tokenizer and encoder are loaded from the same checkpoint name so that the
# vocabulary matches the model weights.
model_name = "google-bert/bert-large-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModel.from_pretrained(model_name)
# Optional: use half precision if VRAM is tight
# model = model.half()
model.to(device)
model.eval()  # VERY important: turns off dropout inside BERT

# =============================================================================
# Step 3: Deterministic BERT Encoding Function
# =============================================================================
def bertEncode(inputs, max_len=100, batch_size=32):
    """
    Turn a list of texts into BERT [CLS] embeddings, one mini-batch at a time.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        inputs (list of str): Texts to embed.
        max_len (int): Every sequence is truncated/padded to exactly this
            many tokens (special tokens included).
        batch_size (int): How many texts are pushed through the model per
            forward pass.

    Returns:
        list: One 1D numpy array per input text, in input order.
    """
    collected = []
    n_texts = len(inputs)

    # Walk the inputs in fixed, contiguous windows so batching never varies
    # between runs.
    for offset in range(0, n_texts, batch_size):
        chunk = inputs[offset:offset + batch_size]

        tokens = tokenizer(
            chunk,
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            max_length=max_len,
            return_tensors="pt",
        )
        ids = tokens["input_ids"].to(device)
        mask = tokens["attention_mask"].to(device)

        # Inference only: no autograd bookkeeping needed.
        with torch.no_grad():
            hidden = model(input_ids=ids, attention_mask=mask).last_hidden_state
            # Position 0 holds the [CLS] token -> [batch, hidden]
            cls_vectors = hidden[:, 0, :]

        # Iterating the 2D array yields one 1D row per text.
        for row in cls_vectors.cpu().numpy():
            collected.append(row)

    return collected


Using device: cuda


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

In [None]:
import re

def compact_tesa(raw: str) -> str:
    """
    Convert a verbose TESA annotation like:
       'neutral emotion & AGAINST stance towards Warren; ...'
    into grouped compact tags:
       '[Emotion:neutral Stance:AGAINST Target:Warren]'

    Args:
        raw: ';'-separated segments, each shaped like
            '<emotion> emotion & <stance> stance towards <target>'.
            Any non-string value (None, NaN, ...) is treated as missing.

    Returns:
        One tag per parseable segment, joined by single spaces. Returns ""
        when the input is missing or no segment can be parsed. Segments
        without '&' or 'towards' are skipped. Spaces in the target become
        '_' and every other non-alphanumeric character is dropped; an empty
        target yields 'Target:'.
    """
    # None and NaN are not str, so this single check covers every "missing"
    # case and, unlike pd.isna(), never raises on array-like values.
    if not isinstance(raw, str):
        return ""

    segments = [s.strip(" ,") for s in raw.split(";") if s.strip(" ,")]

    tags = []
    for seg in segments:
        try:
            # Extract emotion
            emo = seg.split(" emotion")[0].strip()

            # Extract stance
            after_amp = seg.split("&", 1)[1]
            stance = after_amp.split(" stance")[0].strip()

            # Extract target
            target = seg.split("towards", 1)[1].strip(" ,")
        except IndexError:
            # Malformed segment (no '&' or no 'towards'): skip it. Only
            # IndexError is caught so that KeyboardInterrupt and genuine
            # bugs are no longer silently swallowed.
            continue

        target = target.replace(" ", "_")
        target = re.sub(r"[^A-Za-z0-9_]", "", target)

        tags.append(f"[Emotion:{emo} Stance:{stance} Target:{target}]")

    return " ".join(tags)




In [None]:
# Check whether max_len=500 tokens is enough (i.e. how many inputs would be truncated)
import sqlite3
import pandas as pd
from transformers import AutoTokenizer


# =============================================================================
# Step 4: Set Up Dataset Paths and Configurations
# =============================================================================
base_path = '/content/drive/MyDrive/PhD/Study2_review/FibVid_EY_KC/Production/data'

# Maps each dataset path prefix to {input_mode: max_len}. The SQLite file for a
# dataset is '<prefix>_data.db'.
datasets = {
    f'{base_path}/fibvid': {'text_emo': 500, 'text_only': 500},
    f'{base_path}/ts':     {'text_emo': 500, 'text_only': 500},
}

# =============================================================================
# Step 5 (dry run): Token-Length Check Before Generating BERT Embeddings
# =============================================================================
# For every dataset and input mode, build the exact text that would be fed to
# BERT, tokenize it WITHOUT truncation and report how many rows exceed max_len.
# Nothing is written to disk here.
for dataset_path, mode_config in datasets.items():
    db_path = f'{dataset_path}_data.db'
    print(f"\n==============================")
    print(f"Processing database: {db_path}")

    # Read the table once per dataset and always release the connection,
    # even if the query fails.
    con = sqlite3.connect(db_path)
    try:
        raw_data = pd.read_sql_query('SELECT * FROM data', con)
    finally:
        con.close()

    if raw_data.empty:
        # Nothing to measure; also avoids .iloc[0] / division-by-zero below.
        print("Table 'data' is empty - skipping.")
        continue

    for input_mode, max_len in mode_config.items():
        print(f"\n----- Mode: {input_mode} (max_len={max_len}) -----")

        # Work on a fresh copy so columns added for one mode do not leak
        # into the preview of the next mode.
        data = raw_data.copy()

        print(f"\nRaw data preview for {dataset_path} ({input_mode}):")
        display(data.head())
        print("Example raw text:")
        display(data.text.iloc[0])

        # --- Build the input text used for BERT ---
        if input_mode == 'text_emo':
            # Compact emotion/stance/target tags + [SEP] + original text
            data['tesa_tags'] = data['target_emotion_stance'].fillna("").apply(compact_tesa)
            data['input'] = data['tesa_tags'] + " [SEP] " + data['text'].fillna("")
        else:
            # Text only
            data['input'] = data['text'].fillna('')

        # --- Token lengths for the whole dataset (no truncation) ---
        # tokenizer.encode adds the special tokens, so the count is directly
        # comparable with max_len.
        data['token_len'] = data['input'].apply(
            lambda x: len(tokenizer.encode(x, truncation=False))
        )

        print("Example input text:")
        display(data.input.iloc[0])

        print("\nToken length stats:")
        print(data['token_len'].describe())

        # How many rows exceed max_len?
        too_long = data['token_len'] > max_len
        trunc_count = too_long.sum()
        total = len(data)
        print(f"\nRows exceeding max_len={max_len}: {trunc_count} / {total} "
              f"({trunc_count / total:.2%})")

        # Optional: quick sanity check of a few long examples
        if trunc_count > 0:
            print("\nExample rows that will be truncated:")
            display(
                data.loc[too_long, ['token_len', 'input']]
                    .head(3)
            )



Processing database: /content/drive/MyDrive/PhD/Study2_review/FibVid_EY_KC/Production/data/fibvid_data.db

----- Mode: text_emo (max_len=500) -----

Raw data preview for /content/drive/MyDrive/PhD/Study2_review/FibVid_EY_KC/Production/data/fibvid (text_emo):


Unnamed: 0,SPLIT,tweet_id,is_root,root_node,label,parent_id,text,target_emotion_stance,create_date,root_created,time_elapsed,0m,1m,15m,20m,60m,90m,24h,48h,gt_48h
0,train,1223618900044152834,1,1223618900044152834,1,1223618900044152834,"First Sen. Warren, & now Speaker Pelosi questi...",neutral emotion & AGAINST stance towards Warre...,2020-02-01T14:47:34.000Z,2020-02-01T14:47:34.000Z,0.0,1,1,1,1,1,1,1,1,1
1,train,1223789646989144065,1,1223789646989144065,1,1223789646989144065,Statement on the final installment of the CNN/...,neutral emotion & NEUTRAL stance towards CNN;n...,2020-02-02T02:06:03.000Z,2020-02-02T02:06:03.000Z,0.0,1,1,1,1,1,1,1,1,1
2,train,1223789928192057346,0,1223789646989144065,1,1223789646989144065,@NateSilver538,neutral emotion & FAVOR stance towards @NateSi...,2020-02-02T02:07:10.000Z,2020-02-02T02:06:03.000Z,1.116667,0,0,1,1,1,1,1,1,1
3,train,1223789991798636544,0,1223789646989144065,1,1223789646989144065,that respondents name was Shmoe Shmiden,neutral emotion & NEUTRAL stance towards Shmoe...,2020-02-02T02:07:25.000Z,2020-02-02T02:06:03.000Z,1.366667,0,0,1,1,1,1,1,1,1
4,train,1223790057133346816,0,1223789646989144065,1,1223789646989144065,Read: Bernie polled too highly and they looked...,neutral emotion & AGAINST stance towards Bernie,2020-02-02T02:07:41.000Z,2020-02-02T02:06:03.000Z,1.633333,0,0,1,1,1,1,1,1,1


Example raw text:


'First Sen. Warren, & now Speaker Pelosi question the legitimacy of Chief Justice Roberts & the Judicial Branch. This is all Democrats have. Their case fell apart so they will attack the legitimacy & credibility of everything in sight: the 2020 election, the Judiciary, & more.'

Token indices sequence length is longer than the specified maximum sequence length for this model (636 > 512). Running this sequence through the model will result in indexing errors


Example input text:


'[Emotion:neutral Stance:AGAINST Target:Warren] [Emotion:neutral Stance:AGAINST Target:Pelosi] [Emotion:neutral Stance:NEUTRAL Target:the_Judicial_Branch] [Emotion:neutral Stance:NEUTRAL Target:the_Judiciary_amp] [SEP] First Sen. Warren, & now Speaker Pelosi question the legitimacy of Chief Justice Roberts & the Judicial Branch. This is all Democrats have. Their case fell apart so they will attack the legitimacy & credibility of everything in sight: the 2020 election, the Judiciary, & more.'


Token length stats:
count    83910.000000
mean        71.806364
std         45.697805
min         19.000000
25%         40.000000
50%         61.000000
75%         91.000000
max        883.000000
Name: token_len, dtype: float64

Rows exceeding max_len=500: 74 / 83910 (0.09%)

Example rows that will be truncated:


Unnamed: 0,token_len,input
1050,636,[Emotion:sadness Stance:NEUTRAL Target:actionp...
1606,584,[Emotion:neutral Stance:NEUTRAL Target:Again_a...
3932,587,[Emotion:neutral Stance:NEUTRAL Target:Robfrom...



----- Mode: text_only (max_len=500) -----

Raw data preview for /content/drive/MyDrive/PhD/Study2_review/FibVid_EY_KC/Production/data/fibvid (text_only):


Unnamed: 0,SPLIT,tweet_id,is_root,root_node,label,parent_id,text,target_emotion_stance,create_date,root_created,time_elapsed,0m,1m,15m,20m,60m,90m,24h,48h,gt_48h
0,train,1223618900044152834,1,1223618900044152834,1,1223618900044152834,"First Sen. Warren, & now Speaker Pelosi questi...",neutral emotion & AGAINST stance towards Warre...,2020-02-01T14:47:34.000Z,2020-02-01T14:47:34.000Z,0.0,1,1,1,1,1,1,1,1,1
1,train,1223789646989144065,1,1223789646989144065,1,1223789646989144065,Statement on the final installment of the CNN/...,neutral emotion & NEUTRAL stance towards CNN;n...,2020-02-02T02:06:03.000Z,2020-02-02T02:06:03.000Z,0.0,1,1,1,1,1,1,1,1,1
2,train,1223789928192057346,0,1223789646989144065,1,1223789646989144065,@NateSilver538,neutral emotion & FAVOR stance towards @NateSi...,2020-02-02T02:07:10.000Z,2020-02-02T02:06:03.000Z,1.116667,0,0,1,1,1,1,1,1,1
3,train,1223789991798636544,0,1223789646989144065,1,1223789646989144065,that respondents name was Shmoe Shmiden,neutral emotion & NEUTRAL stance towards Shmoe...,2020-02-02T02:07:25.000Z,2020-02-02T02:06:03.000Z,1.366667,0,0,1,1,1,1,1,1,1
4,train,1223790057133346816,0,1223789646989144065,1,1223789646989144065,Read: Bernie polled too highly and they looked...,neutral emotion & AGAINST stance towards Bernie,2020-02-02T02:07:41.000Z,2020-02-02T02:06:03.000Z,1.633333,0,0,1,1,1,1,1,1,1


Example raw text:


'First Sen. Warren, & now Speaker Pelosi question the legitimacy of Chief Justice Roberts & the Judicial Branch. This is all Democrats have. Their case fell apart so they will attack the legitimacy & credibility of everything in sight: the 2020 election, the Judiciary, & more.'

Example input text:


'First Sen. Warren, & now Speaker Pelosi question the legitimacy of Chief Justice Roberts & the Judicial Branch. This is all Democrats have. Their case fell apart so they will attack the legitimacy & credibility of everything in sight: the 2020 election, the Judiciary, & more.'


Token length stats:
count    83910.000000
mean        38.337445
std         26.787133
min          3.000000
25%         19.000000
50%         33.000000
75%         55.000000
max        400.000000
Name: token_len, dtype: float64

Rows exceeding max_len=500: 0 / 83910 (0.00%)

Processing database: /content/drive/MyDrive/PhD/Study2_review/FibVid_EY_KC/Production/data/ts_data.db

----- Mode: text_emo (max_len=500) -----

Raw data preview for /content/drive/MyDrive/PhD/Study2_review/FibVid_EY_KC/Production/data/ts (text_emo):


Unnamed: 0,SPLIT,tweet_id,is_root,root_node,label,parent_id,text,target_emotion_stance,create_date,root_created,time_elapsed,0m,1m,15m,20m,60m,90m,24h,48h,gt_48h
0,train,46516,0,378,0,378,I might get an interview with Mike Huckabee ab...,neutral emotion & FAVOR stance towards Mike Hu...,2008-05-05 15:20:48,2008-05-05 15:20:48,0.0,0,1,1,1,1,1,1,1,1
1,train,378,1,378,0,378,Says Mike Huckabee appeared in diabetes infome...,neutral emotion & AGAINST stance towards Mike ...,2008-05-05 15:20:48,2008-05-05 15:20:48,0.0,1,1,1,1,1,1,1,1,1
2,train,46541,0,378,0,378,@dporter THANKS! I may get to interview Mike H...,neutral emotion & FAVOR stance towards @dporte...,2008-05-06 12:27:06,2008-05-05 15:20:48,1266.3,0,0,0,0,0,0,1,1,1
3,train,88,1,88,0,88,"""The law says that mental health must be treat...",neutral emotion & NEUTRAL stance towards,2008-10-01 17:11:31,2008-10-01 17:11:31,0.0,1,1,1,1,1,1,1,1,1
4,train,13397,0,88,0,88,pages upon pages of provision to require insur...,neutral emotion & NEUTRAL stance towards,2008-10-01 17:11:31,2008-10-01 17:11:31,0.0,0,1,1,1,1,1,1,1,1


Example raw text:


'I might get an interview with Mike Huckabee about his weight loss/diabetes issue on Weds morning. Stay tuned!'

Example input text:


'[Emotion:neutral Stance:FAVOR Target:Mike_Huckabee] [Emotion:neutral Stance:NEUTRAL Target:Weds] [SEP] I might get an interview with Mike Huckabee about his weight loss/diabetes issue on Weds morning. Stay tuned!'


Token length stats:
count    103672.000000
mean        105.079173
std          52.461774
min          23.000000
25%          72.000000
50%          97.000000
75%         127.000000
max         752.000000
Name: token_len, dtype: float64

Rows exceeding max_len=500: 345 / 103672 (0.33%)

Example rows that will be truncated:


Unnamed: 0,token_len,input
13004,617,[Emotion:neutral Stance:NEUTRAL Target:Boricua...
13425,539,[Emotion:joy Stance:NEUTRAL Target:SurrettLind...
13505,567,[Emotion:neutral Stance:NEUTRAL Target:Ericnas...



----- Mode: text_only (max_len=500) -----

Raw data preview for /content/drive/MyDrive/PhD/Study2_review/FibVid_EY_KC/Production/data/ts (text_only):


Unnamed: 0,SPLIT,tweet_id,is_root,root_node,label,parent_id,text,target_emotion_stance,create_date,root_created,time_elapsed,0m,1m,15m,20m,60m,90m,24h,48h,gt_48h
0,train,46516,0,378,0,378,I might get an interview with Mike Huckabee ab...,neutral emotion & FAVOR stance towards Mike Hu...,2008-05-05 15:20:48,2008-05-05 15:20:48,0.0,0,1,1,1,1,1,1,1,1
1,train,378,1,378,0,378,Says Mike Huckabee appeared in diabetes infome...,neutral emotion & AGAINST stance towards Mike ...,2008-05-05 15:20:48,2008-05-05 15:20:48,0.0,1,1,1,1,1,1,1,1,1
2,train,46541,0,378,0,378,@dporter THANKS! I may get to interview Mike H...,neutral emotion & FAVOR stance towards @dporte...,2008-05-06 12:27:06,2008-05-05 15:20:48,1266.3,0,0,0,0,0,0,1,1,1
3,train,88,1,88,0,88,"""The law says that mental health must be treat...",neutral emotion & NEUTRAL stance towards,2008-10-01 17:11:31,2008-10-01 17:11:31,0.0,1,1,1,1,1,1,1,1,1
4,train,13397,0,88,0,88,pages upon pages of provision to require insur...,neutral emotion & NEUTRAL stance towards,2008-10-01 17:11:31,2008-10-01 17:11:31,0.0,0,1,1,1,1,1,1,1,1


Example raw text:


'I might get an interview with Mike Huckabee about his weight loss/diabetes issue on Weds morning. Stay tuned!'

Example input text:


'I might get an interview with Mike Huckabee about his weight loss/diabetes issue on Weds morning. Stay tuned!'


Token length stats:
count    103672.000000
mean         58.406011
std          30.227253
min           6.000000
25%          39.000000
50%          58.000000
75%          72.000000
max         405.000000
Name: token_len, dtype: float64

Rows exceeding max_len=500: 0 / 103672 (0.00%)


In [None]:
# Check whether max_len=500 tokens is enough (i.e. how many inputs would be truncated)
import sqlite3
import pandas as pd
from transformers import AutoTokenizer


# NOTE(review): this cell repeats the token-length check from the preceding
# cell; one of the two can be removed.
# =============================================================================
# Step 4: Set Up Dataset Paths and Configurations
# =============================================================================
base_path = '/content/drive/MyDrive/PhD/Study2_review/FibVid_EY_KC/Production/data'

# Maps each dataset path prefix to {input_mode: max_len}. The SQLite file for a
# dataset is '<prefix>_data.db'.
datasets = {
    f'{base_path}/fibvid': {'text_emo': 500, 'text_only': 500},
    f'{base_path}/ts':     {'text_emo': 500, 'text_only': 500},
}

# =============================================================================
# Step 5 (dry run): Token-Length Check Before Generating BERT Embeddings
# =============================================================================
# For every dataset and input mode, build the exact text that would be fed to
# BERT, tokenize it WITHOUT truncation and report how many rows exceed max_len.
# Nothing is written to disk here.
for dataset_path, mode_config in datasets.items():
    db_path = f'{dataset_path}_data.db'
    print(f"\n==============================")
    print(f"Processing database: {db_path}")

    # Read the table once per dataset and always release the connection,
    # even if the query fails.
    con = sqlite3.connect(db_path)
    try:
        raw_data = pd.read_sql_query('SELECT * FROM data', con)
    finally:
        con.close()

    if raw_data.empty:
        # Nothing to measure; also avoids .iloc[0] / division-by-zero below.
        print("Table 'data' is empty - skipping.")
        continue

    for input_mode, max_len in mode_config.items():
        print(f"\n----- Mode: {input_mode} (max_len={max_len}) -----")

        # Work on a fresh copy so columns added for one mode do not leak
        # into the preview of the next mode.
        data = raw_data.copy()

        print(f"\nRaw data preview for {dataset_path} ({input_mode}):")
        display(data.head())
        print("Example raw text:")
        display(data.text.iloc[0])

        # --- Build the input text used for BERT ---
        if input_mode == 'text_emo':
            # Compact emotion/stance/target tags + [SEP] + original text
            data['tesa_tags'] = data['target_emotion_stance'].fillna("").apply(compact_tesa)
            data['input'] = data['tesa_tags'] + " [SEP] " + data['text'].fillna("")
        else:
            # Text only
            data['input'] = data['text'].fillna('')

        # --- Token lengths for the whole dataset (no truncation) ---
        # tokenizer.encode adds the special tokens, so the count is directly
        # comparable with max_len.
        data['token_len'] = data['input'].apply(
            lambda x: len(tokenizer.encode(x, truncation=False))
        )

        print("Example input text:")
        display(data.input.iloc[0])

        print("\nToken length stats:")
        print(data['token_len'].describe())

        # How many rows exceed max_len?
        too_long = data['token_len'] > max_len
        trunc_count = too_long.sum()
        total = len(data)
        print(f"\nRows exceeding max_len={max_len}: {trunc_count} / {total} "
              f"({trunc_count / total:.2%})")

        # Optional: quick sanity check of a few long examples
        if trunc_count > 0:
            print("\nExample rows that will be truncated:")
            display(
                data.loc[too_long, ['token_len', 'input']]
                    .head(3)
            )



Processing database: /content/drive/MyDrive/PhD/Study2_review/FibVid_EY_KC/Production/data/fibvid_data.db

----- Mode: text_emo (max_len=500) -----

Raw data preview for /content/drive/MyDrive/PhD/Study2_review/FibVid_EY_KC/Production/data/fibvid (text_emo):


Unnamed: 0,SPLIT,tweet_id,is_root,root_node,label,parent_id,text,target_emotion_stance,create_date,root_created,time_elapsed,0m,1m,15m,20m,60m,90m,24h,48h,gt_48h
0,train,1223618900044152834,1,1223618900044152834,1,1223618900044152834,"First Sen. Warren, & now Speaker Pelosi questi...",neutral emotion & AGAINST stance towards Warre...,2020-02-01T14:47:34.000Z,2020-02-01T14:47:34.000Z,0.0,1,1,1,1,1,1,1,1,1
1,train,1223789646989144065,1,1223789646989144065,1,1223789646989144065,Statement on the final installment of the CNN/...,neutral emotion & NEUTRAL stance towards CNN;n...,2020-02-02T02:06:03.000Z,2020-02-02T02:06:03.000Z,0.0,1,1,1,1,1,1,1,1,1
2,train,1223789928192057346,0,1223789646989144065,1,1223789646989144065,@NateSilver538,neutral emotion & FAVOR stance towards @NateSi...,2020-02-02T02:07:10.000Z,2020-02-02T02:06:03.000Z,1.116667,0,0,1,1,1,1,1,1,1
3,train,1223789991798636544,0,1223789646989144065,1,1223789646989144065,that respondents name was Shmoe Shmiden,neutral emotion & NEUTRAL stance towards Shmoe...,2020-02-02T02:07:25.000Z,2020-02-02T02:06:03.000Z,1.366667,0,0,1,1,1,1,1,1,1
4,train,1223790057133346816,0,1223789646989144065,1,1223789646989144065,Read: Bernie polled too highly and they looked...,neutral emotion & AGAINST stance towards Bernie,2020-02-02T02:07:41.000Z,2020-02-02T02:06:03.000Z,1.633333,0,0,1,1,1,1,1,1,1


Example raw text:


'First Sen. Warren, & now Speaker Pelosi question the legitimacy of Chief Justice Roberts & the Judicial Branch. This is all Democrats have. Their case fell apart so they will attack the legitimacy & credibility of everything in sight: the 2020 election, the Judiciary, & more.'

Example input text:


'[Emotion:neutral Stance:AGAINST Target:Warren] [Emotion:neutral Stance:AGAINST Target:Pelosi] [Emotion:neutral Stance:NEUTRAL Target:the_Judicial_Branch] [Emotion:neutral Stance:NEUTRAL Target:the_Judiciary_amp] [SEP] First Sen. Warren, & now Speaker Pelosi question the legitimacy of Chief Justice Roberts & the Judicial Branch. This is all Democrats have. Their case fell apart so they will attack the legitimacy & credibility of everything in sight: the 2020 election, the Judiciary, & more.'


Token length stats:
count    83910.000000
mean        71.806364
std         45.697805
min         19.000000
25%         40.000000
50%         61.000000
75%         91.000000
max        883.000000
Name: token_len, dtype: float64

Rows exceeding max_len=500: 74 / 83910 (0.09%)

Example rows that will be truncated:


Unnamed: 0,token_len,input
1050,636,[Emotion:sadness Stance:NEUTRAL Target:actionp...
1606,584,[Emotion:neutral Stance:NEUTRAL Target:Again_a...
3932,587,[Emotion:neutral Stance:NEUTRAL Target:Robfrom...



----- Mode: text_only (max_len=500) -----

Raw data preview for /content/drive/MyDrive/PhD/Study2_review/FibVid_EY_KC/Production/data/fibvid (text_only):


Unnamed: 0,SPLIT,tweet_id,is_root,root_node,label,parent_id,text,target_emotion_stance,create_date,root_created,time_elapsed,0m,1m,15m,20m,60m,90m,24h,48h,gt_48h
0,train,1223618900044152834,1,1223618900044152834,1,1223618900044152834,"First Sen. Warren, & now Speaker Pelosi questi...",neutral emotion & AGAINST stance towards Warre...,2020-02-01T14:47:34.000Z,2020-02-01T14:47:34.000Z,0.0,1,1,1,1,1,1,1,1,1
1,train,1223789646989144065,1,1223789646989144065,1,1223789646989144065,Statement on the final installment of the CNN/...,neutral emotion & NEUTRAL stance towards CNN;n...,2020-02-02T02:06:03.000Z,2020-02-02T02:06:03.000Z,0.0,1,1,1,1,1,1,1,1,1
2,train,1223789928192057346,0,1223789646989144065,1,1223789646989144065,@NateSilver538,neutral emotion & FAVOR stance towards @NateSi...,2020-02-02T02:07:10.000Z,2020-02-02T02:06:03.000Z,1.116667,0,0,1,1,1,1,1,1,1
3,train,1223789991798636544,0,1223789646989144065,1,1223789646989144065,that respondents name was Shmoe Shmiden,neutral emotion & NEUTRAL stance towards Shmoe...,2020-02-02T02:07:25.000Z,2020-02-02T02:06:03.000Z,1.366667,0,0,1,1,1,1,1,1,1
4,train,1223790057133346816,0,1223789646989144065,1,1223789646989144065,Read: Bernie polled too highly and they looked...,neutral emotion & AGAINST stance towards Bernie,2020-02-02T02:07:41.000Z,2020-02-02T02:06:03.000Z,1.633333,0,0,1,1,1,1,1,1,1


Example raw text:


'First Sen. Warren, & now Speaker Pelosi question the legitimacy of Chief Justice Roberts & the Judicial Branch. This is all Democrats have. Their case fell apart so they will attack the legitimacy & credibility of everything in sight: the 2020 election, the Judiciary, & more.'

Example input text:


'First Sen. Warren, & now Speaker Pelosi question the legitimacy of Chief Justice Roberts & the Judicial Branch. This is all Democrats have. Their case fell apart so they will attack the legitimacy & credibility of everything in sight: the 2020 election, the Judiciary, & more.'


Token length stats:
count    83910.000000
mean        38.337445
std         26.787133
min          3.000000
25%         19.000000
50%         33.000000
75%         55.000000
max        400.000000
Name: token_len, dtype: float64

Rows exceeding max_len=500: 0 / 83910 (0.00%)

Processing database: /content/drive/MyDrive/PhD/Study2_review/FibVid_EY_KC/Production/data/ts_data.db

----- Mode: text_emo (max_len=500) -----

Raw data preview for /content/drive/MyDrive/PhD/Study2_review/FibVid_EY_KC/Production/data/ts (text_emo):


Unnamed: 0,SPLIT,tweet_id,is_root,root_node,label,parent_id,text,target_emotion_stance,create_date,root_created,time_elapsed,0m,1m,15m,20m,60m,90m,24h,48h,gt_48h
0,train,46516,0,378,0,378,I might get an interview with Mike Huckabee ab...,neutral emotion & FAVOR stance towards Mike Hu...,2008-05-05 15:20:48,2008-05-05 15:20:48,0.0,0,1,1,1,1,1,1,1,1
1,train,378,1,378,0,378,Says Mike Huckabee appeared in diabetes infome...,neutral emotion & AGAINST stance towards Mike ...,2008-05-05 15:20:48,2008-05-05 15:20:48,0.0,1,1,1,1,1,1,1,1,1
2,train,46541,0,378,0,378,@dporter THANKS! I may get to interview Mike H...,neutral emotion & FAVOR stance towards @dporte...,2008-05-06 12:27:06,2008-05-05 15:20:48,1266.3,0,0,0,0,0,0,1,1,1
3,train,88,1,88,0,88,"""The law says that mental health must be treat...",neutral emotion & NEUTRAL stance towards,2008-10-01 17:11:31,2008-10-01 17:11:31,0.0,1,1,1,1,1,1,1,1,1
4,train,13397,0,88,0,88,pages upon pages of provision to require insur...,neutral emotion & NEUTRAL stance towards,2008-10-01 17:11:31,2008-10-01 17:11:31,0.0,0,1,1,1,1,1,1,1,1


Example raw text:


'I might get an interview with Mike Huckabee about his weight loss/diabetes issue on Weds morning. Stay tuned!'

Example input text:


'[Emotion:neutral Stance:FAVOR Target:Mike_Huckabee] [Emotion:neutral Stance:NEUTRAL Target:Weds] [SEP] I might get an interview with Mike Huckabee about his weight loss/diabetes issue on Weds morning. Stay tuned!'


Token length stats:
count    103672.000000
mean        105.079173
std          52.461774
min          23.000000
25%          72.000000
50%          97.000000
75%         127.000000
max         752.000000
Name: token_len, dtype: float64

Rows exceeding max_len=500: 345 / 103672 (0.33%)

Example rows that will be truncated:


Unnamed: 0,token_len,input
13004,617,[Emotion:neutral Stance:NEUTRAL Target:Boricua...
13425,539,[Emotion:joy Stance:NEUTRAL Target:SurrettLind...
13505,567,[Emotion:neutral Stance:NEUTRAL Target:Ericnas...



----- Mode: text_only (max_len=500) -----

Raw data preview for /content/drive/MyDrive/PhD/Study2_review/FibVid_EY_KC/Production/data/ts (text_only):


Unnamed: 0,SPLIT,tweet_id,is_root,root_node,label,parent_id,text,target_emotion_stance,create_date,root_created,time_elapsed,0m,1m,15m,20m,60m,90m,24h,48h,gt_48h
0,train,46516,0,378,0,378,I might get an interview with Mike Huckabee ab...,neutral emotion & FAVOR stance towards Mike Hu...,2008-05-05 15:20:48,2008-05-05 15:20:48,0.0,0,1,1,1,1,1,1,1,1
1,train,378,1,378,0,378,Says Mike Huckabee appeared in diabetes infome...,neutral emotion & AGAINST stance towards Mike ...,2008-05-05 15:20:48,2008-05-05 15:20:48,0.0,1,1,1,1,1,1,1,1,1
2,train,46541,0,378,0,378,@dporter THANKS! I may get to interview Mike H...,neutral emotion & FAVOR stance towards @dporte...,2008-05-06 12:27:06,2008-05-05 15:20:48,1266.3,0,0,0,0,0,0,1,1,1
3,train,88,1,88,0,88,"""The law says that mental health must be treat...",neutral emotion & NEUTRAL stance towards,2008-10-01 17:11:31,2008-10-01 17:11:31,0.0,1,1,1,1,1,1,1,1,1
4,train,13397,0,88,0,88,pages upon pages of provision to require insur...,neutral emotion & NEUTRAL stance towards,2008-10-01 17:11:31,2008-10-01 17:11:31,0.0,0,1,1,1,1,1,1,1,1


Example raw text:


'I might get an interview with Mike Huckabee about his weight loss/diabetes issue on Weds morning. Stay tuned!'

Example input text:


'I might get an interview with Mike Huckabee about his weight loss/diabetes issue on Weds morning. Stay tuned!'


Token length stats:
count    103672.000000
mean         58.406011
std          30.227253
min           6.000000
25%          39.000000
50%          58.000000
75%          72.000000
max         405.000000
Name: token_len, dtype: float64

Rows exceeding max_len=500: 0 / 103672 (0.00%)


In [None]:
# =============================================================================
# Step 4: Set Up Dataset Paths and Configurations
# =============================================================================
base_path = '/content/drive/MyDrive/PhD/Study2_review/FibVid_EY_KC/Production/data'

# Maps each dataset path prefix to {input_mode: max_len}. The SQLite file for a
# dataset is '<prefix>_data.db'; embeddings go to a separate file per mode.
datasets = {
    f'{base_path}/fibvid': {'text_emo': 500, 'text_only': 500},
    f'{base_path}/ts':     {'text_emo': 500, 'text_only': 500},
}

# =============================================================================
# Step 5: Process Datasets and Generate BERT Embeddings into Separate Databases
# =============================================================================
for dataset_path, mode_config in datasets.items():
    db_path = f'{dataset_path}_data.db'
    print(f"Processing database: {db_path}")

    # Read the source table once per dataset and always release the
    # connection, even if the query fails.
    con = sqlite3.connect(db_path)
    try:
        raw_data = pd.read_sql_query('SELECT * FROM data', con)
    finally:
        con.close()

    if raw_data.empty:
        # Nothing to encode; also avoids the .iloc[0] previews below failing.
        print("Table 'data' is empty - skipping.")
        continue

    for input_mode, max_len in mode_config.items():
        # Fresh copy per mode so helper columns from one mode do not leak
        # into the next.
        data = raw_data.copy()

        print(f"\nRaw data preview for {dataset_path} ({input_mode}):")
        display(data.head())
        print("Example raw text:")
        display(data.text.iloc[0])

        # --- Build the input text used for BERT ---
        if input_mode == 'text_emo':
            # Compact emotion/stance/target tags + [SEP] + original text
            data['tesa_tags'] = data['target_emotion_stance'].fillna("").apply(compact_tesa)
            data['input'] = data['tesa_tags'] + " [SEP] " + data['text'].fillna("")
        else:
            # Text only. NULL texts become "" (same as the text_emo branch)
            # so the tokenizer never receives a None.
            data['input'] = data['text'].fillna('')

        print("Example constructed input:")
        display(data.input.iloc[0])

        # --- Encode using deterministic BERT encoder ---
        print(f"\nEncoding {len(data)} rows for mode: {input_mode} (max_len={max_len})")
        embeddings = bertEncode(
            data['input'].tolist(),
            max_len=max_len,
            batch_size=32,
        )
        # embeddings is a list of numpy arrays
        data['bert_embs'] = embeddings

        # Keep only tweet_id + embeddings and JSON-encode for SQLite
        processed_data = data[['tweet_id', 'bert_embs']].copy()
        processed_data['bert_embs'] = processed_data['bert_embs'].progress_apply(
            lambda x: json.dumps(x.tolist() if isinstance(x, np.ndarray) else x)
        )

        print("\nProcessed data sample:")
        display(processed_data.head())

        embeddings_db_path = f'{dataset_path}_bertlarge_embeddings_{input_mode}_timelock{SEED}.db'
        print(f"\nSaving embeddings to: {embeddings_db_path} (table: {input_mode})")

        # Close the output database even when the write fails, so a partially
        # written file is not left locked.
        out_con = sqlite3.connect(embeddings_db_path)
        try:
            processed_data.to_sql(input_mode, out_con, index=False, if_exists='replace')
        finally:
            out_con.close()


Processing database: /content/drive/MyDrive/PhD/Study2_review/FibVid_EY_KC/Production/data/fibvid_data.db

Raw data preview for /content/drive/MyDrive/PhD/Study2_review/FibVid_EY_KC/Production/data/fibvid (text_emo):


Unnamed: 0,SPLIT,tweet_id,is_root,root_node,label,parent_id,text,target_emotion_stance,create_date,root_created,time_elapsed,0m,1m,15m,20m,60m,90m,24h,48h,gt_48h
0,train,1223618900044152834,1,1223618900044152834,1,1223618900044152834,"First Sen. Warren, & now Speaker Pelosi questi...",neutral emotion & AGAINST stance towards Warre...,2020-02-01T14:47:34.000Z,2020-02-01T14:47:34.000Z,0.0,1,1,1,1,1,1,1,1,1
1,train,1223789646989144065,1,1223789646989144065,1,1223789646989144065,Statement on the final installment of the CNN/...,neutral emotion & NEUTRAL stance towards CNN;n...,2020-02-02T02:06:03.000Z,2020-02-02T02:06:03.000Z,0.0,1,1,1,1,1,1,1,1,1
2,train,1223789928192057346,0,1223789646989144065,1,1223789646989144065,@NateSilver538,neutral emotion & FAVOR stance towards @NateSi...,2020-02-02T02:07:10.000Z,2020-02-02T02:06:03.000Z,1.116667,0,0,1,1,1,1,1,1,1
3,train,1223789991798636544,0,1223789646989144065,1,1223789646989144065,that respondents name was Shmoe Shmiden,neutral emotion & NEUTRAL stance towards Shmoe...,2020-02-02T02:07:25.000Z,2020-02-02T02:06:03.000Z,1.366667,0,0,1,1,1,1,1,1,1
4,train,1223790057133346816,0,1223789646989144065,1,1223789646989144065,Read: Bernie polled too highly and they looked...,neutral emotion & AGAINST stance towards Bernie,2020-02-02T02:07:41.000Z,2020-02-02T02:06:03.000Z,1.633333,0,0,1,1,1,1,1,1,1


Example raw text:


'First Sen. Warren, & now Speaker Pelosi question the legitimacy of Chief Justice Roberts & the Judicial Branch. This is all Democrats have. Their case fell apart so they will attack the legitimacy & credibility of everything in sight: the 2020 election, the Judiciary, & more.'

Example constructed input:


'[Emotion:neutral Stance:AGAINST Target:Warren] [Emotion:neutral Stance:AGAINST Target:Pelosi] [Emotion:neutral Stance:NEUTRAL Target:the_Judicial_Branch] [Emotion:neutral Stance:NEUTRAL Target:the_Judiciary_amp] [SEP] First Sen. Warren, & now Speaker Pelosi question the legitimacy of Chief Justice Roberts & the Judicial Branch. This is all Democrats have. Their case fell apart so they will attack the legitimacy & credibility of everything in sight: the 2020 election, the Judiciary, & more.'


Encoding 83910 rows for mode: text_emo (max_len=500)


100%|██████████| 83910/83910 [01:02<00:00, 1345.20it/s]


Processed data sample:





Unnamed: 0,tweet_id,bert_embs
0,1223618900044152834,"[-0.4373137950897217, -0.5135035514831543, 0.9..."
1,1223789646989144065,"[-0.11578717827796936, -0.48306822776794434, 0..."
2,1223789928192057346,"[-0.4954860508441925, -0.25309616327285767, 0...."
3,1223789991798636544,"[-0.21727216243743896, -0.47908443212509155, 0..."
4,1223790057133346816,"[-0.2077283412218094, -0.4407166540622711, 0.7..."



Saving embeddings to: /content/drive/MyDrive/PhD/Study2_review/FibVid_EY_KC/Production/data/fibvid_bertlarge_embeddings_text_emo_timelock9898.db (table: text_emo)

Raw data preview for /content/drive/MyDrive/PhD/Study2_review/FibVid_EY_KC/Production/data/fibvid (text_only):


Unnamed: 0,SPLIT,tweet_id,is_root,root_node,label,parent_id,text,target_emotion_stance,create_date,root_created,time_elapsed,0m,1m,15m,20m,60m,90m,24h,48h,gt_48h
0,train,1223618900044152834,1,1223618900044152834,1,1223618900044152834,"First Sen. Warren, & now Speaker Pelosi questi...",neutral emotion & AGAINST stance towards Warre...,2020-02-01T14:47:34.000Z,2020-02-01T14:47:34.000Z,0.0,1,1,1,1,1,1,1,1,1
1,train,1223789646989144065,1,1223789646989144065,1,1223789646989144065,Statement on the final installment of the CNN/...,neutral emotion & NEUTRAL stance towards CNN;n...,2020-02-02T02:06:03.000Z,2020-02-02T02:06:03.000Z,0.0,1,1,1,1,1,1,1,1,1
2,train,1223789928192057346,0,1223789646989144065,1,1223789646989144065,@NateSilver538,neutral emotion & FAVOR stance towards @NateSi...,2020-02-02T02:07:10.000Z,2020-02-02T02:06:03.000Z,1.116667,0,0,1,1,1,1,1,1,1
3,train,1223789991798636544,0,1223789646989144065,1,1223789646989144065,that respondents name was Shmoe Shmiden,neutral emotion & NEUTRAL stance towards Shmoe...,2020-02-02T02:07:25.000Z,2020-02-02T02:06:03.000Z,1.366667,0,0,1,1,1,1,1,1,1
4,train,1223790057133346816,0,1223789646989144065,1,1223789646989144065,Read: Bernie polled too highly and they looked...,neutral emotion & AGAINST stance towards Bernie,2020-02-02T02:07:41.000Z,2020-02-02T02:06:03.000Z,1.633333,0,0,1,1,1,1,1,1,1


Example raw text:


'First Sen. Warren, & now Speaker Pelosi question the legitimacy of Chief Justice Roberts & the Judicial Branch. This is all Democrats have. Their case fell apart so they will attack the legitimacy & credibility of everything in sight: the 2020 election, the Judiciary, & more.'

Example constructed input:


'First Sen. Warren, & now Speaker Pelosi question the legitimacy of Chief Justice Roberts & the Judicial Branch. This is all Democrats have. Their case fell apart so they will attack the legitimacy & credibility of everything in sight: the 2020 election, the Judiciary, & more.'


Encoding 83910 rows for mode: text_only (max_len=500)


100%|██████████| 83910/83910 [01:02<00:00, 1336.06it/s]


Processed data sample:





Unnamed: 0,tweet_id,bert_embs
0,1223618900044152834,"[-0.404148668050766, -0.5592175126075745, 0.79..."
1,1223789646989144065,"[-0.029826264828443527, -0.49982330203056335, ..."
2,1223789928192057346,"[-0.6144954562187195, -0.3631715178489685, 0.8..."
3,1223789991798636544,"[-0.19181376695632935, -0.6705280542373657, 0...."
4,1223790057133346816,"[-0.2749437093734741, -0.6386899948120117, 0.2..."



Saving embeddings to: /content/drive/MyDrive/PhD/Study2_review/FibVid_EY_KC/Production/data/fibvid_bertlarge_embeddings_text_only_timelock9898.db (table: text_only)
Processing database: /content/drive/MyDrive/PhD/Study2_review/FibVid_EY_KC/Production/data/ts_data.db

Raw data preview for /content/drive/MyDrive/PhD/Study2_review/FibVid_EY_KC/Production/data/ts (text_emo):


Unnamed: 0,SPLIT,tweet_id,is_root,root_node,label,parent_id,text,target_emotion_stance,create_date,root_created,time_elapsed,0m,1m,15m,20m,60m,90m,24h,48h,gt_48h
0,train,46516,0,378,0,378,I might get an interview with Mike Huckabee ab...,neutral emotion & FAVOR stance towards Mike Hu...,2008-05-05 15:20:48,2008-05-05 15:20:48,0.0,0,1,1,1,1,1,1,1,1
1,train,378,1,378,0,378,Says Mike Huckabee appeared in diabetes infome...,neutral emotion & AGAINST stance towards Mike ...,2008-05-05 15:20:48,2008-05-05 15:20:48,0.0,1,1,1,1,1,1,1,1,1
2,train,46541,0,378,0,378,@dporter THANKS! I may get to interview Mike H...,neutral emotion & FAVOR stance towards @dporte...,2008-05-06 12:27:06,2008-05-05 15:20:48,1266.3,0,0,0,0,0,0,1,1,1
3,train,88,1,88,0,88,"""The law says that mental health must be treat...",neutral emotion & NEUTRAL stance towards,2008-10-01 17:11:31,2008-10-01 17:11:31,0.0,1,1,1,1,1,1,1,1,1
4,train,13397,0,88,0,88,pages upon pages of provision to require insur...,neutral emotion & NEUTRAL stance towards,2008-10-01 17:11:31,2008-10-01 17:11:31,0.0,0,1,1,1,1,1,1,1,1


Example raw text:


'I might get an interview with Mike Huckabee about his weight loss/diabetes issue on Weds morning. Stay tuned!'

Example constructed input:


'[Emotion:neutral Stance:FAVOR Target:Mike_Huckabee] [Emotion:neutral Stance:NEUTRAL Target:Weds] [SEP] I might get an interview with Mike Huckabee about his weight loss/diabetes issue on Weds morning. Stay tuned!'


Encoding 103672 rows for mode: text_emo (max_len=500)


100%|██████████| 103672/103672 [01:16<00:00, 1351.04it/s]



Processed data sample:


Unnamed: 0,tweet_id,bert_embs
0,46516,"[-0.2892484664916992, -0.6020144820213318, 0.7..."
1,378,"[-0.2403288632631302, -0.38780397176742554, 0...."
2,46541,"[-0.36535292863845825, -0.6550670862197876, 0...."
3,88,"[-0.06262066215276718, -0.5027342438697815, 0...."
4,13397,"[-0.23094820976257324, -0.6095700263977051, 0...."



Saving embeddings to: /content/drive/MyDrive/PhD/Study2_review/FibVid_EY_KC/Production/data/ts_bertlarge_embeddings_text_emo_timelock9898.db (table: text_emo)

Raw data preview for /content/drive/MyDrive/PhD/Study2_review/FibVid_EY_KC/Production/data/ts (text_only):


Unnamed: 0,SPLIT,tweet_id,is_root,root_node,label,parent_id,text,target_emotion_stance,create_date,root_created,time_elapsed,0m,1m,15m,20m,60m,90m,24h,48h,gt_48h
0,train,46516,0,378,0,378,I might get an interview with Mike Huckabee ab...,neutral emotion & FAVOR stance towards Mike Hu...,2008-05-05 15:20:48,2008-05-05 15:20:48,0.0,0,1,1,1,1,1,1,1,1
1,train,378,1,378,0,378,Says Mike Huckabee appeared in diabetes infome...,neutral emotion & AGAINST stance towards Mike ...,2008-05-05 15:20:48,2008-05-05 15:20:48,0.0,1,1,1,1,1,1,1,1,1
2,train,46541,0,378,0,378,@dporter THANKS! I may get to interview Mike H...,neutral emotion & FAVOR stance towards @dporte...,2008-05-06 12:27:06,2008-05-05 15:20:48,1266.3,0,0,0,0,0,0,1,1,1
3,train,88,1,88,0,88,"""The law says that mental health must be treat...",neutral emotion & NEUTRAL stance towards,2008-10-01 17:11:31,2008-10-01 17:11:31,0.0,1,1,1,1,1,1,1,1,1
4,train,13397,0,88,0,88,pages upon pages of provision to require insur...,neutral emotion & NEUTRAL stance towards,2008-10-01 17:11:31,2008-10-01 17:11:31,0.0,0,1,1,1,1,1,1,1,1


Example raw text:


'I might get an interview with Mike Huckabee about his weight loss/diabetes issue on Weds morning. Stay tuned!'

Example constructed input:


'I might get an interview with Mike Huckabee about his weight loss/diabetes issue on Weds morning. Stay tuned!'


Encoding 103672 rows for mode: text_only (max_len=500)


100%|██████████| 103672/103672 [01:18<00:00, 1328.43it/s]



Processed data sample:


Unnamed: 0,tweet_id,bert_embs
0,46516,"[-0.2708462178707123, -0.6541179418563843, 0.7..."
1,378,"[-0.08169658482074738, -0.4504814147949219, 0...."
2,46541,"[-0.2693890333175659, -0.6346859931945801, 0.7..."
3,88,"[-0.037797488272190094, -0.27148136496543884, ..."
4,13397,"[-0.24520893394947052, -0.728566586971283, 0.7..."



Saving embeddings to: /content/drive/MyDrive/PhD/Study2_review/FibVid_EY_KC/Production/data/ts_bertlarge_embeddings_text_only_timelock9898.db (table: text_only)


In [None]:
#takes 4hrs + to process