# Data Preprocessing

This pipeline transforms raw text data into a token format for Transformer-based encoder models (BERT/RoBERTa) and decoder model for detecting sentiment/sarcasm texts.

## Import Libraries

In [None]:
import re
import os
import pandas as pd
import numpy as np
from datasets import load_dataset
import torch
from transformers import AutoTokenizer
from torch.utils.data import Dataset
import random

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight


from transformers import AutoTokenizer

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


##Data Loading

In [None]:
dataset = load_dataset("unswnlporg/BESSTIE")
train = dataset["train"].to_pandas()
val = dataset["validation"].to_pandas()

train.head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.csv: 0.00B [00:00, ?B/s]

valid.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/17760 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2428 [00:00<?, ? examples/s]

Unnamed: 0,text,label,variety,source,task
0,This was one of the best dishes I've EVER had!...,1,en-AU,Google,Sentiment
1,This Mexican restaurant in Penrith is a great ...,1,en-AU,Google,Sentiment
2,"This was not to bad, I ordered the big pork ri...",1,en-AU,Google,Sentiment
3,Clean cool and a nice smaller casino to check ...,1,en-AU,Google,Sentiment
4,Well set out. Great areas to enjoy. Good food ...,1,en-AU,Google,Sentiment


```
Dataset shapes (train, validation):
(17760, 6) (2428, 5)
```

Columns in dataset:
Index(['text', 'label', 'variety', 'source', 'task', 'text_len'], dtype='object')

# Sentiment vs Sarcasm Task splitting

In [None]:
print("\nLabel distribution by tasks:")
display(pd.crosstab(train["label"], train["task"]))


Label distribution by tasks:


task,Sarcasm,Sentiment
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,7619,4473
1,1275,4393


# Variety

In [None]:

all_varieties = sorted(train['variety'].unique())

# {'en-AU': 0, 'en-GB': 1, 'en-US': 2, ...}
variety_map = {v: i for i, v in enumerate(all_varieties)}

print("Variety Mapping (Save):")
print(variety_map)


Variety Mapping (Save):
{'en-AU': 0, 'en-IN': 1, 'en-UK': 2}


In [None]:
# For Sentiment
sent_train = train[train['task'] == 'Sentiment'].copy()
sent_val = val[val['task'] == 'Sentiment'].copy()

# For Sarcasm
sarc_train = train[train['task'] == 'Sarcasm'].copy()
sarc_val = val[val['task'] == 'Sarcasm'].copy()

# Cleaning

* HTML & URL Removal: Web-scraped data (Google/Reddit) often contains artifacts

*   Specific Reddit user mentions removal

*  Hashtag Handling: Removed the # symbol but kept the word ( #politics $\to$ politics). The content is valuable, but the symbol is noise

*   Skipped the standard text.lower() step. In Sarcasm and Sentiment, capitalization is a massive feature. Bert model can read uppercases.












In [None]:
def clean_text(text):
    if pd.isna(text) or text == "":
        return ""

    text = str(text)

    # A. Remove User Mentions (Specific to Reddit)
    text = re.sub(r"(u/|@)\w+", "[USER]", text)

    # B. Remove HTML entities (Corrected regex)
    text = re.sub(r"&\w+;", " ", text)

    # C. Remove URLs
    text = re.sub(r"https?://\S+|www\.\S+", "[URL]", text)

    # D. Handle Hashtags (Remove #, keep word)
    text = re.sub(r"#(\w+)", r"\1", text)

    # E. Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()

    return text


In [None]:
def apply_cleaning(df, name="Dataset"):

    df = df.copy()

    before = len(df)
    df['text_clean'] = df['text'].apply(clean_text)
    df_clean = df[df['text_clean'].notna()]
    after = len(df_clean)
    removed = before - after

    print(f"[{name}]")
    print(f"  Before: {before:,} rows")
    print(f"  After:  {after:,} rows")
    print(f"  Removed: {removed} empty rows")

    return df_clean


print("\n--- SENTIMENT ---")
sent_train = apply_cleaning(sent_train, "Sentiment Train")
sent_val = apply_cleaning(sent_val, "Sentiment Val")

print("\n--- SARCASM ---")
sarc_train = apply_cleaning(sarc_train, "Sarcasm Train")
sarc_val = apply_cleaning(sarc_val, "Sarcasm Val")



--- SENTIMENT ---
[Sentiment Train]
  Before: 8,866 rows
  After:  8,866 rows
  Removed: 0 empty rows
[Sentiment Val]
  Before: 1,212 rows
  After:  1,212 rows
  Removed: 0 empty rows

--- SARCASM ---
[Sarcasm Train]
  Before: 8,894 rows
  After:  8,894 rows
  Removed: 0 empty rows
[Sarcasm Val]
  Before: 1,216 rows
  After:  1,216 rows
  Removed: 0 empty rows


## Train / Val / Test Splitting

The paper's test set is ~2,500 rows (Table 5). The Hugging Face validation set might actually be the Test set. We use train to create a new validation set and use the provided validation purely as test to match the paper's volume.

In [None]:
# Train set splits into train/val

def internal_split(df):
    train_split, val_split = train_test_split(
        df,
        test_size=0.10,  # 10% for validation
        random_state=42,
        stratify=df['label']
    )
    return train_split, val_split

# apply
train_sentiment, val_sentiment = internal_split(sent_train)
train_sarcasm, val_sarcasm = internal_split(sarc_train)

test_sentiment = sent_val
test_sarcasm   = sarc_val

In [None]:

print("--- SENTIMENT ---")
print(f"Train: {len(train_sentiment)} ")
print(f"Val:   {len(val_sentiment)}   ")
print(f"Test:  {len(test_sentiment)}  ")

print("\n--- SARCASM ---")
print(f"Train: {len(train_sarcasm)}")
print(f"Val:   {len(val_sarcasm)}")
print(f"Test:  {len(test_sarcasm)}")

--- SENTIMENT ---
Train: 7979 
Val:   887   
Test:  1212  

--- SARCASM ---
Train: 8004
Val:   890
Test:  1216


# Tokenization / Padding

Convert human text into numerical Input IDs.


Padding: Sequences shorter than 512
tokens were padded with [PAD] (zeros) to ensure uniform matrix dimensions for batch processing.




# Bert Encoder

In [None]:
ENCODER_MODEL_NAME = "bert-base-cased"
OUTPUT_DIR = "/content/drive/MyDrive/DNLP/data/processed_data_final"

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# ENCODER (BERT, RoBERTa)

def prepare_encoder_data(train_df, val_df, test_df, model_name,variety_map):
    print(f"Tokenizing for Encoder model: {model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    datasets = {'train': train_df, 'val': val_df, 'test': test_df}
    processed_data = {}

    for split_name, df in datasets.items():
        # Тokenization

        encodings = tokenizer(
            df['text_clean'].tolist(),
            truncation=True,
            padding='max_length',
            max_length=256,
            return_tensors='pt'
        )

        # 2.Variety
        # ['en-US', 'en-AU'] -> [2, 0]

        variety_ids = df['variety'].map(variety_map).fillna(-1).astype(int).values

        processed_data[split_name] = {
            'input_ids': encodings['input_ids'],
            'attention_mask': encodings['attention_mask'],
            'labels': torch.tensor(df['label'].values, dtype=torch.long),
            'variety': torch.tensor(variety_ids, dtype=torch.long)
        }

    return processed_data


# Apply separetly for tasks
print("Processing Encoders")
bert_sentiment_data = prepare_encoder_data(train_sentiment, val_sentiment, test_sentiment, ENCODER_MODEL_NAME,variety_map)

bert_sarcasm_data   = prepare_encoder_data(train_sarcasm, val_sarcasm, test_sarcasm, ENCODER_MODEL_NAME,variety_map)

Processing Encoders
Tokenizing for Encoder model: bert-base-cased...
Tokenizing for Encoder model: bert-base-cased...


In [None]:
def inspect_encoder_data(data_dict, task_name, model_name):

    print(f"ENCODER DATA: {task_name} using {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    for split in ['train', 'val', 'test']:
        split_data = data_dict[split]

        # 1. get tensors
        input_ids = split_data['input_ids']
        attention_mask = split_data['attention_mask']
        labels = split_data['labels']

        print(f"\n Split: {split.upper()} ---")

        # 2. check shapes
        print(f"Input IDs shape:      {input_ids.shape}")
        print(f"Attention Mask shape: {attention_mask.shape}")
        print(f"Labels shape:         {labels.shape}")

        if len(input_ids) != len(labels):
            print("Mismatch between inputs and labels count!")
        else:
            print("Counts match.")

        # 3. Random sample decoder

        idx = random.randint(0, len(input_ids) - 1)

        print(f"\n[Sample Data at index {idx}]")


        # As a result example print first 10 and last 5 ids
        ids = input_ids[idx]
        print(f"Tensor IDs (shortened): {ids[:10].tolist()} ... {ids[-5:].tolist()}")

        # Б) Decode to text
        decoded_text = tokenizer.decode(ids)
        print(f"Decoded Text: \"{decoded_text}\"")

        print(f"Label: {labels[idx].item()}")



# Sentiment (BERT)
inspect_encoder_data(bert_sentiment_data, "SENTIMENT", "bert-base-cased")

# Sarcasm (BERT)
inspect_encoder_data(bert_sarcasm_data, "SARCASM", "bert-base-cased")

ENCODER DATA: SENTIMENT using bert-base-cased

 Split: TRAIN ---
Input IDs shape:      torch.Size([7979, 256])
Attention Mask shape: torch.Size([7979, 256])
Labels shape:         torch.Size([7979])
Counts match.

[Sample Data at index 5769]
Tensor IDs (shortened): [101, 13832, 18734, 1174, 170, 1374, 8898, 1107, 1303, 1114] ... [0, 0, 0, 0, 0]
Decoded Text: "[CLS] Enjoyed a few drinks in here with friends. Quite lively on a Saturday afternoon with sport on the screens creating a vibrant atmosphere. Ales were of a good quality [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

# Variety cheking

In [None]:
#Checking variety

print("\nChecking Variety tensor...")
sample_var_id = bert_sentiment_data['train']['variety'][0].item()

# (reverse lookup)

id2variety = {i: v for v, i in variety_map.items()}
print(f"ID: {sample_var_id} -> Variety: {id2variety[sample_var_id]}")


Checking Variety tensor...
ID: 1 -> Variety: en-IN


In [None]:
import random

def inspect_variety_metadata(data_dict, variety_map, task_name):

    print(f" VARIETY FIELD: {task_name}")

    id2variety = {i: name for name, i in variety_map.items()}

    for split in ['train', 'val', 'test']:
        split_data = data_dict[split]

        # 1. Key exist or not
        if 'variety' not in split_data:
            print(f"ERROR:'variety' key MISSING in {split}!")
            return

        var_tensor = split_data['variety']
        input_tensor = split_data['input_ids']

        print(f"\n--- Split: {split.upper()} ---")

        # 2. Shape cheking

        print(f"Variety Shape: {var_tensor.shape}")

        if len(var_tensor) != len(input_tensor):
            print(f"ERROR: Shape mismatch! Text: {len(input_tensor)}, Variety: {len(var_tensor)}")
        else:
            print(f"Length matches input_ids.")

        # 3. Sample
        idx = random.randint(0, len(var_tensor) - 1)
        val_id = var_tensor[idx].item()

        # If ID = -1, then warning
        if val_id == -1:
             print(f"Found -1 (Unknown variety) at index {idx}")
        else:
            country_name = id2variety.get(val_id, "UNKNOWN")
            print(f"Sample Index {idx}: ID {val_id} -> means '{country_name}'")



inspect_variety_metadata(bert_sentiment_data, variety_map, "SENTIMENT")
inspect_variety_metadata(bert_sarcasm_data, variety_map, "SARCASM")

 VARIETY FIELD: SENTIMENT

--- Split: TRAIN ---
Variety Shape: torch.Size([7979])
Length matches input_ids.
Sample Index 7683: ID 0 -> means 'en-AU'

--- Split: VAL ---
Variety Shape: torch.Size([887])
Length matches input_ids.
Sample Index 309: ID 1 -> means 'en-IN'

--- Split: TEST ---
Variety Shape: torch.Size([1212])
Length matches input_ids.
Sample Index 806: ID 1 -> means 'en-IN'
 VARIETY FIELD: SARCASM

--- Split: TRAIN ---
Variety Shape: torch.Size([8004])
Length matches input_ids.
Sample Index 5748: ID 1 -> means 'en-IN'

--- Split: VAL ---
Variety Shape: torch.Size([890])
Length matches input_ids.
Sample Index 359: ID 2 -> means 'en-UK'

--- Split: TEST ---
Variety Shape: torch.Size([1216])
Length matches input_ids.
Sample Index 1207: ID 2 -> means 'en-UK'


# Robert Encoder

In [None]:
ROBERTA_MODEL = "roberta-base"

print(f"\n Starting processing for {ROBERTA_MODEL}...")

roberta_sentiment_data = prepare_encoder_data(
    train_sentiment, val_sentiment, test_sentiment, ROBERTA_MODEL,variety_map
)

roberta_sarcasm_data = prepare_encoder_data(
    train_sarcasm, val_sarcasm, test_sarcasm, ROBERTA_MODEL,variety_map
)



 Starting processing for roberta-base...
Tokenizing for Encoder model: roberta-base...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Tokenizing for Encoder model: roberta-base...


# Decoder

In [None]:
#Formatting text to Prompts

def format_decoder_prompt(text, task_name):

    if task_name == "Sentiment":
        # Prompt  output "1" or "0"
        return f"Generate the sentiment of the given text. 1 for positive, 0 for negative. Do not give an explanation.Text: {text} Answer:"

    elif task_name == "Sarcasm":
        return f"Predict if the given text is sarcastic. 1 if sarcastic, 0 if not. Do not give an explanation. Text: {text} Answer:"

    return text


def prepare_decoder_data(train_df, val_df, test_df, task_name):
    print(f"Formatting prompts for Decoder (Task: {task_name})...")

    datasets = {'train': train_df, 'val': val_df, 'test': test_df}
    processed_data = {}

    for split_name, df in datasets.items():
        # 1. Create Input (prompt)
        prompts = df['text_clean'].apply(lambda x: format_decoder_prompt(x, task_name)).tolist()

        # 2. Create Output (Target Text)
        # convert numbers 0 and 1 to strings
        targets = df['label'].astype(str).tolist()

        # List of Strings
        processed_data[split_name] = {
            'prompts': prompts,
            'targets': targets,
            'original_labels': df['label'].values
        }

    return processed_data


# apply
print("\n Processing Decoders ")
gen_sentiment_data = prepare_decoder_data(train_sentiment, val_sentiment, test_sentiment, "Sentiment")

gen_sarcasm_data  = prepare_decoder_data(train_sarcasm, val_sarcasm, test_sarcasm, "Sarcasm")


 Processing Decoders 
Formatting prompts for Decoder (Task: Sentiment)...
Formatting prompts for Decoder (Task: Sarcasm)...


In [None]:
#  Print example results

def inspect_decoder_data(data_dict, task_name):
    print(f"\n")
    print(f" Inspecting: {task_name}")

    # Cheking for all sets (train, val, test)
    for split in ['train', 'val', 'test']:
        split_data = data_dict[split]


        n_prompts = len(split_data['prompts'])
        n_targets = len(split_data['targets'])

        print(f"\n Split: {split.upper()} ")
        print(f"Rows count: {n_prompts}")

        # Check length (should be True)
        if n_prompts != n_targets:
            print(f"Mismatch! Prompts: {n_prompts}, Targets: {n_targets}")
        else:
            print(f"Lengths match.")

        # 2. Random example
        if n_prompts > 0:
            idx = random.randint(0, n_prompts - 1)
            print(f"\n[Sample Data at index {idx}]")
            print(f"PROMPT:  {split_data['prompts'][idx]}")
            print(f"TARGET:  '{split_data['targets'][idx]}' (Type: {type(split_data['targets'][idx])})")
            print(f"Raw Label: {split_data['original_labels'][idx]}")


inspect_decoder_data(gen_sentiment_data, "SENTIMENT TASK")
inspect_decoder_data(gen_sarcasm_data, "SARCASM TASK")



 Inspecting: SENTIMENT TASK

 Split: TRAIN 
Rows count: 7979
Lengths match.

[Sample Data at index 5206]
PROMPT:  Generate the sentiment of the given text. 1 for positive, 0 for negative. Do not give an explanation.Text: That's not true though. You have to take a moment to analyse the scenes they have. The lighting and the framing is obviously really good. The real problem is nowadays movies are very repetitive in terms of cinematography and music design. People don't want to experiment with the techniques unlike Hollywood productions - the real reason behind people hating bollywood movies Answer:
TARGET:  '0' (Type: <class 'str'>)
Raw Label: 0

 Split: VAL 
Rows count: 887
Lengths match.

[Sample Data at index 243]
PROMPT:  Generate the sentiment of the given text. 1 for positive, 0 for negative. Do not give an explanation.Text: Best place for fast foods under one roof, " Amazing quality with A1 Taste. " Answer:
TARGET:  '1' (Type: <class 'str'>)
Raw Label: 1

 Split: TEST 
Rows cou

# Save results

In [None]:
print(f"\nSaving separate files to '{OUTPUT_DIR}'...")

# 1. Save Encoder Bert Files (Tensors)
torch.save(bert_sentiment_data, os.path.join(OUTPUT_DIR, 'sentiment_bert_encoder_data.pt'))
torch.save(bert_sarcasm_data,   os.path.join(OUTPUT_DIR, 'sarcasm_bert_encoder_data.pt'))

# Saving Robert
torch.save(roberta_sentiment_data, os.path.join(OUTPUT_DIR, 'sentiment_roberta_encoder.pt'))
torch.save(roberta_sarcasm_data,   os.path.join(OUTPUT_DIR, 'sarcasm_roberta_encoder.pt'))

print("RoBERTa inputs saved.")

# 2. Save Decoder Files (Lists of Strings)
torch.save(gen_sentiment_data, os.path.join(OUTPUT_DIR, 'sentiment_decoder_data.pt'))
torch.save(gen_sarcasm_data,   os.path.join(OUTPUT_DIR, 'sarcasm_decoder_data.pt'))

print("All done, 4 clean files ready.")


Saving separate files to '/content/drive/MyDrive/DNLP/data/processed_data_final'...
RoBERTa inputs saved.
All done, 4 clean files ready.


# Compute weights for Labels

**Crosstab of Label x Task**


`The dataset is imbalanced (~86% Non-Sarcastic vs ~14% Sarcastic). A model could cheat by predicting more "0" every time than "1".`



**The weights are saved to be passed to the Loss Function (CrossEntropyLoss) further.**

In [None]:
from sklearn.utils.class_weight import compute_class_weight


OUTPUT_DIR = "/content/drive/MyDrive/DNLP/data/processed_data_final"
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

def get_and_save_weights(labels_array, task_name):
    print(f"\n--- Calculating Weights for: {task_name} ---")

    # 1. Сalculate weights
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(labels_array),
        y=labels_array
    )

    # 2. Convert to Tensor (float32 for Loss function)
    weights_tensor = torch.tensor(class_weights, dtype=torch.float)

    # 3. Print how many 0 and 1 each task has
    print(f"Counts: {np.bincount(labels_array)}")
    print(f"Weights: Class 0: {weights_tensor[0]:.4f}, Class 1: {weights_tensor[1]:.4f}")

    # 4. Saving
    save_path = os.path.join(OUTPUT_DIR, f'{task_name.lower()}_weights.pt')
    torch.save(weights_tensor, save_path)
    print(f"Saved to: {save_path}")

    return weights_tensor


# Take labels from train_sentiment и train_sarcasm

# 1. Sentiment Weights
# Expected 1.0, since there already 50/50
sent_weights = get_and_save_weights(train_sentiment['label'].values, "Sentiment")

# 2. Sarcasm Weights
# Expected more values for sarcasm: Class - 1

sarc_weights = get_and_save_weights(train_sarcasm['label'].values, "Sarcasm")




--- Calculating Weights for: Sentiment ---
Counts: [4025 3954]
Weights: Class 0: 0.9912, Class 1: 1.0090
Saved to: /content/drive/MyDrive/DNLP/data/processed_data_final/sentiment_weights.pt

--- Calculating Weights for: Sarcasm ---
Counts: [6857 1147]
Weights: Class 0: 0.5836, Class 1: 3.4891
Saved to: /content/drive/MyDrive/DNLP/data/processed_data_final/sarcasm_weights.pt


# Final outcome:

**1. Data Structuring & Splitting**

Separated Tasks: Split the raw dataset into two distinct streams: Sentiment and Sarcasm.

Correct Splits:

Train: ~8,000 rows per task (Derived from 90% of original Train).

Validation: ~900 rows per task (Derived from 10% of original Train).

Test: ~1,200 rows per task (Used the full HuggingFace validation set to match the paper’s ~2.5k test size).


**2. Data Cleaning**

Applied for all dataframes.

HTML & URL Removal: Web-scraped data (Google/Reddit) often contains artifacts

Specific Reddit user mentions removal and etc...


**3.Preparation for models**

For Encoders (BERT/RoBERTa):

Tokenized inputs (input_ids, attention_mask) padded to 512 tokens.

Verified [CLS] and [SEP] tokens are correctly placed.

For Decoder:

Formatted inputs as Instruction Prompts (e.g., "Predict if the given text is sarcastic...").

Formatted targets as Text Strings ("0" or "1") instead of integers.



**4. Class Imbalance Handling**

Calculated Class Weights separately for each task.

Sentiment: Balanced weights (~1.0).

Sarcasm: Heavily weighted to penalize missing the minority class (14% positive).

Saved weights as tensors for use in CrossEntropyLoss.

**Outcome:** All data files (.pt tensors for Encoders, Decoder, and weight tensors) are saved.

# Task
Load the 'bert-base-cased' tokenizer. Then, define a function to tokenize text without truncation and calculate the length of the resulting token IDs. Apply this function to the 'text_clean' column of `train_sentiment` and `train_sarcasm` dataframes to get their token lengths. After that, filter the sentiment and sarcasm training dataframes to identify records where the token length is greater than 512. Finally, print the number of records longer than 512 tokens for both sentiment and sarcasm datasets and display a few sample records from these filtered dataframes, including their original text and calculated token length.

## Load Tokenizer

### Subtask:
Load the 'bert-base-cased' tokenizer.


**Reasoning**:
To load the 'bert-base-cased' tokenizer, I will use the `AutoTokenizer.from_pretrained()` method as specified in the instructions.



In [None]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
print("Tokenizer 'bert-base-cased' loaded successfully.")

Tokenizer 'bert-base-cased' loaded successfully.


**Reasoning**:
The previous code block failed because `AutoTokenizer` was not defined. I need to re-import `AutoTokenizer` from the `transformers` library to resolve this `NameError`.



In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
print("Tokenizer 'bert-base-cased' loaded successfully.")

Tokenizer 'bert-base-cased' loaded successfully.


**Reasoning**:
I need to define a function that tokenizes text without truncation and calculates the length of the resulting token IDs. Then, I will apply this function to the 'text_clean' column of both `train_sentiment` and `train_sarcasm` dataframes to get their token lengths. The `tokenizer` variable is already loaded from the previous step.



In [None]:
def get_token_length(text):
    # Tokenize the text without truncation to get the full token list
    tokens = tokenizer(text, truncation=False, return_tensors=None)
    # The length of input_ids corresponds to the number of tokens
    return len(tokens['input_ids'])

# Apply the function to sentiment training data
train_sentiment['token_length'] = train_sentiment['text_clean'].apply(get_token_length)
print("Token lengths calculated for train_sentiment.")

# Apply the function to sarcasm training data
train_sarcasm['token_length'] = train_sarcasm['text_clean'].apply(get_token_length)
print("Token lengths calculated for train_sarcasm.")

Token indices sequence length is longer than the specified maximum sequence length for this model (524 > 512). Running this sequence through the model will result in indexing errors


Token lengths calculated for train_sentiment.
Token lengths calculated for train_sarcasm.


**Reasoning**:
The previous code block failed because the `train_sentiment` and `train_sarcasm` dataframes were not defined in the current execution context. I need to re-initialize these dataframes by including the necessary data loading, cleaning, and splitting steps from previous successful notebook cells before proceeding with the token length calculation.



In [None]:
import re
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Data Loading (from cell -0Xd-TcOzn5F)
dataset = load_dataset("unswnlporg/BESSTIE")
train = dataset["train"].to_pandas()
val = dataset["validation"].to_pandas()

# Sentiment vs Sarcasm Task splitting (from cell DIyMEON2lV2N)
sent_train = train[train['task'] == 'Sentiment'].copy()
sent_val = val[val['task'] == 'Sentiment'].copy()
sarc_train = train[train['task'] == 'Sarcasm'].copy()
sarc_val = val[val['task'] == 'Sarcasm'].copy()

# Cleaning function (from cell bbVYVjqmz_RG)
def clean_text(text):
    if pd.isna(text) or text == "":
        return ""
    text = str(text)
    text = re.sub(r"(u/|@)\w+", "[USER]", text)
    text = re.sub(r"&\w+;", " ", text)
    text = re.sub(r"https?://\S+|www\.\S+", "[URL]", text)
    text = re.sub(r"#(\w+)", r"\1", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Apply cleaning function (from cell qrT1-H_mz_Oo)
def apply_cleaning(df, name="Dataset"):
    df = df.copy()
    df['text_clean'] = df['text'].apply(clean_text)
    df_clean = df[df['text_clean'].notna()]
    return df_clean

sent_train = apply_cleaning(sent_train, "Sentiment Train")
sarc_train = apply_cleaning(sarc_train, "Sarcasm Train")

# Train / Val / Test Splitting function (from cell HARmIkbbmAnG)
def internal_split(df):
    train_split, val_split = train_test_split(
        df,
        test_size=0.10,
        random_state=42,
        stratify=df['label']
    )
    return train_split, val_split

# Apply internal split
train_sentiment, val_sentiment = internal_split(sent_train)
train_sarcasm, val_sarcasm = internal_split(sarc_train)

# Original code for token length calculation
def get_token_length(text):
    tokens = tokenizer(text, truncation=False, return_tensors=None)
    return len(tokens['input_ids'])

train_sentiment['token_length'] = train_sentiment['text_clean'].apply(get_token_length)
print("Token lengths calculated for train_sentiment.")

train_sarcasm['token_length'] = train_sarcasm['text_clean'].apply(get_token_length)
print("Token lengths calculated for train_sarcasm.")

Token lengths calculated for train_sentiment.
Token lengths calculated for train_sarcasm.
