# Text Preprocessing

This notebook covers:
1. Load raw data
2. Text cleaning functions
3. Tokenization logic
4. Vocabulary creation
5. Padding / truncation
6. Save processed data

## 1. Load raw data

In [1]:
from pathlib import Path

import pandas as pd

DATA_DIR = Path("../data")

# Both splits are parsed the same way: no header row, ";" between fields,
# and the two columns are named explicitly.
read_options = {"sep": ";", "header": None, "names": ["text", "emotion"]}

train_df = pd.read_csv(DATA_DIR / "train.txt", **read_options)
test_df = pd.read_csv(DATA_DIR / "test.txt", **read_options)

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

# Bare last expression so the notebook renders a preview of the training frame.
train_df.head()

Train shape: (16000, 2)
Test shape: (2000, 2)


Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


## 2. Text cleaning functions

In [2]:
import re


def clean_text(text: str) -> str:
    """Normalize raw text to lowercase letters separated by single spaces.

    The text is lowercased, every character other than ``a``-``z`` or
    whitespace is replaced by a space, runs of whitespace are collapsed to a
    single space, and leading/trailing whitespace is removed.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string (possibly empty).
    """
    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)
    text = re.sub(r"\s+", " ", text)
    # Strip last: the first substitution turns leading/trailing punctuation or
    # digits into spaces, so stripping before it would leave stray padding
    # (e.g. "Hello!" -> "hello ").
    return text.strip()


# Spot-check the cleaner on the first training row, showing input and output.
sample_text = train_df.loc[0, "text"]
sample_clean = clean_text(sample_text)
print("Raw:", sample_text)
print("Clean:", sample_clean)

Raw: i didnt feel humiliated
Clean: i didnt feel humiliated


## 3. Tokenization logic

In [3]:
def tokenize(text: str) -> list[str]:
    """Split text into tokens on runs of whitespace.

    Args:
        text: String to split.

    Returns:
        The tokens in order of appearance; an empty list when ``text`` is
        empty or contains only whitespace.
    """
    tokens = text.split()
    return tokens


# Derive the cleaned text and its tokens for both splits with the same steps.
for frame in (train_df, test_df):
    frame["clean_text"] = frame["text"].apply(clean_text)
    frame["tokens"] = frame["clean_text"].apply(tokenize)

# Show raw text, cleaned text, tokens and label side by side.
train_df[["text", "clean_text", "tokens", "emotion"]].head()

Unnamed: 0,text,clean_text,tokens,emotion
0,i didnt feel humiliated,i didnt feel humiliated,"[i, didnt, feel, humiliated]",sadness
1,i can go from feeling so hopeless to so damned...,i can go from feeling so hopeless to so damned...,"[i, can, go, from, feeling, so, hopeless, to, ...",sadness
2,im grabbing a minute to post i feel greedy wrong,im grabbing a minute to post i feel greedy wrong,"[im, grabbing, a, minute, to, post, i, feel, g...",anger
3,i am ever feeling nostalgic about the fireplac...,i am ever feeling nostalgic about the fireplac...,"[i, am, ever, feeling, nostalgic, about, the, ...",love
4,i am feeling grouchy,i am feeling grouchy,"[i, am, feeling, grouchy]",anger


## 4. Vocabulary creation

In [4]:
from collections import Counter


def build_vocab(token_lists: pd.Series, min_freq: int = 2) -> dict[str, int]:
    """Build a token -> integer id mapping from tokenized texts.

    Ids 0 and 1 are reserved for ``<PAD>`` and ``<UNK>``. The remaining ids
    are handed out consecutively to every token seen at least ``min_freq``
    times, ordered by descending frequency with ties broken alphabetically.

    Args:
        token_lists: Iterable of token lists (for example a DataFrame column).
        min_freq: Minimum number of occurrences for a token to be kept.

    Returns:
        Dictionary mapping each kept token (plus the two reserved entries) to
        a unique id.
    """
    vocab = {"<PAD>": 0, "<UNK>": 1}
    counter = Counter(token for tokens in token_lists for token in tokens)

    # Skip tokens that collide with a reserved entry: re-inserting one would
    # overwrite its reserved id without growing the dict, so the following
    # token would be assigned the same id.
    kept_tokens = [
        tok for tok, freq in counter.items() if freq >= min_freq and tok not in vocab
    ]
    kept_tokens.sort(key=lambda t: (-counter[t], t))

    for token in kept_tokens:
        vocab[token] = len(vocab)
    return vocab


# The vocabulary is built from the training tokens only.
vocab = build_vocab(train_df["tokens"], min_freq=2)
print(f"Vocab size: {len(vocab)}")
# Iterating a dict yields its keys in insertion order, i.e. in id order here.
print("First 20 tokens:", list(vocab)[:20])

Vocab size: 7401
First 20 tokens: ['<PAD>', '<UNK>', 'i', 'feel', 'and', 'to', 'the', 'a', 'feeling', 'that', 'of', 'my', 'in', 'it', 'like', 'so', 'for', 'im', 'me', 'but']


## 5. Padding / truncation

In [5]:
def encode_tokens(tokens: list[str], vocab_map: dict[str, int]) -> list[int]:
    """Convert tokens to vocabulary ids.

    Args:
        tokens: Tokens to encode.
        vocab_map: Token -> id mapping; must contain a ``"<UNK>"`` entry.

    Returns:
        One id per token, in order. Tokens missing from ``vocab_map`` receive
        the ``"<UNK>"`` id.

    Raises:
        KeyError: If ``vocab_map`` has no ``"<UNK>"`` key.
    """
    fallback_id = vocab_map["<UNK>"]
    encoded = []
    for token in tokens:
        encoded.append(vocab_map.get(token, fallback_id))
    return encoded


def pad_or_truncate(ids: list[int], max_len: int, pad_id: int = 0) -> list[int]:
    """Force a list of ids to exactly ``max_len`` entries.

    Args:
        ids: Encoded sequence.
        max_len: Target length.
        pad_id: Id appended on the right when the sequence is too short.

    Returns:
        A new list: the first ``max_len`` ids when the input is longer than
        ``max_len``, otherwise the input followed by ``pad_id`` fillers.
    """
    missing = max_len - len(ids)
    if missing < 0:
        # Too long: keep only the leading max_len ids.
        return ids[:max_len]
    return ids + [pad_id] * missing


# The target length comes from the training split only: the 95th percentile of
# the token counts, truncated to an int.
train_lengths = train_df["tokens"].apply(len)
max_len = int(train_lengths.quantile(0.95))
print(f"Max length (95th percentile): {max_len}")

# Encode with the shared vocabulary, then pad/truncate every sequence to max_len.
for frame in (train_df, test_df):
    encoded = frame["tokens"].apply(lambda t: encode_tokens(t, vocab))
    frame["input_ids"] = encoded.apply(lambda ids: pad_or_truncate(ids, max_len))

train_df[["tokens", "input_ids"]].head()

Max length (95th percentile): 41


Unnamed: 0,tokens,input_ids
0,"[i, didnt, feel, humiliated]","[2, 139, 3, 686, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,"[i, can, go, from, feeling, so, hopeless, to, ...","[2, 40, 101, 60, 8, 15, 497, 5, 15, 3642, 557,..."
2,"[im, grabbing, a, minute, to, post, i, feel, g...","[17, 3239, 7, 1165, 5, 288, 2, 3, 496, 448, 0,..."
3,"[i, am, ever, feeling, nostalgic, about, the, ...","[2, 24, 165, 8, 671, 27, 6, 4517, 2, 59, 48, 9..."
4,"[i, am, feeling, grouchy]","[2, 24, 8, 1075, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


## 6. Save processed data

In [6]:
import json


def ids_to_string(ids: list[int]) -> str:
    """Serialize a sequence of ids as one space-separated string.

    Args:
        ids: Ids to serialize.

    Returns:
        The ``str()`` form of each id joined by single spaces; an empty string
        for an empty input.
    """
    return " ".join(map(str, ids))


output_dir = DATA_DIR / "processed"
output_dir.mkdir(parents=True, exist_ok=True)

# Work on copies so the list-valued input_ids columns of the in-memory frames
# are not replaced by their string form.
train_out = train_df.copy()
test_out = test_df.copy()
for frame in (train_out, test_out):
    frame["input_ids"] = frame["input_ids"].apply(ids_to_string)

# Only the serialized ids and the label are written to the CSV files.
csv_targets = [
    (train_out, output_dir / "train_processed.csv"),
    (test_out, output_dir / "test_processed.csv"),
]
for frame, csv_path in csv_targets:
    frame[["input_ids", "emotion"]].to_csv(csv_path, index=False)

metadata = {"max_len": max_len, "vocab_size": len(vocab)}
json_targets = [
    (vocab, output_dir / "vocab.json"),
    (metadata, output_dir / "metadata.json"),
]
for payload, json_path in json_targets:
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)

# Report every written file in the order it was saved.
print("Saved:")
for _, saved_path in csv_targets + json_targets:
    print("-", saved_path)

Saved:
- ../data/processed/train_processed.csv
- ../data/processed/test_processed.csv
- ../data/processed/vocab.json
- ../data/processed/metadata.json
