# Introduction

This Colab explores the impact of different text embeddings on generalization in a text classification task. I train models on only 1k labelled data points and test on roughly 25k (data from [News Headlines Dataset For Sarcasm Detection](https://www.kaggle.com/code/nilanml/detecting-sarcasm-using-different-embeddings)) to emphasize the role of data quality over extensive model tuning. I compare the accuracy scores achieved with various embeddings to identify which one best captures the underlying data structure and promotes generalization.



# Data Preparation



## Import Libraries

In [None]:
import warnings

import gensim.downloader as api
import kagglehub
import numpy as np
import pandas as pd
import torch
from google.colab import userdata
from huggingface_hub import login
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Log in to the Hugging Face Hub with the token stored under the Colab
# secret named 'HF_TOKEN'.
# NOTE(review): presumably needed to download gated checkpoints used later in
# this notebook - confirm which models actually require it.
login(token=userdata.get('HF_TOKEN'))

## Train / Test Split

In [None]:
# Download the Kaggle dataset and keep only the headline text and its label
path = kagglehub.dataset_download("rmisra/news-headlines-dataset-for-sarcasm-detection")
df = pd.read_json(
    path + '/Sarcasm_Headlines_Dataset.json',
    lines=True,
)[['headline', 'is_sarcastic']]

# Hold out 96.25% of the rows for testing so that only a small labelled
# sample is left for training; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['headline'],
    df['is_sarcastic'],
    random_state=42,
    test_size=0.9625,
)

## Define Models and Hyperparameter Search Strategy

In [None]:
# Candidate classifiers, each paired with the hyperparameter values that the
# randomized search may sample from. Every entry has the same two keys:
#   "model"      - an unfitted estimator with a fixed random seed
#   "param_grid" - hyperparameter name -> list of candidate values
models = {
    "Random Forest": dict(
        model=RandomForestClassifier(random_state=42),
        param_grid=dict(
            n_estimators=[50, 100, 200],
            max_depth=[None, 10, 20, 30],
            min_samples_split=[2, 5, 10],
            min_samples_leaf=[1, 2, 4],
        ),
    ),
    "Logistic Regression": dict(
        model=LogisticRegression(random_state=42, max_iter=1000),
        param_grid=dict(
            C=[0.001, 0.01, 0.1, 1, 10, 100],
            penalty=['l1', 'l2'],
            solver=['liblinear', 'saga'],
        ),
    ),
    "Gradient Boosting": dict(
        model=GradientBoostingClassifier(random_state=42),
        param_grid=dict(
            n_estimators=[50, 100, 200],
            learning_rate=[0.01, 0.1, 1.0],
            max_depth=[3, 4, 5],
            min_samples_split=[2, 5, 10],
        ),
    ),
}

def train_and_evaluate_model(model, param_grid, X_train, y_train, X_test, y_test):
    """Tune ``model`` with a randomized search and score the winner on the test split.

    Ten hyperparameter combinations are sampled from ``param_grid`` and each is
    scored by 5-fold cross-validated accuracy on the training data. The best
    estimator found is then evaluated once on the held-out data, and both the
    accuracy and the chosen parameters are printed.

    Args:
        model: Unfitted estimator to tune.
        param_grid: Mapping of hyperparameter name to candidate values.
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features.
        y_test: Held-out labels.

    Returns:
        A ``(accuracy, best_model)`` tuple: the held-out accuracy and the best
        estimator found by the search.
    """
    search_settings = dict(
        n_iter=10,
        cv=5,
        scoring="accuracy",
        n_jobs=-1,
        random_state=42,
    )
    search = RandomizedSearchCV(model, param_grid, **search_settings)
    search.fit(X_train, y_train)

    tuned = search.best_estimator_
    predictions = tuned.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)

    estimator_name = type(model).__name__
    print(f"Accuracy of {estimator_name}: {accuracy}")
    print(f"Best parameters: {search.best_params_}")
    return accuracy, tuned

# Initialize a list to store model results.
# Each evaluation loop in this notebook appends one dict per
# (classifier, embedding) pair with the keys "model_name",
# "embedding_strategy", "accuracy" and "best_model".
model_results = []

# Tokenizer Embedding

In [None]:
# Text Preprocessing
# The vocabulary is fitted on the training headlines only; words that are not
# in it are mapped to the "<OOV>" token.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=40000)
tokenizer.fit_on_texts(X_train)

# Turn every headline into a list of integer word ids ...
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)

# ... and pad at the end so that every row has exactly 384 columns.
X_train_padded, X_test_padded = (
    pad_sequences(seq, maxlen=384, padding='post')
    for seq in (X_train_seq, X_test_seq)
)

# --- Tokenized and Padded Data ---
# Baseline: the classifiers see the raw padded word ids as features.
for model_name, model_data in models.items():
    print(f"Training and evaluating {model_name}...")
    accuracy, best_model = train_and_evaluate_model(
        model=model_data["model"],
        param_grid=model_data["param_grid"],
        X_train=X_train_padded,
        y_train=y_train,
        X_test=X_test_padded,
        y_test=y_test,
    )
    model_results.append(dict(
        model_name=model_name,
        embedding_strategy="Tokenized and Padded",
        accuracy=accuracy,
        best_model=best_model,
    ))
    print("-" * 80)

# Word2Vec Embedding

In [None]:
# Word2Vec Embeddings
# Load the pre-trained "word2vec-google-news-300" vectors through gensim's
# downloader module (imported as `api`).
word_vectors = api.load("word2vec-google-news-300")

def word2vec_embedding(texts, word_vectors):
    """Return one mean-pooled Word2Vec vector per text.

    Each text is split on whitespace and the vectors of the words found in
    ``word_vectors`` are averaged. A text with no known word is represented
    by an all-zero vector of length ``word_vectors.vector_size``.

    Args:
        texts: Iterable of strings to embed.
        word_vectors: Vector store exposing ``key_to_index``, item lookup by
            word, and ``vector_size``.

    Returns:
        A NumPy array with one row per input text.
    """
    pooled = []
    for text in texts:
        known = [
            word_vectors[token]
            for token in text.split()
            if token in word_vectors.key_to_index
        ]
        if not known:
            # No token is in the vocabulary: fall back to a zero vector.
            pooled.append(np.zeros(word_vectors.vector_size))
            continue
        pooled.append(np.mean(np.array(known), axis=0))
    return np.array(pooled)

# Mean-pooled Word2Vec features for both splits
X_train_embedded, X_test_embedded = (
    word2vec_embedding(split, word_vectors) for split in (X_train, X_test)
)

# Tune and score every classifier on the Word2Vec features
for model_name, model_data in models.items():
    print(f"Training and evaluating {model_name}...")
    accuracy, best_model = train_and_evaluate_model(
        model=model_data["model"],
        param_grid=model_data["param_grid"],
        X_train=X_train_embedded,
        y_train=y_train,
        X_test=X_test_embedded,
        y_test=y_test,
    )
    model_results.append(dict(
        model_name=model_name,
        embedding_strategy="Word2Vec",
        accuracy=accuracy,
        best_model=best_model,
    ))
    print("-" * 80)

# SentenceTransformer Embeddings

In [None]:
# SentenceTransformer Embeddings
# Checkpoints to compare; each one is loaded by name through SentenceTransformer.
sentence_models = [
    "Alibaba-NLP/gte-Qwen2-7B-instruct",
    "all-distilroberta-v1",
    "all-MiniLM-L6-v2",
    "all-mpnet-base-v2",
    "bert-base-uncased",
    "jinaai/jina-embeddings-v3",
    "multi-qa-mpnet-base-dot-v1",
]

for sentence_model_name in sentence_models:
    print("-" * 120)
    print(f"Using SentenceTransformer model: {sentence_model_name}")

    # NOTE(review): trust_remote_code=True is passed for every checkpoint and
    # allows a repository to run its own Python code at load time - consider
    # enabling it only for the checkpoints that actually require it.
    model = SentenceTransformer(sentence_model_name, trust_remote_code=True)

    # Encode both splits with the current checkpoint
    X_train_embeddings, X_test_embeddings = (
        model.encode(split.tolist()) for split in (X_train, X_test)
    )

    # Tune and score every classifier on this checkpoint's embeddings
    for model_name, model_data in models.items():
        print(f"Training and evaluating {model_name}...")
        accuracy, best_model = train_and_evaluate_model(
            model=model_data["model"],
            param_grid=model_data["param_grid"],
            X_train=X_train_embeddings,
            y_train=y_train,
            X_test=X_test_embeddings,
            y_test=y_test,
        )
        model_results.append(dict(
            model_name=model_name,
            embedding_strategy=f"SentenceTransformer: {sentence_model_name}",
            accuracy=accuracy,
            best_model=best_model,
        ))
        print("-" * 80)

# Tabulate everything collected in model_results up to this point
results_df = pd.DataFrame(model_results)

# Mistral Embeddings

In [None]:
def get_mistral_embedding(text, model, tokenizer, device):
    """
    Generates an embedding for the input text by mean-pooling the last hidden
    layer of a Mistral 7B model over all tokens.

    Args:
        text: The input text (string).
        model: The Mistral model; it is called with ``output_hidden_states=True``.
        tokenizer: The Mistral tokenizer; it is called with ``return_tensors="pt"``.
        device: The device to move the tokenized inputs to (e.g. 'cuda' or 'cpu').
            If None, the device reported by ``model.device`` is used; if the
            model exposes no ``device`` attribute the inputs are left where
            the tokenizer created them.

    Returns:
        A float32 PyTorch tensor with the batch axis removed (one value per
        hidden dimension for a single input string).
        Returns None if the text is empty after tokenization.
    """
    if device is None:
        # Follow the model's own placement instead of handing None to
        # BatchEncoding.to(), which would leave the inputs on the CPU.
        device = getattr(model, "device", None)

    # Tokenize the input text and move it next to the model
    inputs = tokenizer(text, return_tensors="pt")
    if device is not None:
        inputs = inputs.to(device)

    # Nothing to pool if tokenization produced no tokens
    if inputs.input_ids.size(1) == 0:
        print("Warning: Empty input after tokenization.")
        return None

    # Forward pass without gradient tracking, requesting all hidden states
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    # Last entry of hidden_states is the final layer's output
    hidden_states = outputs.hidden_states[-1]

    # Pool in float32: half-precision averages lose accuracy, and bfloat16
    # tensors cannot be converted with .numpy() by the callers.
    embedding = torch.mean(hidden_states.float(), dim=1)

    # Drop only the batch axis
    return embedding.squeeze(0)

In [None]:
# Load the Mistral 7B model and tokenizer
model_name = "mistralai/Mistral-7B-v0.1"

# 4-bit NF4 weights with double quantization; compute runs in bfloat16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

# Load the model (with quantization)
# NOTE: `tokenizer` and `model` rebind the names used by earlier cells (the
# Keras tokenizer and the last SentenceTransformer), so those cells must be
# re-run before their objects can be used again.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto"
)

# Inputs have to live where device_map="auto" actually placed the model, so
# read the device from the model instead of guessing from CUDA availability.
device = model.device

In [None]:
# Mistral LLM Embeddings
def _embed_split_with_mistral(texts, labels):
    """Embed every text with Mistral and return (features, labels) row-aligned.

    A text for which get_mistral_embedding returns None is dropped together
    with its label, so the feature matrix and the label vector always have
    the same number of rows.
    """
    vectors = []
    kept_positions = []
    for position, text in enumerate(texts):
        embedding = get_mistral_embedding(text, model, tokenizer, device=device)
        if embedding is None:
            continue
        # float() first: NumPy has no bfloat16, so .numpy() would fail on it.
        vectors.append(embedding.float().cpu().numpy())
        kept_positions.append(position)
    return np.array(vectors), np.asarray(labels)[kept_positions]


X_train_mistral_embeddings, y_train_mistral = _embed_split_with_mistral(X_train, y_train)
X_test_mistral_embeddings, y_test_mistral = _embed_split_with_mistral(X_test, y_test)

# Tune and score every classifier on the Mistral embeddings
for model_name, model_data in models.items():
    print(f"Training and evaluating {model_name}...")
    accuracy, best_model = train_and_evaluate_model(
        model_data["model"],
        model_data["param_grid"],
        X_train_mistral_embeddings,
        y_train_mistral,
        X_test_mistral_embeddings,
        y_test_mistral
    )
    model_results.append({
        "model_name": model_name,
        "embedding_strategy": "Mistral LLM Embeddings",
        "accuracy": accuracy,
        "best_model": best_model
    })
    print("-" * 80)

# Rebuild the summary table so that it also contains the Mistral rows, which
# were appended after results_df was first created.
results_df = pd.DataFrame(model_results)