# Computing RoBERTa Embeddings for Stemmed Headlines

In [None]:
import pandas as pd

In [None]:
# Load the normalized headlines table; the path is relative to the notebook's working directory.
norm_df = pd.read_csv(filepath_or_buffer='new_normalized_headlines.csv')

In [None]:
# Preview the first rows to confirm the expected columns (news, headline, cleaned, lemmatized, stemmed, ...) loaded.
norm_df.head()

Unnamed: 0,url,news,headline,cleaned,lemmatized,stemmed,word_count
0,https://www.foxnews.com/lifestyle/jack-carrs-e...,Fox News,jack carr recalls eisenhower's d-day memo 'gre...,jack carr recalls eisenhowers dday memo great ...,jack carr recall eisenhower dday memo great no...,jack carr recal eisenhow dday memo great nobl ...,9
1,https://www.foxnews.com/entertainment/bruce-wi...,Fox News,"bruce willis, demi moore avoided one thing co-...",bruce willis demi moore avoided one thing copa...,bruce willis demi moore avoided one thing copa...,bruce willi demi moor avoid one thing copar da...,10
2,https://www.foxnews.com/politics/blinken-meets...,Fox News,"blinken meets qatar pm, says israeli actions '...",blinken meets qatar pm says israeli actions re...,blinken meet qatar pm say israeli action retal...,blinken meet qatar pm say isra action retali d...,11
3,https://www.foxnews.com/entertainment/emily-bl...,Fox News,emily blunt says ‘toes curl’ people tell kids ...,emily blunt says toes curl people tell kids wa...,emily blunt say toe curl people tell kid want ...,emili blunt say toe curl peopl tell kid want a...,15
4,https://www.foxnews.com/media/the-view-co-host...,Fox News,"'the view' co-host, cnn commentator ana navarr...",the view cohost cnn commentator ana navarro ho...,the view cohost cnn commentator ana navarro ho...,the view cohost cnn comment ana navarro host n...,12


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the stemmed headlines for testing. The split is stratified on
# the outlet label and seeded so the same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    norm_df['stemmed'],
    norm_df['news'],
    test_size=0.2,
    random_state=42,
    stratify=norm_df['news'],
)

In [None]:
# Convert outlet labels to binary targets: 1 = 'Fox News', 0 = any other outlet
# (NBC according to the original note — the exact label string is not checked here).
#
# The targets are looked up in norm_df through each split's row index rather than
# by transforming y_train / y_test in place. An in-place conversion is not
# idempotent: on a second run the already-converted 1s no longer equal
# 'Fox News', so every label would silently become 0.
y_train = norm_df.loc[y_train.index, 'news'].eq('Fox News').astype('int64')
y_test = norm_df.loc[y_test.index, 'news'].eq('Fox News').astype('int64')

In [None]:
from transformers import RobertaTokenizer, RobertaModel
import torch

# Load the pre-trained RoBERTa tokenizer and encoder.
# The roberta-base checkpoint ships without pooler weights, so the default model
# would attach a randomly initialised pooler head (and warn about it). Only
# last_hidden_state is read in this notebook, so the pooler is left out.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base', add_pooling_layer=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Put the model in evaluation mode (disables the Dropout layers) before extracting embeddings.
model.eval()

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dr

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Move the model's weights onto the selected device.
model = model.to(device)

Using device: cuda


In [None]:
def get_embeddings(texts, tokenizer, model, max_len=128, batch_size=32):
    """Embed each text as the final-layer hidden state of its first (<s>) token.

    Texts are tokenized and passed through ``model`` in mini-batches rather
    than one forward pass per text, and each batch is padded only to its
    longest sequence (capped at ``max_len``) instead of always to ``max_len``.
    Padded positions are described by the attention mask the tokenizer returns.

    NOTE(review): results are expected to match per-text, max-length-padded
    inference up to floating-point tolerance, not bit-for-bit — confirm if
    exact reproduction of previously saved arrays matters.

    Args:
        texts: Iterable of strings (e.g. a pandas Series). It is materialised
            into a list once.
        tokenizer: Callable that accepts a list of strings and returns a
            mapping of tensors when called with ``return_tensors='pt'``.
        model: Callable model exposing ``.device`` whose output has a
            ``last_hidden_state`` tensor indexed as (batch, token, hidden).
        max_len: Maximum number of tokens per text; longer inputs are truncated.
        batch_size: Number of texts per forward pass. Must be at least 1.

    Returns:
        A list with one 1-D NumPy array per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Materialise once so Series, generators and lists can all be sliced below.
    texts = list(texts)
    embeddings = []

    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]

        # Tokenize the whole batch; pad to the longest sequence in this batch.
        inputs = tokenizer(
            batch,
            add_special_tokens=True,
            max_length=max_len,
            padding=True,
            truncation=True,
            return_tensors='pt'
        )

        # Move to the same device as the model
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        # Position 0 holds the <s> token; bring the batch back to CPU as NumPy.
        first_token = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        # Iterating a 2-D array yields one 1-D row per text.
        embeddings.extend(first_token)

    return embeddings

# Embed both splits. Each call runs the model over every headline in the split, so this can be slow.
train_embeddings = get_embeddings(texts=X_train, tokenizer=tokenizer, model=model)
test_embeddings = get_embeddings(texts=X_test, tokenizer=tokenizer, model=model)

In [None]:
# Sanity check: number of embeddings produced for the training split.
print(len(train_embeddings))

3043


In [None]:
import numpy as np
# Stacking the list should give a 2-D (n_train_headlines, hidden_size) matrix.
print(np.array(train_embeddings).shape)

(3043, 768)


In [None]:
# Same check for the test split.
print(np.array(test_embeddings).shape)

(761, 768)


In [None]:
import numpy as np

# Stack each split's list of per-headline vectors into a single NumPy matrix.
train_embeddings_array, test_embeddings_array = map(
    np.array, (train_embeddings, test_embeddings)
)

In [None]:
# Confirm the stacked training matrix has one row per headline.
train_embeddings_array.shape

(3043, 768)

In [None]:
# Persist the embedding matrices so later notebooks can load them without re-running the model.
np.save(file='RoBERTa_train_stem_embeddings.npy', arr=train_embeddings_array)
np.save(file='RoBERTa_test_stem_embeddings.npy', arr=test_embeddings_array)

In [None]:
# Convert the label containers to plain NumPy arrays, then save them next to the embeddings.
y_train, y_test = np.array(y_train), np.array(y_test)
np.save(file='y_train_roberta_stem.npy', arr=y_train)
np.save(file='y_test_roberta_stem.npy', arr=y_test)