## Step 1 - Load Data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Read the reviews CSV and keep only the review text and its score,
# renamed to the 'text' / 'label' column names used by the rest of this section.
datafile_path = "../data/fine_food_reviews_with_embeddings_1k.csv"
df = (
    pd.read_csv(datafile_path)[['Text', 'Score']]
    .rename(columns={'Text': 'text', 'Score': 'label'})
    # Shift the scores down by one so the labels start at zero
    .assign(label=lambda frame: frame['label'] - 1)
)

# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Wrap each pandas split in a Hugging Face Dataset
train_dataset, val_dataset = (
    Dataset.from_pandas(part) for part in (train_df, val_df)
)

## Step 2 - Tokenize the data

In [4]:
from transformers import AutoTokenizer

# Tokenizer that belongs to the e5-small-v2 checkpoint
tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-small-v2")


def tokenize_fn(examples):
    """Tokenize the 'text' field of a batch, padding to max length and truncating."""
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)


# Apply the tokenizer batch-wise to both splits
train_dataset = train_dataset.map(tokenize_fn, batched=True)
val_dataset = val_dataset.map(tokenize_fn, batched=True)

# Expose only the model inputs and the label, returned as PyTorch tensors
torch_columns = ['input_ids', 'attention_mask', 'label']
for split in (train_dataset, val_dataset):
    split.set_format(type='torch', columns=torch_columns)



Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [5]:
# Display the attention-mask column of the tokenized training split.
# It is the last expression in the cell, so it renders as the cell output.
train_dataset['attention_mask']

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])

## Step 3 - Train the model

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Read the reviews CSV; only the raw review text is needed for masked language modeling
datafile_path = "../data/fine_food_reviews_with_embeddings_1k.csv"
df = pd.read_csv(datafile_path)[['Text']]

# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Wrap each pandas split in a Hugging Face Dataset
train_dataset, val_dataset = (
    Dataset.from_pandas(part) for part in (train_df, val_df)
)

In [2]:
from transformers import AutoTokenizer, DataCollatorForLanguageModeling

# Tokenizer from the same checkpoint that is fine-tuned below
tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-small-v2")


def tokenize_fn(examples):
    """Tokenize the 'Text' field of a batch, padding to max length and truncating."""
    raw_texts = examples["Text"]
    return tokenizer(raw_texts, padding="max_length", truncation=True)


# Apply the tokenizer batch-wise to both splits
train_dataset = train_dataset.map(tokenize_fn, batched=True)
val_dataset = val_dataset.map(tokenize_fn, batched=True)

# Collator configured for masked language modeling with a masking probability of 0.15
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)



Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [3]:
from transformers import AutoModelForMaskedLM, TrainingArguments, Trainer

# e5-small-v2 checkpoint wrapped with a masked-language-modeling head.
# (The load warning in this cell's output reports that the head weights are newly initialized.)
model = AutoModelForMaskedLM.from_pretrained("intfloat/e5-small-v2")

# Training configuration: where to write results/logs, then the schedule and optimizer settings
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    evaluation_strategy="epoch",  # evaluate on the validation split once per epoch
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=50,
    weight_decay=0.01,
)

# Wire the model, data splits and MLM collator into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Run fine-tuning
trainer.train()

# Persist the fine-tuned weights and the tokenizer files side by side in one directory
model_save_path = 'saved_model/fine_tuned_mlm'
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

Some weights of BertForMaskedLM were not initialized from the model checkpoint at intfloat/e5-small-v2 and are newly initialized: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/50 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 9.96, 'learning_rate': 5e-05, 'epoch': 0.2}
{'loss': 7.4739, 'learning_rate': 3.7500000000000003e-05, 'epoch': 0.4}
{'loss': 6.8796, 'learning_rate': 2.5e-05, 'epoch': 0.6}
{'loss': 6.7504, 'learning_rate': 1.25e-05, 'epoch': 0.8}
{'loss': 6.7735, 'learning_rate': 0.0, 'epoch': 1.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 6.736252307891846, 'eval_runtime': 24.329, 'eval_samples_per_second': 8.221, 'eval_steps_per_second': 0.164, 'epoch': 1.0}
{'train_runtime': 482.5641, 'train_samples_per_second': 1.658, 'train_steps_per_second': 0.104, 'train_loss': 7.5674598693847654, 'epoch': 1.0}


('saved_model/fine_tuned_mlm/tokenizer_config.json',
 'saved_model/fine_tuned_mlm/special_tokens_map.json',
 'saved_model/fine_tuned_mlm/vocab.txt',
 'saved_model/fine_tuned_mlm/added_tokens.json',
 'saved_model/fine_tuned_mlm/tokenizer.json')

## Step 4 - Load the model and tokenizer

In [10]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Load the tokenizer and model from the saved directory.
# The training step wrote the tokenizer files next to the model weights, so both
# are read from the same path; this keeps the tokenizer tied to the fine-tuned
# artifacts instead of re-fetching the base checkpoint from the hub.
model_load_path = './saved_model/fine_tuned_mlm'
tokenizer = AutoTokenizer.from_pretrained(model_load_path)
# output_hidden_states=True makes the forward pass return the per-layer hidden states
model = AutoModelForMaskedLM.from_pretrained(model_load_path, output_hidden_states=True)

# Ensure the model is in evaluation mode
model.eval()

# Load the data
datafile_path = "../data/fine_food_reviews_with_embeddings_1k.csv"
df = pd.read_csv(datafile_path)

# Extract the text column for processing
df_text = df[['Text']]

# Convert to Hugging Face Dataset objects
dataset = Dataset.from_pandas(df_text)


def tokenize_fn(examples):
    """Tokenize the 'Text' field of a batch, padding to max length and truncating."""
    return tokenizer(examples['Text'], padding="max_length", truncation=True)


# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_fn, batched=True)

# Set the format to PyTorch tensors
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])


def compute_embeddings(examples):
    """Return one embedding per example in a batch.

    Runs the model without gradient tracking on the batch's 'input_ids' and
    'attention_mask' and takes the last hidden state at the first token
    position (the CLS token) as the embedding.

    Returns:
        dict with a single key 'embeddings' holding the batch of embeddings.
    """
    with torch.no_grad():
        outputs = model(input_ids=examples['input_ids'], attention_mask=examples['attention_mask'])
        hidden_states = outputs.hidden_states[-1]  # Get last hidden state
        embeddings = hidden_states[:, 0, :]  # Use the CLS token embeddings
    return {'embeddings': embeddings}


# Compute the embeddings in batches of 16 examples
embeddings_dataset = tokenized_dataset.map(compute_embeddings, batched=True, batch_size=16)

# Extract embeddings and convert to numpy arrays
embeddings = [embedding.numpy() for embedding in embeddings_dataset['embeddings']]

# Add embeddings to DataFrame (one array per row, in the original row order)
df['embeddings'] = embeddings



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [12]:
# Switch to the singular column name 'embedding' before exporting
df.rename(columns={'embeddings': 'embedding'}, inplace=True)

# Pair the review scores from the source CSV with the new embeddings, aligned on the row index
df_old = pd.read_csv(datafile_path)
score_and_embedding = [df_old['Score'], df['embedding']]
df_new = pd.concat(score_and_embedding, axis=1)

# NOTE(review): the file name says 'clm' although the model above was fine-tuned with a
# masked-LM objective; confirm before renaming, since other code may read this exact path.
output_path = '../data/fine_food_reviews_with_e5_small_ft_clm_embeddings_1k.parquet'
df_new.to_parquet(output_path, index=False)