# Fine-tuning with SST-5

In [None]:
%pip install datasets

### Loading SST-5 text data

In [2]:
import pandas as pd
import numpy as np

In [5]:
def load_text_data(data_type):
  """Read one SST-5 split from disk as a text/rating frame.

  Reads "SST-5_{data_type}.parquet" from the working directory, removes the
  "vectors" column and renames the "truth" column to "rating".

  Args:
    data_type: split name interpolated into the file name (the notebook uses
      "train", "validation" and "test").

  Returns:
    The resulting pandas DataFrame.
  """
  raw = pd.read_parquet(f"SST-5_{data_type}.parquet")
  return raw.drop(columns=["vectors"]).rename(columns={"truth": "rating"})

In [6]:
# Load the three SST-5 splits through the same helper
df_train, df_validation, df_test = (
  load_text_data(split_name) for split_name in ("train", "validation", "test")
)

### Forming the data to feed into the fine-tuning process

In [7]:
# https://huggingface.co/docs/datasets/
from datasets import Dataset

In [8]:
def create_training_triples(comparison_list, random_state=123, df=None):
  """Build (anchor, positive, negative) text triples from opposing rating pairs.

  For every (rating1, rating2) pair, the rows of each rating are shuffled and
  the first 3*k rows of each are cut into three chunks of k texts, where
  k = min(number of rating1 rows, number of rating2 rows) // 3. Two groups of
  k triples are emitted per pair:
    * anchor and positive from rating1, negative from rating2
    * anchor and positive from rating2, negative from rating1
  Within a pair no row is used more than once.

  Args:
    comparison_list: iterable of (rating1, rating2) tuples.
    random_state: seed passed to DataFrame.sample for both shuffles.
    df: DataFrame with "text" and "rating" columns to draw from. When omitted,
      the notebook-level df_train is used (looked up at call time), which
      keeps existing calls working unchanged.

  Returns:
    A datasets.Dataset with "anchor", "positive" and "negative" columns.
  """
  import warnings

  if df is None:
    df = df_train

  triples = {"anchor": [], "positive": [], "negative": []}
  for rating1, rating2 in comparison_list:
    texts1 = df[df["rating"] == rating1].sample(frac=1, random_state=random_state)["text"]
    texts2 = df[df["rating"] == rating2].sample(frac=1, random_state=random_state)["text"]

    chunk = min(len(texts1), len(texts2)) // 3
    if chunk == 0:
      # Nothing can be built for this pair; say so instead of silently
      # returning fewer (or zero) triples.
      warnings.warn(
        f"No triples generated for ratings ({rating1}, {rating2}): "
        "each rating needs at least 3 rows.",
        stacklevel=2,
      )
      continue

    # .iloc makes the slicing positional regardless of the frame's index.
    first1, second1, third1 = (
      texts1.iloc[i * chunk:(i + 1) * chunk].tolist() for i in range(3)
    )
    first2, second2, third2 = (
      texts2.iloc[i * chunk:(i + 1) * chunk].tolist() for i in range(3)
    )

    # Group 1: anchor/positive share rating1, negative comes from rating2.
    triples["anchor"] += first1
    triples["positive"] += second1
    triples["negative"] += first2

    # Group 2: anchor/positive share rating2, negative comes from rating1.
    triples["anchor"] += second2
    triples["positive"] += third2
    triples["negative"] += third1

  return Dataset.from_dict(triples)

In [9]:
# Contrast the extreme ratings (5 vs 1) and the milder ones (4 vs 2); rating 3 is not used.
# NOTE(review): assumes the "rating" column runs 1-5 — confirm against the parquet files.
triples = create_training_triples([(5, 1), (4, 2)])

### Fine-tuning

In [None]:
%pip install sentence-transformers

In [None]:
%pip install flash_attn

In [12]:
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, SentenceTransformerTrainingArguments, losses
from sentence_transformers.training_args import BatchSamplers

# https://www.sbert.net/docs/sentence_transformer/training_overview.html
from datetime import datetime

In [13]:
# Model identifier passed to SentenceTransformer, and its last path component,
# which is used to name output directories and files
model_path = "infgrad/stella_en_1.5B_v5"
model_name = model_path.rsplit("/", 1)[-1]

In [34]:
# Load the pretrained model and move it to the GPU (.cuda() requires a CUDA device).
# NOTE(review): trust_remote_code=True allows custom code shipped with the model
# repository to run locally — confirm the repository is trusted.
model = SentenceTransformer(model_path, trust_remote_code=True).cuda()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [35]:
# Print the module pipeline to see which submodules the model is made of
print(model)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: Qwen2Model 
  (1): Pooling({'word_embedding_dimension': 1536, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Dense({'in_features': 1536, 'out_features': 1024, 'bias': True, 'activation_function': 'torch.nn.modules.linear.Identity'})
)


In [37]:
# Freeze the transformer submodule: turn off gradients for all of its parameters
auto_model = model._first_module().auto_model
auto_model.requires_grad_(False);  # trailing ';' keeps the notebook from echoing the module

In [38]:
# NOTE(review): MultipleNegativesRankingLoss presumably also treats the other
# samples in a batch as negatives. The triples built above contain many anchors
# with the same rating, so same-rating texts could end up as in-batch
# negatives — verify against the loss documentation.
loss = losses.MultipleNegativesRankingLoss(model)

In [39]:
# https://www.sbert.net/docs/package_reference/sentence_transformer/training_args.html#sentence_transformers.training_args.SentenceTransformerTrainingArguments

# Training configuration handed to SentenceTransformerTrainer.
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir=f"models/embeddings/{model_name}",

    # Optional training parameters:
    num_train_epochs=3, # default 3
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,  # NOTE(review): the trainer in this notebook is given no eval dataset — presumably unused
    learning_rate=1e-5, # default 5e-5
    warmup_ratio=0.1, # Ratio of total training steps used for a linear warmup from 0 to learning_rate
    fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
    bf16=False,  # Set to True if you have a GPU that supports BF16
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
)

In [40]:
# Train on the triples only.
# NOTE(review): no eval_dataset is passed, so df_validation is not used to
# monitor training — confirm this is intended.
trainer = SentenceTransformerTrainer(
    model=model,
    train_dataset=triples,
    loss=loss,
    args=args,
)

In [None]:
# Timestamp taken just before training starts; later cells reuse it to name
# the saved model directory and the embedding files.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
trainer.train()

In [None]:
# Save the model to a timestamped directory (distinct from the trainer's
# output_dir) and print where it went.
save_model_path = f'./models/embeddings/{model_name}_{timestamp}'
trainer.save_model(save_model_path)

print(save_model_path)

## Save Fine-tuned Embeddings

In [23]:
def save_embeddings(model, name, timestamp, df, df_type):
  """Encode the "text" column of df and save the result as a .npy file.

  The array is written to "./embeddings/{name}_{timestamp}_{df_type}.npy";
  the "./embeddings" directory is created if it does not exist yet.

  Args:
    model: object whose encode(list_of_texts) returns the embeddings.
    name: model name used in the file name.
    timestamp: run identifier used in the file name.
    df: DataFrame with a "text" column.
    df_type: split label used in the file name (e.g. "train").
  """
  import os

  # Hand the encoder a plain list rather than a pandas Series, so that any
  # integer indexing it does on its input is positional, not label-based.
  texts = df["text"].tolist()
  embeddings = model.encode(texts)

  out_path = f"./embeddings/{name}_{timestamp}_{df_type}.npy"
  # np.save does not create missing parent directories.
  os.makedirs(os.path.dirname(out_path), exist_ok=True)
  np.save(out_path, embeddings)

In [24]:
# Write one embeddings file per split, in train / validation / test order
for split_label, split_df in (
  ("train", df_train),
  ("validation", df_validation),
  ("test", df_test),
):
  save_embeddings(model, model_name, timestamp, split_df, split_label)