### Loading data frames with vectors before fine-tuning

In [16]:
import pandas as pd

In [17]:
# Load the SST-5 training split from a local parquet file and display it.
# The rendered frame shows `label`, `text` and `vector` columns; the vectors
# are presumably embeddings computed before fine-tuning (per the section
# heading) -- TODO confirm which model produced them.
df_train = pd.read_parquet("SST-5_train.parquet")
df_train

Unnamed: 0,label,text,vector
0,4,The Rock is destined to be the 21st Century 's...,"[0.028988052159547806, -0.009446061216294765, ..."
1,5,The gorgeously elaborate continuation of `` Th...,"[0.009347192011773586, -0.024268826469779015, ..."
2,4,Singer/composer Bryan Adams contributes a slew...,"[0.008711128495633602, 0.02659982070326805, 0...."
3,3,You 'd think by now America would have had eno...,"[0.02907816506922245, -0.0135031184181571, -0...."
4,4,Yet the act is still charming here .,"[-0.009661787189543247, -0.008504915982484818,..."
...,...,...,...
8539,1,A real snooze .,"[0.02205696515738964, 0.005160005763173103, -0..."
8540,2,No surprises .,"[-0.025823453441262245, 0.01339965220540762, 0..."
8541,4,We 've seen the hippie-turned-yuppie plot befo...,"[0.022835904732346535, 0.005308573134243488, -..."
8542,1,Her fans walked out muttering words like `` ho...,"[0.0018498385325074196, -0.006132118869572878,..."


In [18]:
# Load the SST-5 validation split from a local parquet file and display it.
# The rendered frame shows the same `label`, `text` and `vector` columns as
# the training split.
df_val = pd.read_parquet("SST-5_validation.parquet")
df_val

Unnamed: 0,label,text,vector
0,4,It 's a lovely film with lovely performances b...,"[0.014344158582389355, -0.014246133156120777, ..."
1,3,"No one goes unindicted here , which is probabl...","[-0.033927109092473984, 0.004705137573182583, ..."
2,4,And if you 're not nearly moved to tears by a ...,"[0.03554988279938698, 0.005494742188602686, 0...."
3,5,"A warm , funny , engaging film .","[0.0033094482496380806, -0.006415203213691711,..."
4,5,Uses sharp humor and insight into human nature...,"[-0.00372790964320302, -0.01158247422426939, -..."
...,...,...,...
1096,2,it seems to me the film is about the art of ri...,"[-0.0041084508411586285, 0.0022037506569176912..."
1097,2,It 's just disappointingly superficial -- a mo...,"[0.026826655492186546, -0.003068310907110572, ..."
1098,2,The title not only describes its main characte...,"[-0.01506034005433321, -0.018043246120214462, ..."
1099,3,Sometimes it feels as if it might have been ma...,"[0.01700529269874096, -0.019340755417943, 0.01..."


In [19]:
# Load the SST-5 test split from a local parquet file and display it.
# The rendered frame shows the same `label`, `text` and `vector` columns as
# the training split.
df_test = pd.read_parquet("SST-5_test.parquet")
df_test

Unnamed: 0,label,text,vector
0,3,Effective but too-tepid biopic,"[0.03771601989865303, -0.01407540775835514, -0..."
1,4,If you sometimes like to go to the movies to h...,"[0.004801097325980663, 0.016335567459464073, -..."
2,5,"Emerges as something rare , an issue movie tha...","[-0.007570621557533741, -0.01461589615792036, ..."
3,3,The film provides some great insight into the ...,"[0.017468415200710297, 0.0014129610499367118, ..."
4,5,Offers that rare combination of entertainment ...,"[-0.0018032152438536286, 0.009503910318017006,..."
...,...,...,...
2205,4,An imaginative comedy/thriller .,"[0.003715773345902562, -0.017833538353443146, ..."
2206,5,"( A ) rare , beautiful film .","[0.014899306930601597, -0.0012953468831256032,..."
2207,5,( An ) hilarious romantic comedy .,"[0.0038455419708043337, -0.02839694730937481, ..."
2208,4,Never ( sinks ) into exploitation .,"[-0.0060380371287465096, 0.018880721181631088,..."


### Forming the data to feed into the fine-tuning process

In [28]:
from datasets import Dataset
from datasets import concatenate_datasets # https://huggingface.co/docs/datasets/

In [21]:
# Group the training sentences by rating.
# texts[k] holds every text whose label equals k + 1, i.e.
#   texts[0] -> label 1
#   ...
#   texts[4] -> label 5
texts = [
    df_train.loc[df_train['label'] == rating, 'text'].tolist()
    for rating in range(1, 6)
]

In [22]:
# Show how many training texts fall into each of the five label buckets,
# one count per line, starting with label 1.
print(*(len(texts[idx]) for idx in range(5)), sep="\n")

1092
2218
1624
2322
1288


In [23]:
# Build the (anchor, positive, negative) triplet dataset for rating 5.
# Anchors and positives are two disjoint slices of the rating-5 texts
# (texts[4]); negatives are rating-1 texts (texts[0]).

# Number of triplets: half of the rating-5 texts, capped by the number of
# rating-1 texts so that every column ends up with the same length.
len_5 = min(len(texts[4]) // 2, len(texts[0])) # 644 for the current training split

train_dataset_5 = Dataset.from_dict({
    'anchor': texts[4][:len_5],
    # The explicit upper bound drops the leftover element when the class size
    # is odd; columns of unequal length cannot form a single Dataset.
    'positive': texts[4][len_5:2 * len_5],
    'negative': texts[0][:len_5],
})

In [24]:
# Inspect the rating-5 triplet dataset (column names and row count).
train_dataset_5

Dataset({
    features: ['anchor', 'positive', 'negative'],
    num_rows: 644
})

In [25]:
# Build the (anchor, positive, negative) triplet dataset for rating 1.
# Anchors and positives are two disjoint slices of the rating-1 texts
# (texts[0]); negatives are rating-5 texts (texts[4]).

# Number of triplets: half of the rating-1 texts, capped by the number of
# rating-5 texts so that every column ends up with the same length.
len_1 = min(len(texts[0]) // 2, len(texts[4])) # 546 for the current training split

train_dataset_1 = Dataset.from_dict({
    'anchor': texts[0][:len_1],
    # The explicit upper bound drops the leftover element when the class size
    # is odd; columns of unequal length cannot form a single Dataset.
    'positive': texts[0][len_1:2 * len_1],
    'negative': texts[4][:len_1],
})

In [26]:
# Inspect the rating-1 triplet dataset (column names and row count).
train_dataset_1

Dataset({
    features: ['anchor', 'positive', 'negative'],
    num_rows: 546
})

In [29]:
# Stack the rating-1 and rating-5 triplet datasets into one training set.
# Reference: https://huggingface.co/docs/datasets/v1.3.0/processing.html

triplet_parts = [train_dataset_1, train_dataset_5]
train_dataset = concatenate_datasets(triplet_parts)

### Fine-tuning

In [37]:
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, SentenceTransformerTrainingArguments, losses
from sentence_transformers.training_args import BatchSamplers

# https://www.sbert.net/docs/sentence_transformer/training_overview.html

In [33]:
# Base checkpoint that is fine-tuned in this section.
BASE_MODEL_NAME = 'thenlper/gte-large'
model = SentenceTransformer(BASE_MODEL_NAME)

In [34]:
# Ranking loss used for fine-tuning, bound to the base model.
# NOTE(review): this loss presumably also treats the other samples in a batch
# as negatives; because the triplets are built from class labels, same-class
# sentences may end up as in-batch negatives -- confirm this is intended.
loss = losses.MultipleNegativesRankingLoss(model=model)

In [39]:
# Training configuration. Argument reference:
# https://www.sbert.net/docs/package_reference/sentence_transformer/training_args.html#sentence_transformers.training_args.SentenceTransformerTrainingArguments

# Optional training parameters, collected in one place for easy tuning.
optional_training_params = dict(
    num_train_epochs=3,  # default 3
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,  # default 5e-5
    warmup_ratio=0.1,  # share of all training steps spent on linear warmup from 0 to learning_rate
    fp16=True,  # switch to False if the GPU reports it cannot run FP16
    bf16=False,  # switch to True on a GPU that supports BF16
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
)

args = SentenceTransformerTrainingArguments(
    output_dir="models/fine_tuned_gte",  # the only required parameter
    **optional_training_params,
)

In [40]:
# Wire the model, the triplet training set, the loss and the training
# arguments into a trainer. No evaluation dataset is passed here.
trainer = SentenceTransformerTrainer(
    args=args,
    model=model,
    loss=loss,
    train_dataset=train_dataset,
)


In [41]:
# Run fine-tuning. The returned TrainOutput (global step, training loss and
# runtime metrics) is displayed as the cell result.
trainer.train()

  0%|          | 0/225 [00:00<?, ?it/s]

{'train_runtime': 807.4902, 'train_samples_per_second': 4.421, 'train_steps_per_second': 0.279, 'train_loss': 2.733545735677083, 'epoch': 3.0}


TrainOutput(global_step=225, training_loss=2.733545735677083, metrics={'train_runtime': 807.4902, 'train_samples_per_second': 4.421, 'train_steps_per_second': 0.279, 'train_loss': 2.733545735677083, 'epoch': 3.0})

In [43]:
# Persist the fine-tuned model to a directory relative to the working directory.
trainer.save_model('./fine_tuned_gte_july23')

In [44]:
# Load the fine-tuned model back by name; this presumably resolves to the
# local directory written by trainer.save_model -- verify that the working
# directory is unchanged.
model_tuned = SentenceTransformer('fine_tuned_gte_july23')

In [45]:
# Display the module stack of the reloaded model.
model_tuned

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)