### Loading data frames with vectors before fine-tuning

In [1]:
import pandas as pd

In [2]:
df_train = pd.read_parquet("SST-5_train.parquet")
df_train

Unnamed: 0,label,text,vector
0,4,The Rock is destined to be the 21st Century 's...,"[0.028988052159547806, -0.009446061216294765, ..."
1,5,The gorgeously elaborate continuation of `` Th...,"[0.009347192011773586, -0.024268826469779015, ..."
2,4,Singer/composer Bryan Adams contributes a slew...,"[0.008711128495633602, 0.02659982070326805, 0...."
3,3,You 'd think by now America would have had eno...,"[0.02907816506922245, -0.0135031184181571, -0...."
4,4,Yet the act is still charming here .,"[-0.009661787189543247, -0.008504915982484818,..."
...,...,...,...
8539,1,A real snooze .,"[0.02205696515738964, 0.005160005763173103, -0..."
8540,2,No surprises .,"[-0.025823453441262245, 0.01339965220540762, 0..."
8541,4,We 've seen the hippie-turned-yuppie plot befo...,"[0.022835904732346535, 0.005308573134243488, -..."
8542,1,Her fans walked out muttering words like `` ho...,"[0.0018498385325074196, -0.006132118869572878,..."


In [3]:
df_val = pd.read_parquet("SST-5_validation.parquet")
df_val

Unnamed: 0,label,text,vector
0,4,It 's a lovely film with lovely performances b...,"[0.014344158582389355, -0.014246133156120777, ..."
1,3,"No one goes unindicted here , which is probabl...","[-0.033927109092473984, 0.004705137573182583, ..."
2,4,And if you 're not nearly moved to tears by a ...,"[0.03554988279938698, 0.005494742188602686, 0...."
3,5,"A warm , funny , engaging film .","[0.0033094482496380806, -0.006415203213691711,..."
4,5,Uses sharp humor and insight into human nature...,"[-0.00372790964320302, -0.01158247422426939, -..."
...,...,...,...
1096,2,it seems to me the film is about the art of ri...,"[-0.0041084508411586285, 0.0022037506569176912..."
1097,2,It 's just disappointingly superficial -- a mo...,"[0.026826655492186546, -0.003068310907110572, ..."
1098,2,The title not only describes its main characte...,"[-0.01506034005433321, -0.018043246120214462, ..."
1099,3,Sometimes it feels as if it might have been ma...,"[0.01700529269874096, -0.019340755417943, 0.01..."


In [4]:
df_test = pd.read_parquet("SST-5_test.parquet")
df_test

Unnamed: 0,label,text,vector
0,3,Effective but too-tepid biopic,"[0.03771601989865303, -0.01407540775835514, -0..."
1,4,If you sometimes like to go to the movies to h...,"[0.004801097325980663, 0.016335567459464073, -..."
2,5,"Emerges as something rare , an issue movie tha...","[-0.007570621557533741, -0.01461589615792036, ..."
3,3,The film provides some great insight into the ...,"[0.017468415200710297, 0.0014129610499367118, ..."
4,5,Offers that rare combination of entertainment ...,"[-0.0018032152438536286, 0.009503910318017006,..."
...,...,...,...
2205,4,An imaginative comedy/thriller .,"[0.003715773345902562, -0.017833538353443146, ..."
2206,5,"( A ) rare , beautiful film .","[0.014899306930601597, -0.0012953468831256032,..."
2207,5,( An ) hilarious romantic comedy .,"[0.0038455419708043337, -0.02839694730937481, ..."
2208,4,Never ( sinks ) into exploitation .,"[-0.0060380371287465096, 0.018880721181631088,..."


### Forming the data to feed into fine-tuning process

In [33]:
from datasets import Dataset
from datasets import load_dataset # https://huggingface.co/docs/datasets/

In [44]:
# texts[0] contains label 1
# ...
# texts[4] contains label 5

texts = []

for i in range(5):
    texts.append( df_train[df_train['label'] == i+1]['text'].tolist() )

In [45]:
texts[0]

["... a sour little movie at its core ; an exploration of the emptiness that underlay the relentless gaiety of the 1920 's ... The film 's ending has a `` What was it all for ? ''",
 "As it is , it 's too long and unfocused .",
 'A decided lack of spontaneity in its execution and a dearth of real poignancy in its epiphanies .',
 "You 'll forget about it by Monday , though , and if they 're old enough to have developed some taste , so will your kids .",
 "The Ya-Ya 's have many secrets and one is - the books are better .",
 'Wedding feels a bit anachronistic .',
 "The Paradiso 's rusted-out ruin and ultimate collapse during the film 's final ( restored ) third ... emotionally belittle a cinema classic .",
 "Collateral Damage is trash , but it earns extra points by acting as if it were n't .",
 "It would take a complete moron to foul up a screen adaptation of Oscar Wilde 's classic satire .",
 'An uncomfortable movie , suffocating and sometimes almost senseless , The Grey Zone does have 

In [47]:
train_dataset_1 = Dataset.from_dict({
    'anchor': texts[0][0:522],
    'positive': texts[0][522:]
})

ArrowInvalid: Column 1 named positive expected length 522 but got length 570

In [31]:
dataset = load_dataset("parquet", data_files="SST-5_train.parquet")


Generating train split: 0 examples [00:00, ? examples/s]

In [32]:
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'vector'],
        num_rows: 8544
    })
})

In [12]:
squad_dataset = load_dataset('squad')

In [24]:
for i in range(5):
    print(len(texts[f'{i}']))

1092
2218
1624
2322
1288


In [28]:
processed_dataset = datasets.map(texts['i'])

AttributeError: module 'datasets' has no attribute 'map'

In [10]:
import random

In [11]:
text_samples = []

for i in range(5):
    text_samples.append(random.sample(population = texts[i], k = 100))

In [12]:
for i in range(5):
    print(len(text_samples[i]))

100
100
100
100
100


In [13]:
from itertools import combinations

sentence_pairs = []

for i in range(5):
    list_of_texts = text_samples[i]
    sentence_pairs.append(list(combinations(list_of_texts, 2)))

In [14]:
len(sentence_pairs[0][0])

2

In [15]:
from sentence_transformers import InputExample
from tqdm.auto import tqdm  # so we see progress bar

In [16]:
train_sample_list = []

In [17]:
for i in range(5):
    for row in tqdm(sentence_pairs[i]):
        train_sample_list.append(InputExample(
            texts = [row[0], row[1]],
            label = 1 
            # this 1 means max cosine similarity
            # within sentence_pairs[i], they are supposed to be highly similar sentences
        ))

  0%|          | 0/4950 [00:00<?, ?it/s]

  0%|          | 0/4950 [00:00<?, ?it/s]

  0%|          | 0/4950 [00:00<?, ?it/s]

  0%|          | 0/4950 [00:00<?, ?it/s]

  0%|          | 0/4950 [00:00<?, ?it/s]

In [18]:
from torch.utils.data import DataLoader

loader = DataLoader(
    train_sample_list, shuffle = True, batch_size = 32)

### Fine-tuning

In [19]:
from sentence_transformers import models, SentenceTransformer

gte = models.Transformer('thenlper/gte-large') # loading pre-trained model
pooler = models.Pooling(
    gte.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True
)

model = SentenceTransformer(modules=[gte, pooler])

model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [20]:
from sentence_transformers import losses

In [21]:
loss = losses.CosineSimilarityLoss(model)

In [22]:
epochs = 1
warmup_steps = int(len(loader) * epochs * 0.1)

model.fit(
    train_objectives = [(loader, loss)],
    epochs = epochs,
    warmup_steps = warmup_steps,
    output_path = './gte_fine_tuned',
    show_progress_bar = True
)

  0%|          | 0/774 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [30]:
import accelerate

accelerate.__version__

'0.27.2'