In [1]:
import pandas as pd
from tqdm import trange

## $\S1.$ Loading data frames with vectors before fine-tuning

In [3]:
# Load the SST-5 training split from its parquet file and preview it.
# The rendered preview shows `label`, `text` and `vector` columns; per the section
# title, `vector` presumably holds embeddings computed before fine-tuning.
df_train = pd.read_parquet("SST-5_train_august10.parquet")
df_train

Unnamed: 0,label,text,vector
0,4,The Rock is destined to be the 21st Century 's...,"[0.025059903040528297, -0.0033508273772895336,..."
1,5,The gorgeously elaborate continuation of `` Th...,"[-0.010023828595876694, -0.022591764107346535,..."
2,4,Singer/composer Bryan Adams contributes a slew...,"[0.021090717986226082, 0.01804506592452526, -0..."
3,3,You 'd think by now America would have had eno...,"[0.03466595709323883, -0.028959855437278748, -..."
4,4,Yet the act is still charming here .,"[-0.019601160660386086, -0.04158126562833786, ..."
...,...,...,...
8539,1,A real snooze .,"[0.028580080717802048, 0.020179858431220055, 0..."
8540,2,No surprises .,"[-0.04816627874970436, 0.007212366443127394, 0..."
8541,4,We 've seen the hippie-turned-yuppie plot befo...,"[0.023906568065285683, 0.00044874087325297296,..."
8542,1,Her fans walked out muttering words like `` ho...,"[-0.011076121591031551, -0.011684155091643333,..."


In [4]:
# Load the SST-5 validation split from its parquet file and preview it.
# NOTE(review): `df_val` is not referenced again in the visible part of this notebook.
df_val = pd.read_parquet("SST-5_validation_august10.parquet")
df_val

Unnamed: 0,label,text,vector
0,4,It 's a lovely film with lovely performances b...,"[0.030717480927705765, -0.031038176268339157, ..."
1,3,"No one goes unindicted here , which is probabl...","[-0.054776858538389206, -0.013850521296262741,..."
2,4,And if you 're not nearly moved to tears by a ...,"[0.057316429913043976, 0.0021277640480548143, ..."
3,5,"A warm , funny , engaging film .","[-0.009823216125369072, -0.02523845061659813, ..."
4,5,Uses sharp humor and insight into human nature...,"[0.002628966700285673, -0.012152031995356083, ..."
...,...,...,...
1096,2,it seems to me the film is about the art of ri...,"[-0.023168005049228668, 0.0233288686722517, -0..."
1097,2,It 's just disappointingly superficial -- a mo...,"[0.003064792137593031, 0.024236295372247696, 0..."
1098,2,The title not only describes its main characte...,"[-0.056361518800258636, -0.00899408757686615, ..."
1099,3,Sometimes it feels as if it might have been ma...,"[0.010815807618200779, -0.02518828772008419, 0..."


In [6]:
# Load the SST-5 test split from its parquet file and preview it.
# NOTE(review): `df_test` is not referenced again in the visible part of this notebook.
df_test = pd.read_parquet("SST-5_test_august10.parquet")
df_test

Unnamed: 0,label,text,vector
0,3,Effective but too-tepid biopic,"[0.04016323760151863, -0.04099227488040924, -0..."
1,4,If you sometimes like to go to the movies to h...,"[0.004230129066854715, 0.02602524869143963, -0..."
2,5,"Emerges as something rare , an issue movie tha...","[-0.013230101205408573, -0.018727866932749748,..."
3,3,The film provides some great insight into the ...,"[0.019872618839144707, -0.0012640012428164482,..."
4,5,Offers that rare combination of entertainment ...,"[0.024984797462821007, 0.013797130435705185, -..."
...,...,...,...
2205,4,An imaginative comedy/thriller .,"[-0.020144404843449593, -0.023559458553791046,..."
2206,5,"( A ) rare , beautiful film .","[0.029272060841321945, 0.024908604100346565, -..."
2207,5,( An ) hilarious romantic comedy .,"[-0.00450526038184762, -0.04325564205646515, -..."
2208,4,Never ( sinks ) into exploitation .,"[-0.011091343127191067, 0.0322980172932148, -0..."


## $\S2.$ Fine-tuning process

In [7]:
from datasets import Dataset
from datasets import concatenate_datasets # https://huggingface.co/docs/datasets/

In [8]:
# Bucket the training sentences by star rating:
#   texts[0] -> sentences with label 1
#   ...
#   texts[4] -> sentences with label 5
texts = [
    df_train.loc[df_train['label'] == rating, 'text'].tolist()
    for rating in range(1, 6)
]

In [9]:
# One line per rating bucket (labels 1..5): number of training sentences in it.
for bucket in texts:
    print(len(bucket))

1092
2218
1624
2322
1288


### 2.1. July 30: Teaching the model to tell 1-star ratings and 5-star ratings apart

In [8]:
# Triplet dataset (anchor, positive, negative) for rating 5 against rating 1:
#   anchor   -> first half of the 5-star sentences
#   positive -> second half of the 5-star sentences
#   negative -> the first `len_5` 1-star sentences

len_5 = len(texts[4]) // 2 # 644

train_dataset_5_first = Dataset.from_dict({
    'anchor': texts[4][0:len_5],
    # Upper bound keeps this column exactly `len_5` long even if the class size is
    # odd; Dataset.from_dict needs every column to have the same length.
    'positive': texts[4][len_5:2 * len_5],
    'negative': texts[0][0:len_5]
})

In [9]:
# Same 5-star anchors and positives as `train_dataset_5_first`, but the negatives
# are the last `len_5` 1-star sentences instead of the first ones.
train_dataset_5_second = Dataset.from_dict({
    'anchor': texts[4][0:len_5],
    # Upper bound keeps this column exactly `len_5` long even if the class size is
    # odd; Dataset.from_dict needs every column to have the same length.
    'positive': texts[4][len_5:2 * len_5],
    'negative': texts[0][-len_5:]
})

In [10]:
# Sanity check: show the dataset summary (feature names and number of rows).
train_dataset_5_second

Dataset({
    features: ['anchor', 'positive', 'negative'],
    num_rows: 644
})

In [11]:
# Triplet dataset (anchor, positive, negative) for rating 1 against rating 5:
#   anchor   -> first half of the 1-star sentences
#   positive -> second half of the 1-star sentences
#   negative -> the first `len_1` 5-star sentences

len_1 = len(texts[0]) // 2 # 546

train_dataset_1_first = Dataset.from_dict({
    'anchor': texts[0][0:len_1],
    # Upper bound keeps this column exactly `len_1` long even if the class size is
    # odd; Dataset.from_dict needs every column to have the same length.
    'positive': texts[0][len_1:2 * len_1],
    'negative': texts[4][0:len_1]
})

In [12]:
# Same 1-star anchors and positives as `train_dataset_1_first`, but the negatives
# are the last `len_1` 5-star sentences instead of the first ones.
train_dataset_1_second = Dataset.from_dict({
    'anchor': texts[0][0:len_1],
    # Upper bound keeps this column exactly `len_1` long even if the class size is
    # odd; Dataset.from_dict needs every column to have the same length.
    'positive': texts[0][len_1:2 * len_1],
    'negative': texts[4][-len_1:]
})

In [13]:
# Sanity check: show the dataset summary (feature names and number of rows).
train_dataset_1_second

Dataset({
    features: ['anchor', 'positive', 'negative'],
    num_rows: 546
})

### 2.2. August 10: Telling the model to separate 3-star ratings from extreme ratings: 1-star ratings and 5-star ratings

In [8]:
# Triplet dataset (anchor, positive, negative) for rating 5 against rating 3:
#   anchor   -> first half of the 5-star sentences
#   positive -> second half of the 5-star sentences
#   negative -> the first `len_5` 3-star sentences

len_5 = len(texts[4]) // 2 # 644

train_dataset_5_from_3_first = Dataset.from_dict({
    'anchor': texts[4][0:len_5],
    # Upper bound keeps this column exactly `len_5` long even if the class size is
    # odd; Dataset.from_dict needs every column to have the same length.
    'positive': texts[4][len_5:2 * len_5],
    'negative': texts[2][0:len_5] # index 2 means 3-star ratings
})

In [9]:
# Same 5-star anchors and positives as `train_dataset_5_from_3_first`, but the
# negatives are the last `len_5` 3-star sentences instead of the first ones.
train_dataset_5_from_3_second = Dataset.from_dict({
    'anchor': texts[4][0:len_5],
    # Upper bound keeps this column exactly `len_5` long even if the class size is
    # odd; Dataset.from_dict needs every column to have the same length.
    'positive': texts[4][len_5:2 * len_5],
    'negative': texts[2][-len_5:] # index 2 means 3-star ratings
})

In [10]:
# Triplet dataset (anchor, positive, negative) for rating 1 against rating 3:
#   anchor   -> first half of the 1-star sentences
#   positive -> second half of the 1-star sentences
#   negative -> the first `len_1` 3-star sentences

len_1 = len(texts[0]) // 2 # 546

train_dataset_1_from_3_first = Dataset.from_dict({
    'anchor': texts[0][0:len_1],
    # Upper bound keeps this column exactly `len_1` long even if the class size is
    # odd; Dataset.from_dict needs every column to have the same length.
    'positive': texts[0][len_1:2 * len_1],
    'negative': texts[2][0:len_1] # index 2 means 3-star ratings
})

In [11]:
# Same 1-star anchors and positives as `train_dataset_1_from_3_first`, but the
# negatives are the last `len_1` 3-star sentences instead of the first ones.
train_dataset_1_from_3_second = Dataset.from_dict({
    'anchor': texts[0][0:len_1],
    # Upper bound keeps this column exactly `len_1` long even if the class size is
    # odd; Dataset.from_dict needs every column to have the same length.
    'positive': texts[0][len_1:2 * len_1],
    'negative': texts[2][-len_1:] # index 2 means 3-star ratings
})

In [13]:
# Stack the four "extreme vs. 3-star" triplet datasets (5-star and 1-star anchors)
# into one training set.
# Docs: https://huggingface.co/docs/datasets/v1.3.0/processing.html
# NOTE(review): `train_dataset` is assigned again in section 2.3, so on a top-to-bottom
# run this value is replaced before training starts.
august10_parts = [
    train_dataset_5_from_3_first,
    train_dataset_5_from_3_second,
    train_dataset_1_from_3_first,
    train_dataset_1_from_3_second,
]

train_dataset = concatenate_datasets(august10_parts)

### 2.3. August 11: Telling the model to separate 3-star ratings from the moderate ratings: 2-star ratings and 4-star ratings

In [10]:
# Triplet dataset (anchor, positive, negative) for rating 4 against rating 3:
#   anchor   -> first half of the 4-star sentences
#   positive -> second half of the 4-star sentences
#   negative -> the first `len_4` 3-star sentences
# The 3-star bucket must hold at least `len_4` sentences for the columns to line up
# (true for the bucket sizes printed earlier: 1624 three-star vs. 2322 // 2 = 1161).

len_4 = len(texts[3]) // 2 # index 3 means 4-star ratings

train_dataset_4_from_3_first = Dataset.from_dict({
    'anchor': texts[3][0:len_4],
    # Upper bound keeps this column exactly `len_4` long even if the class size is
    # odd; Dataset.from_dict needs every column to have the same length.
    'positive': texts[3][len_4:2 * len_4],
    'negative': texts[2][0:len_4] # index 2 means 3-star ratings
})

In [12]:
# Same 4-star anchors and positives as `train_dataset_4_from_3_first`, but the
# negatives are the last `len_4` 3-star sentences instead of the first ones.
train_dataset_4_from_3_second = Dataset.from_dict({
    'anchor': texts[3][0:len_4],
    # Upper bound keeps this column exactly `len_4` long even if the class size is
    # odd; Dataset.from_dict needs every column to have the same length.
    'positive': texts[3][len_4:2 * len_4],
    'negative': texts[2][-len_4:] # index 2 means 3-star ratings
})

In [14]:
# Triplet dataset (anchor, positive, negative) for rating 2 against rating 3:
#   anchor   -> first half of the 2-star sentences
#   positive -> second half of the 2-star sentences
#   negative -> the first `len_2` 3-star sentences

len_2 = len(texts[1]) // 2 #  index 1 means 2-star ratings

train_dataset_2_from_3_first = Dataset.from_dict({
    'anchor': texts[1][:len_2],
    # Upper bound keeps this column exactly `len_2` long even if the class size is
    # odd; Dataset.from_dict needs every column to have the same length.
    'positive': texts[1][len_2:2 * len_2],
    'negative': texts[2][:len_2] # index 2 means 3-star ratings
})

In [15]:
# Same 2-star anchors and positives as `train_dataset_2_from_3_first`, but the
# negatives are the last `len_2` 3-star sentences instead of the first ones.
train_dataset_2_from_3_second = Dataset.from_dict({
    'anchor': texts[1][:len_2],
    # Upper bound keeps this column exactly `len_2` long even if the class size is
    # odd; Dataset.from_dict needs every column to have the same length.
    'positive': texts[1][len_2:2 * len_2],
    'negative': texts[2][-len_2:] # index 2 means 3-star ratings
})

In [16]:
# Stack the four "2-/4-star vs. 3-star" triplet datasets into the training set that
# is handed to the trainer further down.
# Docs: https://huggingface.co/docs/datasets/v1.3.0/processing.html
# NOTE: this rebinds `train_dataset`, replacing any value assigned to it earlier.
august11_parts = [
    train_dataset_4_from_3_first,
    train_dataset_4_from_3_second,
    train_dataset_2_from_3_first,
    train_dataset_2_from_3_second,
]

train_dataset = concatenate_datasets(august11_parts)

## $\S3.$ Fine-tuning

In [17]:
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, SentenceTransformerTrainingArguments, losses
from sentence_transformers.training_args import BatchSamplers

# https://www.sbert.net/docs/sentence_transformer/training_overview.html

In [18]:
# Load the starting checkpoint from the local 'fine_tuned_gte_august10' directory.
# NOTE(review): presumably this is the model saved after the August 10 round (section 2.2),
# so this run continues fine-tuning from it — confirm.
model = SentenceTransformer('fine_tuned_gte_august10')

In [19]:
# Inspect the first module of the SentenceTransformer pipeline (the output shows a
# Transformer wrapper around a BertModel). `_first_module` is a private helper.
model._first_module()

Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 

In [20]:
# Grab the underlying Hugging Face model (a BertModel per the output below) so that
# individual parameters can be frozen/unfrozen, then display its module tree.
auto_model = model._first_module().auto_model
auto_model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-23): 24 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inpl

In [21]:
# Inspect the pooler head (output: a 1024 -> 1024 dense layer followed by Tanh).
auto_model.pooler

BertPooler(
  (dense): Linear(in_features=1024, out_features=1024, bias=True)
  (activation): Tanh()
)

In [22]:
# NOTE(review): the method is not called here (no parentheses), so this cell only
# displays the bound method's repr, which embeds the module tree again. The actual
# parameter names are printed by the loop over `auto_model.named_parameters()` below.
auto_model.named_parameters

<bound method Module.named_parameters of BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-23): 24 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
 

In [23]:
# Freeze the whole backbone: switch off gradients for every parameter and
# log each parameter name along the way.
for param_name, tensor in auto_model.named_parameters():
    tensor.requires_grad = False
    print(param_name)

embeddings.word_embeddings.weight
embeddings.position_embeddings.weight
embeddings.token_type_embeddings.weight
embeddings.LayerNorm.weight
embeddings.LayerNorm.bias
encoder.layer.0.attention.self.query.weight
encoder.layer.0.attention.self.query.bias
encoder.layer.0.attention.self.key.weight
encoder.layer.0.attention.self.key.bias
encoder.layer.0.attention.self.value.weight
encoder.layer.0.attention.self.value.bias
encoder.layer.0.attention.output.dense.weight
encoder.layer.0.attention.output.dense.bias
encoder.layer.0.attention.output.LayerNorm.weight
encoder.layer.0.attention.output.LayerNorm.bias
encoder.layer.0.intermediate.dense.weight
encoder.layer.0.intermediate.dense.bias
encoder.layer.0.output.dense.weight
encoder.layer.0.output.dense.bias
encoder.layer.0.output.LayerNorm.weight
encoder.layer.0.output.LayerNorm.bias
encoder.layer.1.attention.self.query.weight
encoder.layer.1.attention.self.query.bias
encoder.layer.1.attention.self.key.weight
encoder.layer.1.attention.self.key

In [24]:
# Re-enable gradients only for the `intermediate.dense` and `output.dense`
# weights and biases of encoder layers 20-23 (the last four of the 24 layers).
# Exact-name matching means `attention.output.dense.*` stays frozen.
trainable_names = {
    f'encoder.layer.{layer_idx}.{sublayer}.dense.{kind}'
    for layer_idx in range(20, 24)
    for sublayer in ('output', 'intermediate')
    for kind in ('weight', 'bias')
}

for name, param in auto_model.named_parameters():
    if name in trainable_names:
        param.requires_grad = True

In [25]:
# Verify the freeze/unfreeze result: list every parameter with its requires_grad flag.
for param_name, tensor in auto_model.named_parameters():
    print(param_name, ": ", tensor.requires_grad)

embeddings.word_embeddings.weight :  False
embeddings.position_embeddings.weight :  False
embeddings.token_type_embeddings.weight :  False
embeddings.LayerNorm.weight :  False
embeddings.LayerNorm.bias :  False
encoder.layer.0.attention.self.query.weight :  False
encoder.layer.0.attention.self.query.bias :  False
encoder.layer.0.attention.self.key.weight :  False
encoder.layer.0.attention.self.key.bias :  False
encoder.layer.0.attention.self.value.weight :  False
encoder.layer.0.attention.self.value.bias :  False
encoder.layer.0.attention.output.dense.weight :  False
encoder.layer.0.attention.output.dense.bias :  False
encoder.layer.0.attention.output.LayerNorm.weight :  False
encoder.layer.0.attention.output.LayerNorm.bias :  False
encoder.layer.0.intermediate.dense.weight :  False
encoder.layer.0.intermediate.dense.bias :  False
encoder.layer.0.output.dense.weight :  False
encoder.layer.0.output.dense.bias :  False
encoder.layer.0.output.LayerNorm.weight :  False
encoder.layer.0.outp

In [26]:
# Ranking loss over (anchor, positive, negative) triplets.
# NOTE(review): per the sentence-transformers docs this loss also treats the other
# in-batch positives as negatives. In `train_dataset` the anchors and positives of a
# rating class come from the same bucket, so same-rating sentences may end up being
# pushed apart — confirm this is intended, or consider a pure triplet loss.
loss = losses.MultipleNegativesRankingLoss(model)

In [27]:
# Training configuration for the SentenceTransformerTrainer.
# https://www.sbert.net/docs/package_reference/sentence_transformer/training_args.html#sentence_transformers.training_args.SentenceTransformerTrainingArguments
# NOTE(review): no evaluation strategy or seed is set explicitly here; library
# defaults apply — confirm that is acceptable for reproducibility.

args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="models/fine_tuned_gte",

    # Optional training parameters:
    num_train_epochs=3, # default 3
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5, # default 5e-5
    warmup_ratio=0.1, # Ratio of total training steps used for a linear warmup from 0 to learning_rate
    fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
    bf16=False,  # Set to True if you have a GPU that supports BF16
    batch_sampler=BatchSamplers.NO_DUPLICATES  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
)

In [28]:
# Wire the model, triplet training set, loss and arguments together.
# No eval_dataset or evaluator is passed, so this call sets up training only.
trainer = SentenceTransformerTrainer(
    model = model,
    train_dataset=train_dataset,
    loss=loss,
    args=args
)

In [29]:
# Run fine-tuning; the recorded output below reports 852 optimizer steps over 3 epochs.
trainer.train()

  0%|          | 0/852 [00:00<?, ?it/s]

{'loss': 3.3433, 'grad_norm': 3.371968984603882, 'learning_rate': 9.26892950391645e-06, 'epoch': 1.76}


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

{'train_runtime': 143.8912, 'train_samples_per_second': 94.655, 'train_steps_per_second': 5.921, 'train_loss': 3.226212908964202, 'epoch': 3.0}


TrainOutput(global_step=852, training_loss=3.226212908964202, metrics={'train_runtime': 143.8912, 'train_samples_per_second': 94.655, 'train_steps_per_second': 5.921, 'train_loss': 3.226212908964202, 'epoch': 3.0})

In [30]:
# Persist the fine-tuned model to a local directory named after this (August 11) round.
trainer.save_model('./fine_tuned_gte_august11')