# Fine tune BERT with my own dataset

[Tutorial](https://huggingface.co/blog/how-to-train-sentence-transformers)

[Posible data 1](https://www.argentina.gob.ar/desarrollosocial/entrevistasyopinion)

[Posible data 2](https://www.reddit.com/r/RepublicaArgentina/)

[Posible data 3](https://www.reddit.com/r/Republica_Argentina/)

# Training Data

### Original Documents

In [5]:
path_to_MyModule = '..'

import sys
sys.path.insert(0, path_to_MyModule) 

import pandas as pd
import matplotlib.pyplot as plt
from time import sleep

from MyModule.GeneralFunctions import *
from MyModule.SummarizationFunctions import *
from MyModule.SummarizationFunctions import MostRepresentativeDocs
from MyModule.SamplingFunctions import *
from MyModule.SemanticSimilarityFunctions import *
from MyModule.TopicModelingFunctinos import *
from MyModule.FineTuningFunctions import *

In [6]:
df = pd.read_excel('..\datos.xlsx')[['ID','texto','desafio']]

In [7]:
# Limpieza
df.drop_duplicates(subset='texto', inplace=True)

# Quitando texto de mas en columna "desafio"
df['desafio'] = df['desafio'].apply(lambda x: re.findall('[0-9]+', x)[0])

# A str
df['texto'] = df['texto'].astype(str)

In [8]:
pp_object = Preprocess(lemma=False, stopwords=False)
documents = df['texto'].values.tolist()
documents = pp_object.preprocess(documents)
documents = [doc for doc in documents if doc != '']

### Reddit Documents

In [None]:
with open('result_pp.txt', 'r') as f:
    documents = f.readlines()

In [None]:
pp_object = Preprocess(lemma=False, stopwords=False)
documents = pp_object.preprocess(documents)

# Load model

In [5]:
model_id = "hiiamsid/sentence_similarity_spanish_es"
my_model = MyFineTunedBert()
my_model.load_model("fine_tuned_bert_1.pt", model_id)

In [31]:
my_model.encode(['Soy de derecha y vos no'])

[array([11.637869  , -0.58177364, -0.912256  , ..., -0.46812254,
        -1.4491892 ,  0.4982587 ], dtype=float32)]

# Train

In [12]:
# Save the fine-tuned model
# torch.save(model.state_dict(), "fine_tuned_bert_1.pt")

In [49]:
import torch
from transformers import BertTokenizer, BertForMaskedLM
from torch.utils.data import DataLoader, Dataset

# Load pre-trained BERT model and tokenizer
model_id = "hiiamsid/sentence_similarity_spanish_es"
tokenizer = BertTokenizer.from_pretrained(model_id)
model = BertForMaskedLM.from_pretrained(model_id)

Some weights of BertForMaskedLM were not initialized from the model checkpoint at hiiamsid/sentence_similarity_spanish_es and are newly initialized: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [50]:
# Define a custom dataset for MLM training
class MLMDataset(Dataset):
    def __init__(self, texts, tokenizer, max_len):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, idx):
        text = self.texts[idx]
        tokens = self.tokenizer.tokenize(text)
        if len(tokens) > self.max_len - 2:
            tokens = tokens[:self.max_len - 2]
        input_ids = self.tokenizer.encode(tokens, add_special_tokens=True)
        # Randomly mask out some tokens
        masked_ids = input_ids.copy()
        for i, token_id in enumerate(masked_ids):
            if token_id != self.tokenizer.cls_token_id and token_id != self.tokenizer.sep_token_id:
                if torch.rand(1) < 0.15:
                    masked_ids[i] = self.tokenizer.mask_token_id
        # Create attention masks and segment IDs
        attention_mask = [1] * len(input_ids)
        segment_ids = [0] * len(input_ids)
        return masked_ids, attention_mask, segment_ids, input_ids

    def __len__(self):
        return len(self.texts)

In [51]:
# Define a collate function to pad the sequences to the same length
def collate_fn(batch):
    masked_ids = torch.nn.utils.rnn.pad_sequence([torch.LongTensor(item[0]) for item in batch], batch_first=True, padding_value=0)
    attention_mask = torch.nn.utils.rnn.pad_sequence([torch.LongTensor(item[1]) for item in batch], batch_first=True, padding_value=0)
    segment_ids = torch.nn.utils.rnn.pad_sequence([torch.LongTensor(item[2]) for item in batch], batch_first=True, padding_value=0)
    input_ids = torch.nn.utils.rnn.pad_sequence([torch.LongTensor(item[3]) for item in batch], batch_first=True, padding_value=0)
    return masked_ids, attention_mask, segment_ids, input_ids

In [52]:
# Create a dataset and dataloader for MLM training
dataset = MLMDataset(documents, tokenizer, max_len=512)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

# Define the optimizer and learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

# Train the model for MLM
total_loss = 0.0
model.train()
for epoch in range(20):
    this_running_loss = 0.0
    for i, batch in enumerate(dataloader):
        masked_ids, attention_mask, segment_ids, input_ids = batch
        outputs = model(masked_ids, attention_mask=attention_mask, token_type_ids=segment_ids, labels=input_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        this_running_loss += loss.item()
        total_loss += this_running_loss
        if i % 5 == 0:
            print('[Epoch %d, Batch %d of %d] this loss: %.3f, total loss: %.3f' % (epoch+1, i+1, len(dataloader), this_running_loss/100, total_loss/(i+1+epoch*len(dataloader))/100))
            this_running_loss = 0.0

[Epoch 1, Batch 1 of 310] this loss: 0.116, total loss: 0.116
[Epoch 1, Batch 6 of 310] this loss: 0.361, total loss: 0.227
[Epoch 1, Batch 11 of 310] this loss: 0.169, total loss: 0.172
[Epoch 1, Batch 16 of 310] this loss: 0.129, total loss: 0.143
[Epoch 1, Batch 21 of 310] this loss: 0.146, total loss: 0.130
[Epoch 1, Batch 26 of 310] this loss: 0.147, total loss: 0.121
[Epoch 1, Batch 31 of 310] this loss: 0.138, total loss: 0.114
[Epoch 1, Batch 36 of 310] this loss: 0.129, total loss: 0.108
[Epoch 1, Batch 41 of 310] this loss: 0.127, total loss: 0.104
[Epoch 1, Batch 46 of 310] this loss: 0.154, total loss: 0.103
[Epoch 1, Batch 51 of 310] this loss: 0.124, total loss: 0.100
[Epoch 1, Batch 56 of 310] this loss: 0.129, total loss: 0.098
[Epoch 1, Batch 61 of 310] this loss: 0.163, total loss: 0.097
[Epoch 1, Batch 66 of 310] this loss: 0.165, total loss: 0.098
[Epoch 1, Batch 71 of 310] this loss: 0.116, total loss: 0.096
[Epoch 1, Batch 76 of 310] this loss: 0.126, total loss: 