# In [None]:
import torch
from transformers import XLMRobertaTokenizerFast, XLMRobertaForSequenceClassification
from transformers import AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import pandas as pd
import nltk
nltk.download('punkt')

# Text preprocessing helper and dataset class

def encode_emojis(sent):
    """Tokenize ``sent`` with NLTK and print the resulting token list.

    Despite its name, this helper performs no emoji handling: it only
    prints the tokens and implicitly returns ``None``.

    Args:
        sent: Text handed to ``nltk.word_tokenize``.
    """
    print(nltk.word_tokenize(sent))


class MyDataset(Dataset):
    """Torch dataset pairing raw texts with integer-encoded class labels.

    Each item is tokenized on access, padded/truncated to ``max_len`` and
    returned as a dict with ``input_ids``, ``attention_mask`` and ``labels``.

    Args:
        texts: Iterable of input texts (list, pandas Series, ...).
        labels: Iterable of class labels aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, add_special_tokens=...,
            truncation=..., max_length=..., padding=..., return_tensors='pt')``
            that returns a mapping with ``input_ids`` and ``attention_mask``
            tensors.
        max_len: Length every encoded sequence is padded/truncated to.
        label_encoder: Optional already-fitted encoder exposing
            ``transform``. When given it is reused instead of fitting a new
            ``LabelEncoder``, so several datasets (e.g. train/validation)
            share one label-to-id mapping.
    """

    def __init__(self, texts, labels, tokenizer, max_len=240, label_encoder=None):
        super().__init__()
        # Materialize the texts so __getitem__ indexes by position. Indexing
        # a pandas Series with an int is a label lookup, which fails (or hits
        # the wrong row) once the Series no longer has a default RangeIndex.
        self.texts = list(texts)
        self.labels = labels
        if label_encoder is None:
            self.label_encoder = LabelEncoder()
            self.numerical_labels = self.label_encoder.fit_transform(labels)
        else:
            self.label_encoder = label_encoder
            self.numerical_labels = label_encoder.transform(labels)

        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx`` and return model inputs.

        Returns:
            dict with flattened ``input_ids`` and ``attention_mask`` tensors
            and a scalar ``labels`` tensor of dtype ``torch.long``.
        """
        encoding = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_tensors='pt'
        )
        item = {
            # flatten() collapses the tokenizer output to 1-D so the
            # DataLoader can stack items along a new batch dimension.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.numerical_labels[idx], dtype=torch.long)
        }
        return item

# Load the tokenizer and the training data
tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")
data = pd.read_excel("Train.xlsx")
# NOTE: encode_emojis only prints tokens and returns None, so enabling the
# line below would replace every comment with None.
#data['Comments']=data['Comments'].map(encode_emojis)

# Define your training dataset and dataloader
train_texts = data['Comments']  # Your training text data
train_labels = data['Sentiment']  # Your training labels
train_dataset = MyDataset(train_texts, train_labels, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Size the classification head from the classes actually present in the data
# (instead of a hard-coded count) and store the id <-> label mapping in the
# model config so predictions from the saved model can be decoded later.
class_names = [str(name) for name in train_dataset.label_encoder.classes_]
id2label = dict(enumerate(class_names))
label2id = {name: idx for idx, name in id2label.items()}

# Load the pre-trained XLM-RoBERTa model
model = XLMRobertaForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=len(class_names),
    id2label=id2label,
    label2id=label2id,
)

# Define optimizer and training parameters
# NOTE(review): lr=2e-6 is kept from the original script; confirm it is
# intended, as it is a small step size for only three epochs.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-6)
num_epochs = 3

# Fine-tuning loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    progress = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}")
    for batch in progress:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        # Passing labels makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Show the current batch loss on the progress bar.
        progress.set_postfix(loss=f"{loss.item():.4f}")

# Save the fine-tuned model
# The same path is passed for both the model and the tokenizer.
model.save_pretrained("xlm-roberta2_model.pt")
tokenizer.save_pretrained("xlm-roberta2_model.pt")