In [1]:
#!pip install datasets transformers
#!pip install torch
#!pip install transformers

In [2]:
import torch
import torch.nn as nn
from transformers import BertTokenizer
from datasets import load_dataset
from torch.utils.data import DataLoader, TensorDataset, random_split

* ## Préparer les données

In [3]:
from datasets import load_dataset

# Charger le dataset
dataset = load_dataset("Tirendaz/fifa-world-cup-2022-tweets")
tweets = dataset['train']['Tweet']
labels = dataset['train']['Sentiment']
tweets

['What are we drinking today @TucanTribe \n@MadBears_ \n@lkinc_algo \n@al_goanna \n\n#WorldCup2022 https://t.co/Oga3TzvG5h',
 'Amazing @CanadaSoccerEN  #WorldCup2022 launch video. Shows how much the face of Canada and our men’s national team have changed since our last World Cup entry in 1986.  Can’t wait to see these boys in action!\n\nThis is Canada: FIFA World Cup Opening Video https://t.co/7g73vvwtg8',
 'Worth reading while watching #WorldCup2022 https://t.co/1SQrNa2dYU',
 'Golden Maknae shinning bright\n\nhttps://t.co/4AyZbzGTX4\n#JeonJungkook #Jungkook #전정국 #정국 #JK #GoldenMaknae #bunny #Kookie #Jungshook #BTS #방탄소년단 #WorldCup2022 #FIFAKOOK \n@BTS_twt',
 'If the BBC cares so much about human rights, homosexual rights, and women rights then why not say these before the opening ceremony?? Why are they saying these during the opening ceremony?? Why did the BBC censor the #WorldCup2022 opening ceremony?? https://t.co/f72P03ZN2k',
 'And like, will the mexican fans be able to scream "PU

In [4]:
# Vérifier et convertir les labels en entiers
label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
labels = [label_map[label] if isinstance(label, str) else label for label in labels]

# Tokenisation
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

max_len = 128
tokens = tokenizer(tweets, padding='max_length', max_length=max_len, truncation=True, return_tensors='pt')

input_ids = tokens['input_ids']
attention_mask = tokens['attention_mask']

# Convertir les labels en tenseurs
labels = torch.tensor(labels, dtype=torch.long)  # Convertir les labels en type long

# Créer le dataset Tensor
dataset_tensor = TensorDataset(input_ids, attention_mask, labels)

# Diviser les données en ensembles d'entraînement et de validation
train_size = int(0.8 * len(dataset_tensor))
val_size = len(dataset_tensor) - train_size

train_dataset, val_dataset = random_split(dataset_tensor, [train_size, val_size])

# Créer les DataLoader
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

* ## Définir le modèle RNN

In [5]:
class SentimentRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size, n_layers, drop_prob=0.5):
        super(SentimentRNN, self).__init__()
        
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # LSTM layers
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        
        # Dropout layer
        self.dropout = nn.Dropout(0.3)
        
        # Linear layer
        self.fc = nn.Linear(hidden_dim, output_size)
        
    def forward(self, x, hidden):
        batch_size = x.size(0)
        
        # Embedding and LSTM
        x = self.embedding(x)
        lstm_out, hidden = self.lstm(x, hidden)
        
        # Stack up LSTM outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        # Dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        
        # Reshape to be batch_size first
        out = out.view(batch_size, -1, self.output_size)
        out = out[:, -1]  # get last batch of labels
        
        return out, hidden
    
    def init_hidden(self, batch_size):
        # Initialize hidden state
        weight = next(self.parameters()).data
        
        if (torch.cuda.is_available()):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden


* ## Entraîner le modèle

In [6]:
import torch.optim as optim

# Définir les hyperparamètres
vocab_size = len(tokenizer.vocab)
output_size = 3  # Pour trois classes : négatif, neutre, positif
embedding_dim = 400
hidden_dim = 256
n_layers = 2

model = SentimentRNN(vocab_size, embedding_dim, hidden_dim, output_size, n_layers)
if torch.cuda.is_available():
    model.cuda()

# Définir la perte et l'optimiseur
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Fonction pour calculer l'exactitude
def accuracy(preds, y):
    _, preds = torch.max(preds, 1)
    correct = (preds == y).float()
    acc = correct.sum() / len(correct)
    return acc

# Entraîner le modèle
epochs = 10
for epoch in range(epochs):
    model.train()
    
    for inputs, masks, labels in train_loader:
        batch_size = inputs.size(0)
        h = model.init_hidden(batch_size)
        
        if torch.cuda.is_available():
            inputs, masks, labels = inputs.cuda(), masks.cuda(), labels.cuda()
        
        h = tuple([e.data for e in h])
        model.zero_grad()
        
        output, h = model(inputs, h)
        loss = criterion(output, labels)
        acc = accuracy(output, labels)
        
        loss.backward()
        optimizer.step()
        
        print(f'Epoch: {epoch+1}/{epochs} | Loss: {loss.item()} | Accuracy: {acc.item()}')
    
    model.eval()
    val_h = model.init_hidden(32)
    val_losses = []
    val_accuracies = []
    
    for inputs, masks, labels in val_loader:
        batch_size = inputs.size(0)
        val_h = model.init_hidden(batch_size)
        
        if torch.cuda.is_available():
            inputs, masks, labels = inputs.cuda(), masks.cuda(), labels.cuda()
        
        val_h = tuple([each.data for each in val_h])
        output, val_h = model(inputs, val_h)
        val_loss = criterion(output, labels)
        val_acc = accuracy(output, labels)
        
        val_losses.append(val_loss.item())
        val_accuracies.append(val_acc.item())
    
    print(f'Validation Loss: {sum(val_losses)/len(val_losses)} | Validation Accuracy: {sum(val_accuracies)/len(val_accuracies)}')


Epoch: 1/10 | Loss: 1.1186219453811646 | Accuracy: 0.21875
Epoch: 1/10 | Loss: 1.1169652938842773 | Accuracy: 0.375
Epoch: 1/10 | Loss: 1.0712236166000366 | Accuracy: 0.375
Epoch: 1/10 | Loss: 1.1042346954345703 | Accuracy: 0.40625
Epoch: 1/10 | Loss: 1.0843594074249268 | Accuracy: 0.46875
Epoch: 1/10 | Loss: 1.1669771671295166 | Accuracy: 0.25
Epoch: 1/10 | Loss: 1.0472966432571411 | Accuracy: 0.5625
Epoch: 1/10 | Loss: 1.0902689695358276 | Accuracy: 0.375
Epoch: 1/10 | Loss: 1.0666286945343018 | Accuracy: 0.4375
Epoch: 1/10 | Loss: 1.1374958753585815 | Accuracy: 0.28125
Epoch: 1/10 | Loss: 1.0791348218917847 | Accuracy: 0.4375
Epoch: 1/10 | Loss: 1.0615944862365723 | Accuracy: 0.4375
Epoch: 1/10 | Loss: 1.1396046876907349 | Accuracy: 0.28125
Epoch: 1/10 | Loss: 1.0993932485580444 | Accuracy: 0.40625
Epoch: 1/10 | Loss: 1.0739153623580933 | Accuracy: 0.375
Epoch: 1/10 | Loss: 1.064639687538147 | Accuracy: 0.5
Epoch: 1/10 | Loss: 1.1303184032440186 | Accuracy: 0.28125
Epoch: 1/10 | Los

* ## Évaluation finale

In [8]:
test_h = model.init_hidden(32)
test_losses = []
test_accuracies = []

for inputs, masks, labels in val_loader:  # Remplacer par le DataLoader de test si disponible
    batch_size = inputs.size(0)
    test_h = model.init_hidden(batch_size)
    
    if torch.cuda.is_available():
        inputs, masks, labels = inputs.cuda(), masks.cuda(), labels.cuda()
    
    test_h = tuple([each.data for each in test_h])
    output, test_h = model(inputs, test_h)
    test_loss = criterion(output, labels)
    test_acc = accuracy(output, labels)
    
    test_losses.append(test_loss.item())
    test_accuracies.append(test_acc.item())

print(f'Test Loss: {sum(test_losses)/len(test_losses)} | Test Accuracy: {sum(test_accuracies)/len(test_accuracies)}')


Test Loss: 0.8510124848244038 | Test Accuracy: 0.6147163121412832
