In [None]:
import pandas as pd
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import torch.nn as nn
import torch.optim as optim
import random
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Reproducibility: seed every RNG this notebook imports, and disable cuDNN
# autotuning, which can pick different kernels from one run to the next.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Load the CSV file
df = pd.read_csv('cleaned_dataset.csv')
print("Colonnes disponibles :", df.columns)

# Tokenizer
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

# Data preprocessing
def preprocess_data(df, vocab, tokenizer):
    """Convert the dataframe into tensors of token ids and labels.

    Every entry of ``df['text']`` is tokenized with ``tokenizer`` and mapped
    through ``vocab`` into a 1-D ``torch.long`` tensor. ``df['target']`` is
    converted into a single ``torch.float`` tensor.

    Returns:
        tuple: ``(texts, labels)`` where ``texts`` is a list with one id tensor
        per row (lengths vary) and ``labels`` is the label tensor.
    """
    encoded = []
    for raw_text in df['text']:
        token_ids = vocab(tokenizer(raw_text))
        encoded.append(torch.tensor(token_ids, dtype=torch.long))

    # The 'target' column is used as the label
    targets = torch.tensor(df['target'].values, dtype=torch.float)
    return encoded, targets

# Vocabulary construction
def build_vocab_from_df(df, tokenizer):
    """Build a torchtext vocabulary from the ``text`` column of ``df``.

    Two special tokens are reserved ahead of the corpus tokens:

    * ``"<pad>"`` at index 0, so that sequences padded with zeros refer to a
      dedicated padding entry instead of sharing an index with unknown words;
    * ``"<unk>"`` at index 1, used as the default index for out-of-vocabulary
      tokens.

    Returns:
        The vocabulary object, with its default index set to ``"<unk>"``.
    """
    # Lazily tokenize each text; the builder consumes the generator once.
    token_stream = (tokenizer(text) for text in df['text'])
    vocab = build_vocab_from_iterator(token_stream, specials=["<pad>", "<unk>"])
    vocab.set_default_index(vocab["<unk>"])
    return vocab

# Build the vocabulary.
# NOTE(review): it is built from the full dataframe, before the split below,
# so validation/test tokens are part of the vocabulary.
vocab = build_vocab_from_df(df, tokenizer)

# Encode every row as a tensor of token ids, together with the label tensor
texts, labels = preprocess_data(df, vocab, tokenizer)

# Show the vocabulary size and one encoded example
print(f"Taille du vocabulaire: {len(vocab)}")
print(f"Exemple de texte tokenisé : {texts[0]}")

# Two-stage split: hold out 20% as the test set, then 25% of the remainder as
# the validation set (60/20/20 overall)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=SEED
)
train_texts, valid_texts, train_labels, valid_labels = train_test_split(
    train_texts, train_labels, test_size=0.25, random_state=SEED
)

# Pad (or truncate) token-id sequences to a fixed length
def pad_sequence(texts, max_len, pad_value=0):
    """Return a list where every 1-D tensor has exactly ``max_len`` elements.

    Sequences shorter than ``max_len`` are right-padded with ``pad_value``;
    longer ones are truncated. The padding is created with the same dtype and
    device as the sequence, so integer token ids stay integer (concatenating
    with a default float ``torch.zeros`` would promote the result to float).

    Args:
        texts: iterable of 1-D tensors.
        max_len: target length of every returned tensor.
        pad_value: value used for padding positions (default 0).

    Returns:
        list of 1-D tensors, each of length ``max_len``.
    """
    padded = []
    for text in texts:
        text = text[:max_len]
        missing = max_len - len(text)
        if missing > 0:
            filler = text.new_full((missing,), pad_value)
            text = torch.cat([text, filler])
        padded.append(text)
    return padded

# Maximum sequence length, taken from the training split
MAX_LEN = max(len(text) for text in train_texts)

# Pad / truncate every split to MAX_LEN
train_texts = pad_sequence(train_texts, MAX_LEN)
valid_texts = pad_sequence(valid_texts, MAX_LEN)
test_texts = pad_sequence(test_texts, MAX_LEN)

# Build the datasets and DataLoaders
train_data = TensorDataset(torch.stack(train_texts), train_labels)
valid_data = TensorDataset(torch.stack(valid_texts), valid_labels)
test_data = TensorDataset(torch.stack(test_texts), test_labels)

BATCH_SIZE = 64

# Only the training loader is shuffled
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE)

# RNN model
class RNN(nn.Module):
    """Embedding -> (stacked, optionally bidirectional) LSTM -> linear head.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of outputs of the final linear layer.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout: dropout probability applied to the embeddings, between LSTM
            layers (when there is more than one) and to the final hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Inter-layer dropout has no effect with a single layer and only
        # triggers a warning, so it is disabled in that case.
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout if n_layers > 1 else 0.0)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, lengths=None):
        """Compute one output row per sequence of the batch.

        Args:
            text: 2-D batch of token ids, one row per sequence. Any numeric
                dtype is accepted; it is cast to ``long`` before the lookup.
            lengths: optional true (unpadded) length of each row. When omitted,
                every row is treated as full width, i.e. padding positions are
                fed to the LSTM like real tokens.

        Returns:
            Tensor with one row per sequence and ``output_dim`` columns.
        """
        text = text.long()  # padded batches may arrive as float
        embedded = self.dropout(self.embedding(text))

        if lengths is None:
            lengths = torch.full((text.size(0),), text.size(1), dtype=torch.long)
        else:
            # pack_padded_sequence requires the lengths on the CPU
            lengths = torch.as_tensor(lengths, dtype=torch.long).cpu()

        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, (hidden, _) = self.rnn(packed_embedded)

        # Take the last layer's final state: both directions when
        # bidirectional, otherwise the single forward state.
        if self.rnn.bidirectional:
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            hidden = hidden[-1, :, :]

        # Dropout is applied in both cases before the linear head
        return self.fc(self.dropout(hidden))

# Hyperparameters
INPUT_DIM = len(vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 3  # increased number of layers
BIDIRECTIONAL = True
DROPOUT = 0.5
LEARNING_RATE = 0.001

# Select the compute device once and reuse it for the model and the loss
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model instantiation
model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)
model = model.to(device)

# Optimizer and loss function (the model outputs raw logits, hence BCEWithLogitsLoss)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

# Early stopping
class EarlyStopping:
    """Track the validation loss, checkpoint the best model and flag when to stop.

    Args:
        patience: number of consecutive non-improving calls after which
            ``early_stop`` becomes True.
        delta: minimum amount by which the loss must drop below the best value
            seen so far to count as an improvement.
        path: file the best model's ``state_dict`` is written to.
    """

    def __init__(self, patience=5, delta=0, path='best_model.pt'):
        self.patience = patience
        self.delta = delta
        self.path = path
        self.counter = 0                # consecutive calls without improvement
        self.best_loss = float('inf')   # lowest validation loss seen so far
        self.early_stop = False

    def __call__(self, val_loss, model):
        """Register one validation loss and save ``model`` if it improved."""
        if val_loss < self.best_loss - self.delta:
            self.best_loss = val_loss
            self.counter = 0
            torch.save(model.state_dict(), self.path)
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

# Training function
def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return the mean loss per batch.

    Each batch is moved to the device holding the model's parameters before
    the forward pass, so a model placed on the GPU can be fed from a loader
    that yields CPU tensors.

    Args:
        model: module whose output has a size-1 second dimension.
        iterator: sized iterable of ``(text, label)`` batches.
        optimizer: optimizer updating ``model``'s parameters.
        criterion: loss called as ``criterion(predictions, label)``.

    Returns:
        float: sum of the batch losses divided by ``len(iterator)``.
    """
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    epoch_loss = 0
    model.train()
    for text, label in iterator:
        if device is not None:
            text, label = text.to(device), label.to(device)
        optimizer.zero_grad()
        predictions = model(text).squeeze(1)
        loss = criterion(predictions, label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

# Evaluation function
def evaluate(model, iterator, criterion):
    """Return the mean loss per batch over ``iterator`` without updating the model.

    The model is switched to eval mode and gradients are disabled. Each batch
    is moved to the device holding the model's parameters before the forward
    pass, so a model placed on the GPU can be fed from a loader that yields
    CPU tensors.

    Args:
        model: module whose output has a size-1 second dimension.
        iterator: sized iterable of ``(text, label)`` batches.
        criterion: loss called as ``criterion(predictions, label)``.

    Returns:
        float: sum of the batch losses divided by ``len(iterator)``.
    """
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    epoch_loss = 0
    model.eval()
    with torch.no_grad():
        for text, label in iterator:
            if device is not None:
                text, label = text.to(device), label.to(device)
            predictions = model(text).squeeze(1)
            loss = criterion(predictions, label)
            epoch_loss += loss.item()
    return epoch_loss / len(iterator)

# Accuracy evaluation
def accuracy(model, iterator):
    """Return the fraction of samples whose rounded sigmoid output equals the label.

    The model is switched to eval mode and gradients are disabled. Each batch
    is moved to the device holding the model's parameters before the forward
    pass. Matches are counted directly on the tensors, which for 0/1 labels is
    the same ratio as a plain accuracy score.

    Args:
        model: module returning one logit per sample (size-1 second dimension).
        iterator: iterable of ``(text, label)`` batches with 0/1 labels.

    Returns:
        float: accuracy in [0, 1], or NaN when the iterator yields no sample.
    """
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    model.eval()
    n_correct = 0
    n_total = 0
    with torch.no_grad():
        for text, label in iterator:
            if device is not None:
                text, label = text.to(device), label.to(device)
            predictions = model(text).squeeze(1)
            preds = torch.round(torch.sigmoid(predictions))
            n_correct += (preds == label).sum().item()
            n_total += label.numel()
    return n_correct / n_total if n_total else float('nan')

# Train the model with early stopping
N_EPOCHS = 10  # increased number of epochs
early_stopping = EarlyStopping(patience=5)

for epoch in range(N_EPOCHS):
    train_loss = train(model, train_loader, optimizer, criterion)
    valid_loss = evaluate(model, valid_loader, criterion)
    valid_acc = accuracy(model, valid_loader)

    print(f"Epoch: {epoch+1}")
    print(f"\tTrain Loss: {train_loss:.3f}")
    print(f"\tVal. Loss: {valid_loss:.3f}")
    print(f"\tVal. Accuracy: {valid_acc*100:.2f}%")

    # Early-stopping check; this also checkpoints the model when the
    # validation loss improves
    early_stopping(valid_loss, model)
    if early_stopping.early_stop:
        print("Early stopping triggered!")
        break

# Reload the best checkpoint, mapping its tensors onto the device the model
# currently lives on so the load also works when that differs from the device
# the checkpoint was saved from
best_state = torch.load('best_model.pt', map_location=next(model.parameters()).device)
model.load_state_dict(best_state)



Colonnes disponibles : Index(['target', 'text'], dtype='object')
Taille du vocabulaire: 62349
Exemple de texte tokenisé : tensor([ 527,   16,   41,    4, 1329,    7, 3977,   44, 1427, 5804,   13, 1872,
          33,    2,   20,    9,  151])
Epoch: 1
	Train Loss: 0.623
	Val. Loss: 0.553
	Val. Accuracy: 72.15%
Epoch: 2
	Train Loss: 0.545
	Val. Loss: 0.504
	Val. Accuracy: 75.37%
Epoch: 3
	Train Loss: 0.506
	Val. Loss: 0.492
	Val. Accuracy: 76.72%
Epoch: 4
	Train Loss: 0.476
	Val. Loss: 0.477
	Val. Accuracy: 77.18%
Epoch: 5
	Train Loss: 0.454
	Val. Loss: 0.477
	Val. Accuracy: 77.81%
Epoch: 6
	Train Loss: 0.434
	Val. Loss: 0.476
	Val. Accuracy: 78.25%
Epoch: 7
	Train Loss: 0.419
	Val. Loss: 0.477
	Val. Accuracy: 77.82%
Epoch: 8
	Train Loss: 0.402
	Val. Loss: 0.483
	Val. Accuracy: 78.34%
Epoch: 9
	Train Loss: 0.389
	Val. Loss: 0.482
	Val. Accuracy: 78.50%
Epoch: 10
	Train Loss: 0.376
	Val. Loss: 0.477
	Val. Accuracy: 78.64%


<All keys matched successfully>

In [None]:
# Evaluation on the test set: mean batch loss and accuracy of the current model
test_loss = evaluate(model, test_loader, criterion)
test_acc = accuracy(model, test_loader)

print(f"Test Loss: {test_loss:.3f}")
print(f"Test Accuracy: {test_acc*100:.2f}%")

Test Loss: 0.466
Test Accuracy: 78.98%


In [None]:
# Single-sentence inference demo.
# NOTE(review): predict_sentiment is not defined in any cell visible here; it
# presumably lives in a cell that was deleted or not exported. Confirm, since
# otherwise this cell raises NameError on Restart & Run All.
sentence = "Fuming with anger after a heated argument."
sentiment, prob = predict_sentiment(model, sentence, vocab, tokenizer, MAX_LEN)

print(f"Phrase: {sentence}")
print(f"Sentiment: {sentiment} (Probabilité: {prob:.2f})")


Phrase: Fuming with anger after a heated argument.
Sentiment: Négatif (Probabilité: 0.52)
