In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
import torch
from tqdm import tqdm 

In [2]:
df = pd.read_csv('faq_dataset.csv')
df.head(2)

Unnamed: 0,idQuestion,question,answer,label
0,shiq1,Do you offer free shipping?,"Yes, we offer free shipping for orders above a...",shipping
1,shiq2,How long does shipping take?,"Standard shipping takes 5-7 business days, whi...",shipping


In [13]:
len(df)

230

## Data Preparation

In [3]:
X = df['question']
y = df['label']

# Encoder les labels
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(y)
y_encoded = df['label_encoded']

# strati number
k = 5 
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

# Init tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

folds = []
for train_index, val_index in skf.split(X, y_encoded):
    # Extracting data données for current fold 
    train_texts, val_texts = X.iloc[train_index], X.iloc[val_index]
    train_labels, val_labels = y_encoded.iloc[train_index], y_encoded.iloc[val_index]

    # Tokenization
    train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=128)
    val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=128)

    # Sauvegarder le fold
    folds.append((train_encodings, train_labels, val_encodings, val_labels))

## Data Loader

In [4]:
# Creating a class Dataset 
class FAQDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels


    def __len__(self):
        return len(self.labels)

    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels.iloc[idx])
        return item

In [5]:
# eg first fold
train_encodings, train_labels, val_encodings, val_labels = folds[0]

train_dataset = FAQDataset(train_encodings, train_labels)
val_dataset = FAQDataset(val_encodings, val_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)


## setting Training 

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Loading model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_encoder.classes_))
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)


def train_model(train_loader, model, optimizer, device):
    model.train()  # Mode entraînement
    total_loss = 0

    for batch in tqdm(train_loader, desc="Training"):
        optimizer.zero_grad()
        
        # Préparer les données pour le GPU
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        
        # Backward pass
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    
    avg_train_loss = total_loss / len(train_loader)
    return avg_train_loss


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def evaluate_model(val_loader, model, device):
    # Mode evaluation
    model.eval()  
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0

    for batch in tqdm(val_loader, desc="Evaluating"):
        # data preparation for device
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
        labels = batch['labels'].to(device)

        # Forward pass
        with torch.no_grad():
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

        total_loss += loss.item()

        # Calculer les prédictions
        preds = torch.argmax(logits, dim=1)
        correct_predictions += (preds == labels).sum().item()
        total_predictions += labels.size(0)

    avg_val_loss = total_loss / len(val_loader)
    accuracy = correct_predictions / total_predictions
    return avg_val_loss, accuracy


In [9]:
# Boucle sur les folds
for fold, (train_encodings, train_labels, val_encodings, val_labels) in enumerate(folds):
    print(f"\nTraining Fold {fold + 1}/{k}...")

    # Préparer les datasets et loaders
    train_dataset = FAQDataset(train_encodings, train_labels)
    val_dataset = FAQDataset(val_encodings, val_labels)

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32)

    # Entraîner le modèle
    train_loss = train_model(train_loader, model, optimizer, device)
    print(f"Train Loss: {train_loss:.4f}")

    # Évaluer le modèle
    val_loss, val_accuracy = evaluate_model(val_loader, model, device)
    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")



Training Fold 1/5...


Training: 100%|█████████████████████████████████████████████████████████████████| 6/6 [01:53<00:00, 18.96s/it]


Train Loss: 0.5272


Evaluating: 100%|███████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.81it/s]


Validation Loss: 0.3377, Validation Accuracy: 0.9783

Training Fold 2/5...


Training: 100%|█████████████████████████████████████████████████████████████████| 6/6 [00:12<00:00,  2.11s/it]


Train Loss: 0.4551


Evaluating: 100%|███████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.88it/s]


Validation Loss: 0.4145, Validation Accuracy: 0.8913

Training Fold 3/5...


Training: 100%|█████████████████████████████████████████████████████████████████| 6/6 [00:11<00:00,  1.91s/it]


Train Loss: 0.3847


Evaluating: 100%|███████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.98it/s]


Validation Loss: 0.2789, Validation Accuracy: 0.9130

Training Fold 4/5...


Training: 100%|█████████████████████████████████████████████████████████████████| 6/6 [00:11<00:00,  1.92s/it]


Train Loss: 0.3098


Evaluating: 100%|███████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.97it/s]


Validation Loss: 0.2832, Validation Accuracy: 0.9565

Training Fold 5/5...


Training: 100%|█████████████████████████████████████████████████████████████████| 6/6 [00:11<00:00,  1.87s/it]


Train Loss: 0.2814


Evaluating: 100%|███████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.87it/s]

Validation Loss: 0.1550, Validation Accuracy: 0.9783





## Cross validation

In [10]:
# folds results
validation_losses = [0.3377, 0.4145, 0.2789, 0.2832, 0.1550]
validation_accuracies = [0.9783, 0.8913, 0.9130, 0.9565, 0.9783]

# Compute metrics
avg_loss = np.mean(validation_losses)
std_loss = np.std(validation_losses)

avg_accuracy = np.mean(validation_accuracies)
std_accuracy = np.std(validation_accuracies)

# displays results
print("Validation Loss:")
print(f"  Moyenne : {avg_loss:.4f}")
print(f"  Écart-type : {std_loss:.4f}")
print("\nValidation Accuracy:")
print(f"  Moyenne : {avg_accuracy:.4f}")
print(f"  Écart-type : {std_accuracy:.4f}")



Validation Loss:
  Moyenne : 0.2939
  Écart-type : 0.0850

Validation Accuracy:
  Moyenne : 0.9435
  Écart-type : 0.0353


In [14]:
# Sauvegarde du modèle
model.save_pretrained("./model")
tokenizer.save_pretrained("./model")

('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/vocab.txt',
 './model/added_tokens.json')

In [None]:
print(model.parameters())