In [2]:
%load_ext autoreload
%autoreload 2
import re
import string
import time
import unicodedata

import numpy as np
import pandas as pd
import spacy
from datasets import load_from_disk
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.utils import resample

import utils

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kadem\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kadem\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Charger le dataset

### Prétraitement des données

### Tokenisation et préparation des tenseurs

###  Vectorisation avec TF-IDF et entraînement du SVM 

# TESTS

In [3]:
# Load the dataset saved under the local "wikiqa" directory and pull out its
# three splits in one pass.
wikiqa_data = load_from_disk("wikiqa")
train_data_set, validation_data_set, test_data_set = (
    wikiqa_data[split_name] for split_name in ("train", "validation", "test")
)


def preprocess_examples(examples):
    """Apply ``utils.preprocess`` to every question and answer of a batch.

    The batch mapping is updated in place (its 'question' and 'answer'
    entries are replaced by lists of preprocessed values) and then returned.
    """
    for column in ('question', 'answer'):
        cleaned = []
        for text in examples[column]:
            cleaned.append(utils.preprocess(text))
        examples[column] = cleaned
    return examples

# Run the text preprocessing over each split, batch by batch.
train_data_set = train_data_set.map(preprocess_examples, batched=True)
validation_data_set = validation_data_set.map(preprocess_examples, batched=True)
test_data_set = test_data_set.map(preprocess_examples, batched=True)


def _split_to_frame(split):
    """Build a DataFrame holding the question, answer and label columns of a split."""
    return pd.DataFrame({column: split[column] for column in ('question', 'answer', 'label')})


# Convert each split to a DataFrame.
train_df = _split_to_frame(train_data_set)
validation_df = _split_to_frame(validation_data_set)
test_df = _split_to_frame(test_data_set)

Map: 100%|██████████| 20360/20360 [00:00<00:00, 54290.43 examples/s]
Map: 100%|██████████| 2733/2733 [00:00<00:00, 48391.32 examples/s]
Map: 100%|██████████| 6165/6165 [00:00<00:00, 56713.85 examples/s]


In [4]:
def balance_classes(df):
    """Oversample the label-1 rows until they match the number of label-0 rows.

    Rows with label 0 are kept as they are; rows with label 1 are drawn with
    replacement (fixed seed 123) so that both classes end up the same size.
    The result is the label-0 rows followed by the resampled label-1 rows.
    """
    negatives = df.loc[df['label'] == 0]
    positives = df.loc[df['label'] == 1]

    oversampled_positives = resample(
        positives,
        replace=True,                   # draw with replacement
        n_samples=negatives.shape[0],   # match the size of the label-0 class
        random_state=123,               # fixed seed for reproducibility
    )

    return pd.concat([negatives, oversampled_positives])

balanced_train_df, balanced_validation_df, balanced_test_df = (
    balance_classes(split_df) for split_df in (train_df, validation_df, test_df)
)

# Show the per-label row counts of each split after oversampling.
for heading, frame in (
    ("Ensemble de données d'entraînement :", balanced_train_df),
    ("\nEnsemble de données de validation :", balanced_validation_df),
    ("\nEnsemble de données de test :", balanced_test_df),
):
    print(heading)
    print(frame['label'].value_counts())

Ensemble de données d'entraînement :
label
0    19320
1    19320
Name: count, dtype: int64

Ensemble de données de validation :
label
0    2593
1    2593
Name: count, dtype: int64

Ensemble de données de test :
label
0    5872
1    5872
Name: count, dtype: int64


In [5]:
def reduce_and_balance_classes(df, majority_fraction=0.25, random_state=123):
    """Downsample the label-0 rows, then oversample the label-1 rows to match.

    Args:
        df: DataFrame with a 'label' column; rows with label 0 are treated as
            the majority class and rows with label 1 as the minority class.
        majority_fraction: Share of the label-0 rows to keep, in (0, 1].
            The default keeps 25 % of them.
        random_state: Seed passed to both ``resample`` calls so the result is
            reproducible.

    Returns:
        A DataFrame made of the reduced label-0 rows followed by the label-1
        rows drawn with replacement, both classes having the same size.

    Raises:
        ValueError: If ``majority_fraction`` is outside (0, 1].
    """
    if not 0 < majority_fraction <= 1:
        raise ValueError(
            f"majority_fraction must be in (0, 1], got {majority_fraction!r}"
        )

    df_majority = df[df['label'] == 0]
    df_minority = df[df['label'] == 1]

    # Number of label-0 rows kept; the label-1 class is resampled to this size.
    n_kept = int(len(df_majority) * majority_fraction)

    # Shrink the majority class (sampling without replacement).
    df_majority_reduced = resample(
        df_majority,
        replace=False,
        n_samples=n_kept,
        random_state=random_state,
    )

    # Oversample the minority class (sampling with replacement) up to the
    # size of the reduced majority class.
    df_minority_upsampled = resample(
        df_minority,
        replace=True,
        n_samples=n_kept,
        random_state=random_state,
    )

    return pd.concat([df_majority_reduced, df_minority_upsampled])

balanced_train_df = reduce_and_balance_classes(train_df)
balanced_validation_df = reduce_and_balance_classes(validation_df)
balanced_test_df = reduce_and_balance_classes(test_df)

# Show the per-label row counts of each split after reduction and oversampling.
split_reports = [
    ("Ensemble de données d'entraînement :", balanced_train_df),
    ("\nEnsemble de données de validation :", balanced_validation_df),
    ("\nEnsemble de données de test :", balanced_test_df),
]
for heading, frame in split_reports:
    label_counts = frame['label'].value_counts()
    print(heading)
    print(label_counts)

Ensemble de données d'entraînement :
label
0    4830
1    4830
Name: count, dtype: int64

Ensemble de données de validation :
label
0    648
1    648
Name: count, dtype: int64

Ensemble de données de test :
label
0    1468
1    1468
Name: count, dtype: int64


In [6]:
# Column-wise feature extraction: an independent TF-IDF vectorizer is fitted
# on the 'question' column and another on the 'answer' column.
preprocess_pipeline = ColumnTransformer(
    transformers=[
        ('question', TfidfVectorizer(), 'question'),
        ('answer', TfidfVectorizer(), 'answer'),
    ]
)

# probability=True enables predict_proba; its calibration shuffles the data
# internally, so random_state is pinned to keep the probabilities (and the
# ranking metrics computed from them) reproducible across runs.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocess_pipeline),
    ('classifier', SVC(kernel='linear', class_weight='balanced',
                       probability=True, random_state=123)),
])

# Train the model on the balanced training data, timing the fit with a
# monotonic clock so the measurement is not affected by system clock changes.
start_time = time.perf_counter()
pipeline.fit(balanced_train_df[['question', 'answer']], balanced_train_df['label'])
end_time = time.perf_counter()
actual_training_time = end_time - start_time
print(f"Temps d'entraînement réel: {actual_training_time:.2f} secondes")

def compute_metrics(df, predictions, probabilities):
    """Compute MAP, MRR and S@1 over answers grouped by question.

    Args:
        df: DataFrame with 'question' and 'label' columns. Row ``i`` (by
            position, not by index label) must correspond to
            ``probabilities[i]``.
        predictions: Not used by the computation; kept so existing callers
            do not need to change.
        probabilities: 2-D array-like whose column 1 holds the class-1
            probability used as the ranking score.

    Returns:
        Tuple ``(map_score, mrr_score, success_at_1_score)``, each averaged
        over the distinct questions. A question without any label-1 row
        contributes 0 to all three. Returns ``(0.0, 0.0, 0.0)`` when there
        is no question to score.

    Raises:
        ValueError: If ``df`` and ``probabilities`` have different lengths.
    """
    scores = np.asarray(probabilities)[:, 1]  # class-1 probability per row
    if len(scores) != len(df):
        raise ValueError(
            f"df has {len(df)} rows but probabilities has {len(scores)}"
        )
    labels = df['label'].to_numpy()

    # GroupBy.indices yields *positional* row numbers per question. The index
    # labels must not be used here: after resampling/concatenation they are
    # neither contiguous nor unique, so they do not line up with the rows of
    # `probabilities`.
    groups = df.groupby('question').indices
    n = len(groups)
    if n == 0:
        return 0.0, 0.0, 0.0

    ap_sum = 0.0
    rr_sum = 0.0
    success_at_1 = 0
    for positions in groups.values():
        # Rank the group's rows by decreasing score; a stable sort keeps the
        # original order among ties so the result is deterministic.
        order = np.argsort(-scores[positions], kind='stable')
        sorted_labels = labels[positions][order]

        # 1-based ranks at which a correct (label 1) answer appears.
        hit_ranks = np.flatnonzero(sorted_labels == 1) + 1
        if hit_ranks.size:
            # Precision at each hit: k-th correct answer found at rank r -> k / r.
            precisions = np.arange(1, hit_ranks.size + 1) / hit_ranks
            ap_sum += precisions.sum() / hit_ranks.size
            rr_sum += 1.0 / hit_ranks[0]
            if hit_ranks[0] == 1:
                success_at_1 += 1

    map_score = float(ap_sum / n)
    mrr_score = float(rr_sum / n)
    success_at_1_score = success_at_1 / n

    return map_score, mrr_score, success_at_1_score

: 

In [4]:
def evaluate_split(split_name, split_df):
    """Score the fitted ``pipeline`` on one split and print its metrics.

    Args:
        split_name: Label used in the printed header (e.g. "Validation").
        split_df: DataFrame with 'question', 'answer' and 'label' columns.

    Returns:
        dict with the predictions, the class probabilities, the ranking
        metrics (MAP, MRR, S@1) and the classification metrics (accuracy,
        precision, recall, F1) computed on ``split_df``.
    """
    features = split_df[['question', 'answer']]
    labels = split_df['label']
    predictions = pipeline.predict(features)
    probabilities = pipeline.predict_proba(features)

    # The resampled frames keep the index labels of the rows they were drawn
    # from, so hand compute_metrics a frame whose index matches row positions
    # in `probabilities`.
    map_score, mrr_score, success_at_1_score = compute_metrics(
        split_df.reset_index(drop=True), predictions, probabilities
    )
    print(f"\n{split_name} Results:\nMAP: {map_score}\nMRR: {mrr_score}\nS@1: {success_at_1_score}\n")

    # Standard classification metrics.
    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions)
    recall = recall_score(labels, predictions)
    f1 = f1_score(labels, predictions)

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

    return {
        'predictions': predictions,
        'probabilities': probabilities,
        'map': map_score,
        'mrr': mrr_score,
        'success_at_1': success_at_1_score,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }


# NOTE(review): both splits scored here are the resampled ones, in which
# label-1 rows are drawn with replacement; confirm that reporting metrics on
# resampled (rather than original) validation/test data is intended.

# Evaluate the model on the validation data.
validation_results = evaluate_split("Validation", balanced_validation_df)
validation_predictions = validation_results['predictions']
validation_probabilities = validation_results['probabilities']
validation_accuracy = validation_results['accuracy']
validation_precision = validation_results['precision']
validation_recall = validation_results['recall']
validation_f1 = validation_results['f1']

# Evaluate the model on the test data.
test_results = evaluate_split("Test", balanced_test_df)
test_predictions = test_results['predictions']
test_probabilities = test_results['probabilities']
test_accuracy = test_results['accuracy']
test_precision = test_results['precision']
test_recall = test_results['recall']
test_f1 = test_results['f1']

# As before, the ranking-metric names end up holding the test-split values.
map_score = test_results['map']
mrr_score = test_results['mrr']
success_at_1_score = test_results['success_at_1']

NameError: name 'pipeline' is not defined