In [32]:
%reload_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import unicodedata
import string
import re
import spacy
from datasets import load_from_disk
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils import resample
from utils import preprocess
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report



### Chargement des des données et Prétraitement des données

In [37]:
wikiqa_data = load_from_disk("wikiqa")
test_data_set = wikiqa_data["test"]
train_data_set = wikiqa_data["train"]
validation_data_set = wikiqa_data["validation"]


def preprocess_examples(examples):
    examples['question'] = [preprocess(q) for q in examples['question']]
    examples['answer'] = [preprocess(a) for a in examples['answer']]
    return examples

train_data_set = train_data_set.map(preprocess_examples, batched=True)
validation_data_set = validation_data_set.map(preprocess_examples, batched=True)
test_data_set = test_data_set.map(preprocess_examples, batched=True)

# Convertir en DataFrame
train_df = pd.DataFrame({
    'question': train_data_set['question'],
    'answer': train_data_set['answer'],
    'label': train_data_set['label']
})

validation_df = pd.DataFrame({
    'question': validation_data_set['question'],
    'answer': validation_data_set['answer'],
    'label': validation_data_set['label']
})

test_df = pd.DataFrame({
    'question': test_data_set['question'],
    'answer': test_data_set['answer'],
    'label': test_data_set['label']
})

In [38]:
# Afficher les statistiques des ensembles de données après suréchantillonnage
print("Ensemble de données d'entraînement :")
print(train_df['label'].value_counts())
print("\nEnsemble de données de validation :")
print(validation_df['label'].value_counts())
print("\nEnsemble de données de test :")
print(test_df['label'].value_counts())

Ensemble de données d'entraînement :
label
0    19320
1     1040
Name: count, dtype: int64

Ensemble de données de validation :
label
0    2593
1     140
Name: count, dtype: int64

Ensemble de données de test :
label
0    5872
1     293
Name: count, dtype: int64


In [39]:
# Supprimer les questions sans réponse pertinente
def filter_non_relevant(df):
    relevant_questions = df[df['label'] == 1]['question'].unique()
    return df[df['question'].isin(relevant_questions)]

train_df = filter_non_relevant(train_df)
validation_df = filter_non_relevant(validation_df)
test_df = filter_non_relevant(test_df)

In [40]:
# Afficher les statistiques des ensembles de données après suréchantillonnage
print("Ensemble de données d'entraînement :")
print(train_df['label'].value_counts())
print("\nEnsemble de données de validation :")
print(validation_df['label'].value_counts())
print("\nEnsemble de données de test :")
print(test_df['label'].value_counts())

Ensemble de données d'entraînement :
label
0    7645
1    1040
Name: count, dtype: int64

Ensemble de données de validation :
label
0    990
1    140
Name: count, dtype: int64

Ensemble de données de test :
label
0    2058
1     293
Name: count, dtype: int64


In [41]:
# Regrouper les réponses et les labels pour chaque question
def group_answers(df):
    grouped = df.groupby('question').agg(list).reset_index()
    return grouped

train_grouped = group_answers(train_df)
validation_grouped = group_answers(validation_df)
test_grouped = group_answers(test_df)

In [45]:
train_grouped

Unnamed: 0,question,answer,label
0,how a rocket engine works,[rs being tested at nasas stennis space center...,"[0, 0, 0, 1, 0, 0, 0, 0]"
1,how are aircraft radial engines built,"[radial engine timing and cam mechanism, click...","[0, 0, 0, 0, 1, 0, 0]"
2,how are cholera and typhus transmitted and pre...,[cholera is an infection in the small intestin...,"[0, 0, 1, 0, 0, 0, 0, 0, 0]"
3,how are glacier caves formed,[a partly submerged glacier cave on perito mor...,"[0, 0, 0, 1, 0]"
4,how are public schools funded,[state schools also known as public schools or...,"[1, 1]"
...,...,...,...
866,who wrote the song cocaine,[cocaine is a song written and recorded by jj ...,"[1, 0, 0, 0, 0]"
867,who wrote the song feelin alright,[feelin alright also known as feeling alright ...,"[1, 0, 0, 0, 0, 0]"
868,who wrote the song in the mood,[in the mood is a big band era hit recorded by...,"[1, 0, 0]"
869,who wrote whats my name rihanna,[whats my name is a song recorded by barbadian...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"


### Conversion des questions et réponses en Word2Vec

In [46]:
import gensim.downloader as api

model = api.load("word2vec-google-news-300")

def embed_sentence(sentence):
    words = sentence.split()
    word_vecs = [model[word] for word in words if word in model]
    return np.mean(word_vecs, axis=0) if word_vecs else np.zeros(model.vector_size)

train_grouped['question_vec'] = train_grouped['question'].apply(embed_sentence)
train_grouped['answer_vecs'] = train_grouped['answer'].apply(lambda answers: [embed_sentence(a) for a in answers])

validation_grouped['question_vec'] = validation_grouped['question'].apply(embed_sentence)
validation_grouped['answer_vecs'] = validation_grouped['answer'].apply(lambda answers: [embed_sentence(a) for a in answers])

test_grouped['question_vec'] = test_grouped['question'].apply(embed_sentence)
test_grouped['answer_vecs'] = test_grouped['answer'].apply(lambda answers: [embed_sentence(a) for a in answers])

### Définition du dataset 

In [25]:
class QARankingDataset(Dataset):
    def __init__(self, df):
        self.questions = np.stack(df['question_vec'].values)
        self.answers = np.stack(df['answer_vec'].values)
        self.labels = df['label'].values
        self.scaler = StandardScaler()
        
        self.features = np.hstack([
            self.questions,
            self.answers
        ])

        self.features = self.scaler.fit_transform(self.features)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        question_feature = torch.tensor(self.features[idx][:len(self.features[idx])//2], dtype=torch.float32)
        answer_feature = torch.tensor(self.features[idx][len(self.features[idx])//2:], dtype=torch.float32)
        label = torch.tensor(self.labels[idx], dtype=torch.float32)
        return question_feature, answer_feature, label


### Définition du modèle SVM pour le ranking

In [29]:
class RankSVM(nn.Module):
    def __init__(self, input_dim):
        super(RankSVM, self).__init__()
        self.fc1 = nn.Linear(input_dim * 2, 256)  # input_dim * 2 car on concatène question et réponse
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 1)
        self.dropout = nn.Dropout(0.5)

    def forward(self, question, answer):
        combined = torch.cat((question, answer), dim=1)
        x = F.relu(self.fc1(combined))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        score = self.fc3(x)
        return score

### Entraînement et évaluation du modèle

In [27]:
def train_ranking_model(model, train_loader, validation_loader, criterion, optimizer, num_epochs=10, patience=5):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    best_loss = float('inf')
    trigger_times = 0
    
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for question, answer, labels in train_loader:
            question, answer, labels = question.to(device), answer.to(device), labels.to(device)
            
            optimizer.zero_grad()
            scores = model(question, answer)
            loss = criterion(scores, labels.unsqueeze(1))
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
        
        print(f"Epoch [{epoch+1}], Loss: {running_loss/len(train_loader):.4f}")
        
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for question, answer, labels in validation_loader:
                question, answer, labels = question.to(device), answer.to(device), labels.to(device)
                scores = model(question, answer)
                loss = criterion(scores, labels.unsqueeze(1))
                val_loss += loss.item()
        
        val_loss /= len(validation_loader)
        print(f"Validation Loss: {val_loss:.4f}")
        
        if val_loss < best_loss:
            best_loss = val_loss
            trigger_times = 0
            torch.save(model.state_dict(), 'best_rank_model.pth')
        else:
            trigger_times += 1
            print(f"Trigger Times: {trigger_times}")
            
            if trigger_times >= patience:
                print('Early stopping!')
                return

In [30]:
# Préparation des données
train_dataset = QARankingDataset(train_df)
validation_dataset = QARankingDataset(validation_df)
test_dataset = QARankingDataset(test_df)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Initialisation du modèle
input_dim = train_dataset.features.shape[1] // 2  # Divisé par 2 car nous concaténons les questions et les réponses
model = RankSVM(input_dim)

# Définition de la perte et de l'optimiseur
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Entraînement du modèle
train_ranking_model(model, train_loader, validation_loader, criterion, optimizer, num_epochs=10, patience=5)

# Charger le meilleur modèle sauvegardé
model.load_state_dict(torch.load('best_rank_model.pth'))

Epoch [1], Loss: 0.1131
Validation Loss: 0.1038
Epoch [2], Loss: 0.1014
Validation Loss: 0.1065
Trigger Times: 1
Epoch [3], Loss: 0.0972
Validation Loss: 0.1012
Epoch [4], Loss: 0.0941
Validation Loss: 0.1016
Trigger Times: 1
Epoch [5], Loss: 0.0896
Validation Loss: 0.1037
Trigger Times: 2
Epoch [6], Loss: 0.0885
Validation Loss: 0.1005
Epoch [7], Loss: 0.0858
Validation Loss: 0.0998
Epoch [8], Loss: 0.0840
Validation Loss: 0.1045
Trigger Times: 1
Epoch [9], Loss: 0.0795
Validation Loss: 0.1030
Trigger Times: 2
Epoch [10], Loss: 0.0769
Validation Loss: 0.1026
Trigger Times: 3


<All keys matched successfully>

In [33]:
def compute_ranking_metrics(labels, scores):
    from sklearn.metrics import average_precision_score, label_ranking_average_precision_score
    from scipy.stats import rankdata
    
    # Assuming each label is associated with a unique question ID
    unique_questions = np.unique(labels)
    map_scores = []
    mrr_scores = []
    success_at_1_scores = []
    
    for question in unique_questions:
        question_indices = np.where(labels == question)[0]
        question_scores = scores[question_indices]
        question_labels = labels[question_indices]
        
        ap = average_precision_score(question_labels, question_scores)
        map_scores.append(ap)
        
        # Calculate MRR
        sorted_indices = np.argsort(-question_scores)
        sorted_labels = question_labels[sorted_indices]
        ranks = rankdata(-sorted_labels, method='max')
        mrr = 1.0 / ranks[0]
        mrr_scores.append(mrr)
        
        # Calculate Success@1
        success_at_1 = 1 if sorted_labels[0] == 1 else 0
        success_at_1_scores.append(success_at_1)
    
    map_score = np.mean(map_scores)
    mrr_score = np.mean(mrr_scores)
    success_at_1_score = np.mean(success_at_1_scores)
    
    return map_score, mrr_score, success_at_1_score

def evaluate_model(model, test_loader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    
    all_scores = []
    all_labels = []
    
    with torch.no_grad():
        for question, answer, labels in test_loader:
            question, answer, labels = question.to(device), answer.to(device), labels.to(device)
            scores = model(question, answer)
            all_scores.extend(scores.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    all_scores = np.array(all_scores).flatten()
    all_labels = np.array(all_labels)
    
    # Calculate ranking metrics such as MAP, MRR, and Success@1
    map_score, mrr_score, success_at_1_score = compute_ranking_metrics(all_labels, all_scores)
    print(f"\nTest Results:\nMAP: {map_score}\nMRR: {mrr_score}\nSuccess@1: {success_at_1_score}\n")



# Évaluer le modèle sur les données de test
evaluate_model(model, test_loader)


Test Results:
MAP: 0.5
MRR: 0.0019494389662252029
Success@1: 0.5





In [34]:
# Entraînement du modèle
train_ranking_model(model, train_loader, validation_loader, criterion, optimizer, num_epochs=10, patience=5)

# Charger le meilleur modèle sauvegardé
model.load_state_dict(torch.load('best_rank_model.pth'))

# Évaluer le modèle sur les données de test
evaluate_model(model, test_loader)


Epoch [1], Loss: 0.0825
Validation Loss: 0.1006
Epoch [2], Loss: 0.0802
Validation Loss: 0.1012
Trigger Times: 1
Epoch [3], Loss: 0.0775
Validation Loss: 0.1007
Trigger Times: 2
Epoch [4], Loss: 0.0738
Validation Loss: 0.1023
Trigger Times: 3
Epoch [5], Loss: 0.0722
Validation Loss: 0.1050
Trigger Times: 4
Epoch [6], Loss: 0.0700
Validation Loss: 0.1063
Trigger Times: 5
Early stopping!

Test Results:
MAP: 0.5
MRR: 0.0019494389662252029
Success@1: 0.5



