In [11]:
import json
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [12]:
# Pour chaquer fichier (dev, test, train), on crée un dictionnaire ayant pr clés question id, doc id...

def data(type): # type peut être dev, test ou train.

    # Define the path to your text document
    file_path = rf'WikiPassageQA\{type}.txt'
    # Initialize an empty dictionary to hold your data
    data_dict = {}
    articles_with_ids = {}

    # Open the text document for reading
    with open(file_path, 'r') as file:
            # Skip the header line
            next(file)
            # Iterate over each line in the file
            for line in file:
                # Split the line into components based on tabs
                parts = line.strip().split('\t')
                # Extract the individual components
                qid, question, doc_id, doc_name, rel_passages = parts
                # Convert the QID to an integer (if you want it as an integer)
                qid = int(qid)
                # Convert DocumentID to an integer (if needed)
                doc_id = int(doc_id)
                # Split 'RelevantPassages' into a list of integers (if they are always numbers)
                rel_passages = [int(x) for x in rel_passages.split(',')]
                # Populate the dictionary
                data_dict[qid] = {
                    'Question': question,
                    'DocumentID': doc_id,
                    'DocumentName': doc_name,
                    'RelevantPassages': rel_passages
                }
                articles_with_ids[doc_id] = doc_name[:-5]
        
    return data_dict, articles_with_ids

# At this point, data_dict contains your data structured as required
# articles_with_ids : dict mapping document IDs to their Wikipedia article titles

data_dict, _ = data("train")

In [13]:
def load_data(filepath):
    with open(filepath, 'r') as file:
        data = json.load(file)
    return data

In [14]:
print(data_dict.keys())
print(data_dict[0])

dict_keys([3086, 195, 557, 1508, 956, 1993, 2260, 2678, 2165, 2079, 1287, 2958, 4002, 1634, 2610, 3381, 3534, 511, 1824, 25, 75, 1435, 1545, 1130, 1372, 458, 2450, 3203, 1831, 4033, 3565, 2938, 2822, 3295, 1205, 1705, 3318, 2497, 892, 3200, 861, 3163, 3047, 3468, 3471, 774, 1317, 3038, 367, 3774, 4185, 3796, 269, 3546, 643, 1096, 3681, 3612, 2106, 3605, 3347, 241, 3091, 3384, 288, 622, 2483, 2707, 1070, 639, 4142, 1510, 3221, 3946, 4179, 2118, 777, 99, 4053, 3055, 3331, 3788, 320, 698, 1076, 2834, 409, 1081, 102, 237, 2525, 3077, 2672, 678, 1537, 2043, 1859, 2088, 719, 3402, 4177, 2844, 463, 2811, 4036, 907, 3044, 3066, 312, 3075, 3230, 2126, 3779, 2594, 1541, 1083, 2414, 853, 1260, 1758, 37, 1901, 826, 605, 4156, 3043, 2942, 92, 3358, 1512, 3847, 3728, 1663, 2812, 2535, 15, 3595, 1371, 1822, 2587, 2712, 4091, 1629, 480, 2975, 2613, 3621, 2061, 1926, 896, 3019, 756, 4129, 219, 2350, 2294, 744, 2772, 1980, 270, 1586, 1422, 3427, 3797, 950, 3597, 1544, 3916, 439, 38, 3694, 1862, 2777, 14

In [48]:
filepath = 'WikiPassageQA/document_passages_tokenized_2.json'
doc_passages = load_data(filepath)

print(doc_passages['267'])

{'56': ['reconstruction', 'needed', 'because', 'of', 'the', 'ongoing', 'civil', 'war', 'will', 'cost', 'a', 'much', 'a', 'u', 'billion', 'sanction', 'have', 'sapped', 'the', 'government', 'finance', 'u', 'and', 'european', 'union', 'ban', 'on', 'oil', 'import', 'which', 'went', 'into', 'effect', 'in', 'are', 'estimated', 'to', 'cost', 'syria', 'about', 'million', 'a', 'month', 'revenue', 'from', 'tourism', 'have', 'dropped', 'dramatically', 'with', 'hotel', 'occupancy', 'rate', 'falling', 'from', 'before', 'the', 'war', 'to', 'le', 'than', 'in', 'may', 'around', 'of', 'all', 'employee', 'in', 'the', 'tourism', 'sector', 'have', 'lost', 'their', 'job', 'since', 'the', 'beginning', 'of', 'the', 'war', 'in', 'may', 'isi', 'captured', 'syria', 'phosphate', 'mine', 'one', 'of', 'the', 'assad', 'regime', 'last', 'chief', 'source', 'of', 'income'], '54': ['poverty', 'rate', 'have', 'increased', 'from', 'in', 'to', 'in', 'in', 'syria', 'main', 'export', 'include', 'crude', 'oil', 'refined', 'p

In [58]:
def shuffle_list(L1, L2):
    # Créer une liste de la même taille que L1, initialisée avec None ou une valeur neutre
    result = [None] * len(L1)
    
    # Assigner chaque élément de L1 à la nouvelle position indiquée par L2
    for original_index, new_index in enumerate(L2):
        result[new_index] = L1[original_index]
    
    return result

Premier test de BM25 directement sur document_passages_tokenized_2.json (donc de la triche, car il faut tester sur notre dictionnaire acquis via scraping (articles wiki décomposés en passages de 6 sentences))

In [57]:
def fetch_passages(doc_id, doc_passages):
    # Récupérer les passages d'un document spécifique
    passages = [doc_passages[doc_id][pid] for pid in doc_passages[doc_id]]
    pid = [int(pid) for pid in doc_passages[doc_id]]
    passages = shuffle_list(passages,pid)
    return passages

def rank_passages(query, passages):
    # Tokeniser la requête si ce n'est pas déjà une liste de mots
    tokenized_query = query.lower().split()

    # Créer l'objet BM25 avec les passages tokenisés
    bm25 = BM25Okapi(passages)

    # Calculer les scores pour la requête
    scores = bm25.get_scores(tokenized_query)
    # print(scores)
    
    # Trier les passages par score en ordre décroissant
    ranked_passages = np.argsort(scores)[::-1]
    
    return ranked_passages, scores

# Supposons que nous traitons la question ID 0 dans data_dict
question_data = data_dict[0]
query = question_data['Question']
doc_id = str(question_data['DocumentID'])  # Assurez-vous que l'ID est en format string si nécessaire

# Récupérer les passages du document spécifique
passages = fetch_passages(doc_id, doc_passages)

# Classement des passages pour la requête
ranked_passages, scores = rank_passages(query, passages)

# Affichage des résultats
for idx in ranked_passages:
    print(f"Passage ID: {idx}, Score: {scores[idx]}, Text: {' '.join(passages[idx])}")


Passage ID: 27, Score: 4.996060310003282, Text: in november a a direct result of the suez crisis syria signed a pact with the soviet union this gave a foothold for communist influence within the government in exchange for military equipment turkey then became worried about this increase in the strength of syrian military technology a it seemed feasible that syria might attempt to retake iskenderun only heated debate in the united nation lessened the threat of war on february syrian president shukri and egypt nasser announced the merging of egypt and syria creating the united arab republic and all syrian political party a well a the communist therein ceased overt activity meanwhile a group of syrian officer alarmed by the party poor position and the increasing fragility of the union decided to form a secret military committee it initial member were muhammad umran major salah jadid and captain hafez
Passage ID: 15, Score: 4.499334523815813, Text: his cousin elagabalus who wa emperor from

Ca marche pas mal, ici les relevant passages étaient 15 et 16.

Passons maintenant au réel test, avec comme critère la correspondance par bigramme 

In [56]:
from nltk import bigrams
from collections import Counter

def calculate_bigram_overlap(doc1, doc2):
    # Générer des bigrammes pour chaque document
    bigrams1 = list(bigrams(doc1))
    bigrams2 = list(bigrams(doc2))
    
    # Compter les bigrammes dans chaque document
    bigram_counts1 = Counter(bigrams1)
    bigram_counts2 = Counter(bigrams2)
    
    # Trouver les bigrammes communs et calculer le nombre total de bigrammes partagés
    common_bigrams = bigram_counts1 & bigram_counts2
    total_shared = sum(common_bigrams.values())
    
    # Calculer le nombre total de bigrammes dans le plus petit document (pour calculer le pourcentage de recouvrement)
    total_bigrams = min(len(bigrams1), len(bigrams2))
    
    # Calculer le pourcentage de recouvrement
    if total_bigrams > 0:
        overlap_percentage = (total_shared / total_bigrams) * 100
    else:
        overlap_percentage = 0
    
    # Retourner 1 si le pourcentage est supérieur à 15%, sinon 0
    return 1 if overlap_percentage > 15 else 0


1


In [59]:
filepath = './data_tokenized_2.json'
wiki_docs = load_data(filepath)

In [73]:
print(wiki_docs.keys())
print(wiki_docs["267"])

dict_keys(['672', '359', '285', '579', '204', '2', '430', '341', '420', '561', '765', '189', '817', '328', '97', '742', '603', '788', '6', '135', '512', '263', '462', '598', '348', '478', '547', '707', '59', '353', '600', '757', '480', '700', '206', '807', '852', '23', '310', '612', '588', '526', '734', '824', '237', '283', '559', '804', '661', '544', '558', '479', '79', '465', '110', '311', '516', '724', '426', '491', '616', '781', '56', '266', '175', '472', '585', '314', '744', '562', '844', '254', '317', '299', '521', '775', '257', '64', '610', '1', '680', '433', '840', '15', '417', '260', '854', '255', '514', '120', '129', '365', '413', '33', '436', '779', '477', '640', '719', '198', '344', '519', '693', '9', '119', '381', '522', '557', '468', '67', '473', '753', '0', '784', '760', '117', '535', '48', '74', '225', '83', '727', '704', '139', '130', '52', '801', '167', '714', '568', '185', '42', '17', '318', '12', '684', '660', '584', '30', '628', '137', '454', '252', '31', '8', '155

In [77]:
# Supposons que nous traitons la question ID 0 dans data_dict
question_data = data_dict[0]
query = question_data['Question']
doc_id = str(question_data['DocumentID'])  # Assurez-vous que l'ID est en format string si nécessaire
pertinents = question_data['RelevantPassages']

passages = wiki_docs[doc_id]
# Classement des passages pour la requête
ranked_passages, scores = rank_passages(query, passages)

# Affichage des résultats
for idx in ranked_passages:
    per = 0
    for rel_id in pertinents:
        if calculate_bigram_overlap(passages[idx],doc_passages[doc_id][str(rel_id)]):
            per = 1
    print(f"Passage ID: {idx}, Pertinent: {per}, Score: {scores[idx]}, Text: {' '.join(passages[idx])}")

Passage ID: 18, Pertinent: 1, Score: 6.059034772552553, Text: syria is significant in the history of christianity saulus of tarsus better known a the apostle paul wa converted on the road to damascus and emerged a a significant figure in the christian church at antioch in ancient syria from which he left on many of his missionary journey act middle age muhammad first interaction with the people and tribe of syria wa during the invasion of dumatul jandal in july where he ordered his follower to invade duma because muhammad received intelligence that some tribe there were involved in highway robbery and preparing to attack medina itself william montgomery watt claim that this wa the most significant expedition muhammad ordered at the time even though it received little notice in the primary source dumat wa kilometre mi from medina and watt say that there wa no immediate threat to muhammad other than the possibility that his communication to syria and supply to medina being interrupted wa