In [37]:
import json
from collections import Counter
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense
import numpy as np
import re
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D

In [38]:
# Function to preprocess and tokenize text

def preprocess(text):
    # Remove punctuation and numbers, tokenize, and convert to lower case
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    return word_tokenize(text.lower())

# Function to load JSON data
def load_data(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

# Function to analyze documents
def analyze_documents(data):
    max_length = 0  # To store the maximum passage length
    vocabulary = Counter()  # To store and count unique tokens across all passages

    # Iterate through each document's passages
    for doc_id, passages in data.items():
        for passage_id, passage_text in passages.items():
            tokens = preprocess(passage_text)
            max_length = max(max_length, len(tokens))  # Update max_length if current passage is longer
            vocabulary.update(tokens)  # Update the vocabulary counter

    vocab_size = len(vocabulary)  # Number of unique tokens
    return max_length, vocab_size

In [39]:
json_file_path = 'WikiPassageQA/document_passages.json'
data = load_data(json_file_path)
max_passage_length, vocab_size = analyze_documents(data)

# Print the results
print(f"Maximum passage length: {max_passage_length}")
print(f"Vocabulary size: {vocab_size}")


Maximum passage length: 1321
Vocabulary size: 155338


In [40]:
def create_all_texts(data):
    all_texts = []
    for doc_passages in data.values():
        for text in doc_passages.values():
            all_texts.append(text)
    return all_texts

all_texts = create_all_texts(data)

In [41]:
# les parametres
MAX_LEN = max_passage_length  
VOCAB_SIZE = vocab_size

# on prepare les labels
# on va faire un dico avec la query en id et en valeur un tuple avec le document id et les relevantpassages
def load_and_process_data(filepath):
    # Load the data from a tab-separated file
    df = pd.read_csv(filepath, sep='\t', header=0)
    
    print("Columns in the dataframe:", df.columns)

    data_dict = {}
    # Using vectorized operations over iterrows for better performance
    for index, row in df.iterrows():
        try:
            # Ensure 'RelevantPassages' are split and converted to integers correctly
            relevant_passages = list(map(int, row['RelevantPassages'].split(',')))
            # Store the question as key, and tuple of DocumentID and relevant passages list as value
            data_dict[row['Question']] = (int(row['DocumentID']), relevant_passages)
        except ValueError as e:
            print(f"Error processing row {index}: {e}")
            continue  # Skip rows with errors

    return data_dict

# fichier avec le train, on fera pareil avec le test apres du coup
filepath = 'WikiPassageQA/train.txt'  
df = load_and_process_data(filepath)

# test pour voir si ca marche (et oui ca marche)
for key, value in list(df.items())[:5]:
    print(f"Question: {key}\nDocumentID and RelevantPassages: {value}\n")


Columns in the dataframe: Index(['QID', 'Question', 'DocumentID', 'DocumentName', 'RelevantPassages'], dtype='object')
Question: What is the role of conversionism in Evangelicalism?
DocumentID and RelevantPassages: (672, [4])

Question: How did the assault on the Bastille the first year of the Revolution ultimately culminate into the capture & execution of Louis XVI in January 1793?
DocumentID and RelevantPassages: (359, [1, 2])

Question: What is the prehistory of Albania?
DocumentID and RelevantPassages: (285, [4])

Question: What significance did Bulgaria have in the ending of World War I?
DocumentID and RelevantPassages: (579, [14])

Question: What is the rationale of support of the Common Era?
DocumentID and RelevantPassages: (204, [9])



In [42]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(all_texts)  # Fit the tokenizer on all extracted texts

In [43]:
print(df)
questions = [question for question in df]  # List of questions
sequences = tokenizer.texts_to_sequences(questions)  # Convert texts to sequences
X_questions = pad_sequences(sequences, maxlen=MAX_LEN, padding='post')  # Pad sequences

print(X_questions)

{'What is the role of conversionism in Evangelicalism?': (672, [4]), 'How did the assault on the Bastille the first year of the Revolution ultimately culminate into the capture & execution of Louis XVI in January 1793?': (359, [1, 2]), 'What is the prehistory of Albania?': (285, [4]), 'What significance did Bulgaria have in the ending of World War I?': (579, [14]), 'What is the rationale of support of the Common Era?': (204, [9]), "What has characterized Indonesia's foreign relations since the New Order era?": (2, [16]), 'How violent is the Pacific Ocean?': (430, [10, 11]), 'Why is Sub-Saharan Africa considered to have a paradoxical birth rate?': (341, [7, 8]), "What were the effects of the Napoleonic Wars on Britain's empire?": (420, [22, 23]), 'According to several economists, how can the HDI be a misguided statistic?': (561, [15]), 'How do the boundaries of the Appalachians vary according to different people?': (765, [0]), 'What influence have native peoples of Colorado had on the h

In [44]:
NUM_PASSAGES = 118 # max ici

Y_passage_labels = np.zeros((len(df), NUM_PASSAGES), dtype=int)
Y_document_labels = np.zeros(len(df), dtype=int)

questions = list(df.keys())  # Assuming questions are unique
for i, question in enumerate(questions):
    doc_id, relevant_passages = df[question]
    
    # Set the indices for relevant passages to 1
    Y_passage_labels[i, relevant_passages] = 1

    # Set the document ID, converting document ID to an index if necessary
    Y_document_labels[i] = doc_id  # Ensure this is an index if not already

print(Y_passage_labels)
print()
print(Y_document_labels)

[[0 0 0 ... 0 0 0]
 [0 1 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

[672 359 285 ... 641 740 186]
