In [9]:
import datasets
import wikipediaapi
import pandas as pd

In [23]:
# Load the WikiQA dataset from disk and keep a separately named handle on each
# split, so the train split is not overwritten by the test/validation splits.
wikiqa_data = datasets.load_from_disk("wikiqa")
wikiqa_data_train = wikiqa_data["train"]
wikiqa_data_test = wikiqa_data["test"]
wikiqa_data_validation = wikiqa_data["validation"]


In [5]:
# Shrink the majority class and oversample the minority class
def reduce_and_balance_classes(df, majority_fraction=0.25, random_state=123):
    """Return a class-balanced frame built by under- and over-sampling ``df``.

    Rows with ``label == 0`` are treated as the majority class and rows with
    ``label == 1`` as the minority class. The majority class is cut down to
    ``majority_fraction`` of its size (sampling without replacement) and the
    minority class is then sampled with replacement up to that same size.

    Sampling uses ``DataFrame.sample`` so the function only depends on pandas.

    Args:
        df: DataFrame with a ``label`` column containing 0 and 1.
        majority_fraction: Share of majority rows to keep. The default 0.25
            drops 75 % of the majority class. Values above 1 make pandas raise
            ``ValueError`` because sampling is done without replacement.
        random_state: Seed passed to both sampling calls for reproducibility.

    Returns:
        A new DataFrame with the kept majority rows followed by the resampled
        minority rows. Original index labels are preserved, so upsampled rows
        can repeat an index label. If a class is empty (or the majority target
        size rounds down to 0) that class simply contributes no rows.
    """
    df_majority = df[df.label == 0]
    df_minority = df[df.label == 1]

    # Target size shared by both classes after resampling.
    n_keep = int(len(df_majority) * majority_fraction)

    # Downsample the majority class without replacement.
    if n_keep > 0:
        df_majority_reduced = df_majority.sample(
            n=n_keep, replace=False, random_state=random_state
        )
    else:
        df_majority_reduced = df_majority.iloc[0:0]

    # Upsample the minority class with replacement to the same size; there is
    # nothing to draw from when the minority class is empty.
    if n_keep > 0 and not df_minority.empty:
        df_minority_upsampled = df_minority.sample(
            n=n_keep, replace=True, random_state=random_state
        )
    else:
        df_minority_upsampled = df_minority.iloc[0:0]

    # Skip empty pieces so concat never receives only-empty or zero inputs.
    parts = [
        part
        for part in (df_majority_reduced, df_minority_upsampled)
        if not part.empty
    ]
    if not parts:
        return df.iloc[0:0]
    return pd.concat(parts)

# The per-split DataFrames were not defined in any earlier cell; derive them
# from the loaded dataset so this cell runs on a fresh kernel.
train_df = wikiqa_data["train"].to_pandas()
validation_df = wikiqa_data["validation"].to_pandas()
test_df = wikiqa_data["test"].to_pandas()

# NOTE(review): validation and test are resampled as well, which changes the
# label distribution the model is evaluated on — confirm this is intended.
balanced_train_df = reduce_and_balance_classes(train_df)
balanced_validation_df = reduce_and_balance_classes(validation_df)
balanced_test_df = reduce_and_balance_classes(test_df)

'Glacier cave'

In [17]:
# Initialise the Wikipedia API client

# The User-Agent string is sent as an HTTP header value, so keep it free of
# stray leading/trailing whitespace.
user_agent = "WikiQAResearchBot/1.0"
wiki_wiki = wikipediaapi.Wikipedia('en', headers={'User-Agent': user_agent})

def get_wikipedia_context(title):
    """Return the summary of the Wikipedia page named ``title``.

    The page is looked up through the module-level ``wiki_wiki`` client.
    ``None`` is returned when the page does not exist.
    """
    article = wiki_wiki.page(title)
    return article.summary if article.exists() else None

# Look up the Wikipedia summary for the document_title of the first five rows.
df = pd.DataFrame(wikiqa_data_train[0:5])

# Rows can share a title, so fetch each distinct title once and map the result
# back onto the rows instead of issuing one request per row.
title_to_context = {
    title: get_wikipedia_context(title)
    for title in df["document_title"].unique()
}
test = df["document_title"].map(title_to_context)

# Show the fetched summaries (one entry per row; df itself is not modified)
print(test)

0    A glacier cave is a cave formed within the ice...
1    A glacier cave is a cave formed within the ice...
2    A glacier cave is a cave formed within the ice...
3    A glacier cave is a cave formed within the ice...
4    A glacier cave is a cave formed within the ice...
Name: document_title, dtype: object


In [24]:
import pandas as pd
import wikipediaapi
from datasets import DatasetDict, Dataset
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# Set up the Wikipedia API client with an explicit User-Agent header
user_agent = "WikiQAResearchBot/1.0"
wiki_wiki = wikipediaapi.Wikipedia(
    'en',
    headers={'User-Agent': user_agent},
)

def get_wikipedia_context(title):
    """Return the summary of the Wikipedia page named ``title``.

    The page is looked up through the module-level ``wiki_wiki`` client.
    ``None`` is returned when the page does not exist.
    """
    article = wiki_wiki.page(title)
    return article.summary if article.exists() else None

def fetch_wikipedia_summaries(titles, max_workers=10):
    """Fetch the Wikipedia summary for each title concurrently.

    Args:
        titles: Any iterable of page titles (list, set, generator, ...).
            Duplicates are collapsed so each title is requested only once.
        max_workers: Size of the thread pool used for the lookups.

    Returns:
        dict mapping each distinct title to the value returned by
        ``get_wikipedia_context`` for it, or to ``None`` when the lookup
        raised an exception (the exception is printed, not re-raised).
    """
    # Materialise once: supports one-shot iterables and drops duplicate titles
    # while keeping first-seen order.
    unique_titles = list(dict.fromkeys(titles))

    context_dict = {}
    # NOTE(review): all workers go through the shared module-level wiki_wiki
    # client via get_wikipedia_context — confirm it is safe for concurrent use.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_title = {
            executor.submit(get_wikipedia_context, title): title
            for title in unique_titles
        }
        for future in tqdm(
            as_completed(future_to_title),
            total=len(future_to_title),
            desc="Fetching Wikipedia summaries",
        ):
            title = future_to_title[future]
            try:
                context_dict[title] = future.result()
            except Exception as exc:
                # Best effort: report the failure and record a missing context
                # so one bad title does not abort the whole batch.
                print(f"{title} generated an exception: {exc}")
                context_dict[title] = None
    return context_dict

def add_context_column(dataset_dict):
    """Return a copy of ``dataset_dict`` whose splits carry a 'context' column.

    The distinct ``document_title`` values of all splits are collected, their
    Wikipedia summaries are fetched once via ``fetch_wikipedia_summaries``,
    and each example receives the summary of its title as ``context``
    (``None`` when no summary was obtained).

    Args:
        dataset_dict: Mapping of split name to a dataset that supports
            ``subset['document_title']`` and ``subset.map(fn)``.

    Returns:
        A new mapping of the same type as ``dataset_dict`` holding the mapped
        splits. The mapping passed in is left unchanged, so re-running the
        calling cell always starts from the original data.
    """
    # Collect every distinct document title across the splits
    all_titles = set()
    for subset in dataset_dict.values():
        all_titles.update(subset['document_title'])

    # One lookup per distinct title
    context_dict = fetch_wikipedia_summaries(all_titles)

    def attach_context(example):
        # Every title was collected above, so the direct lookup cannot miss.
        return {'context': context_dict[example['document_title']]}

    # Build a fresh container instead of reassigning entries of the argument
    return type(dataset_dict)(
        {key: subset.map(attach_context) for key, subset in dataset_dict.items()}
    )

# Enrich every split of the dataset with its Wikipedia context
dataset = add_context_column(wikiqa_data)

# Print each split under a banner for a quick visual check
for split in dataset:
    print(f"=== {split.upper()} ===", dataset[split].to_pandas(), sep="\n")


Fetching Wikipedia summaries: 100%|██████████| 2811/2811 [02:29<00:00, 18.81it/s]
Map: 100%|██████████| 6165/6165 [00:00<00:00, 11239.58 examples/s]
Map: 100%|██████████| 2733/2733 [00:00<00:00, 23007.10 examples/s]
Map: 100%|██████████| 20360/20360 [00:00<00:00, 25699.63 examples/s]


=== TEST ===
     question_id                                         question  \
0             Q0  HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US   
1             Q0  HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US   
2             Q0  HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US   
3             Q0  HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US   
4             Q0  HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US   
...          ...                                              ...   
6160       Q3045                            what is an open mare?   
6161       Q3045                            what is an open mare?   
6162       Q3045                            what is an open mare?   
6163       Q3045                            what is an open mare?   
6164       Q3045                            what is an open mare?   

                                document_title  \
0     African immigration to the United States   
1     African immigration to the United States   
2     Af

In [32]:
# Persist the context-enriched splits under the local 'wikiqa_context' path.
dataset.save_to_disk('wikiqa_context')

Saving the dataset (1/1 shards): 100%|██████████| 6165/6165 [00:00<00:00, 680667.67 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2733/2733 [00:00<00:00, 286905.76 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 20360/20360 [00:00<00:00, 813078.70 examples/s]


In [33]:
# Reload the dataset that was just saved, to check the round trip.
# NOTE(review): despite the name, the displayed repr shows this is a
# DatasetDict, not a DataFrame.
new_df = datasets.load_from_disk("wikiqa_context")

In [34]:
# Display the reloaded object (splits, feature names and row counts).
new_df

DatasetDict({
    test: Dataset({
        features: ['question_id', 'question', 'document_title', 'answer', 'label', 'context'],
        num_rows: 6165
    })
    validation: Dataset({
        features: ['question_id', 'question', 'document_title', 'answer', 'label', 'context'],
        num_rows: 2733
    })
    train: Dataset({
        features: ['question_id', 'question', 'document_title', 'answer', 'label', 'context'],
        num_rows: 20360
    })
})