In [None]:
# Install the notebook's dependencies. wandb is upgraded quietly; progress bars
# are disabled for all three installs to keep the cell output short.
!pip install -qqq -U wandb --progress-bar off
!pip install datasets --progress-bar off
!pip install transformers --progress-bar off
import wandb
from huggingface_hub import login
from google.colab import userdata

# Log in to the Hugging Face Hub with the token read from the Colab `userdata`
# store under 'HF_TOKEN', so no token is hardcoded in the notebook.
login(userdata.get('HF_TOKEN'))

# Same approach for Weights & Biases: the key is read from the 'wandb' entry.
wb_token = userdata.get('wandb')
wandb.login(key=wb_token)

# Remove token columns

In [None]:
from datasets import load_dataset

# Load the "mrqa" dataset by name; no split is requested here.
mrqa = load_dataset("mrqa")
# Bare expression: shows the loaded object as the cell output.
mrqa

In [None]:
# Drop the two pre-tokenized columns and keep everything else.
# NOTE(review): `mrqa` is rebound to the result, so re-running this cell without
# reloading the dataset will presumably fail because the columns are already
# gone — confirm.
mrqa = mrqa.remove_columns(['context_tokens', 'question_tokens'])
# Show the result to check which columns remain.
mrqa

In [None]:
# uncomment to publish the processed dataset
#mrqa.push_to_hub("mrqa-no-tokens")

# Convert to SQuAD format

In [None]:
# Load the 'enriquesaou/mrqa-no-tokens' dataset from the Hub (presumably the
# copy published by the commented-out push_to_hub call — confirm).
mymrqa = load_dataset('enriquesaou/mrqa-no-tokens')
# Show the loaded object as the cell output.
mymrqa

In [None]:
def convert_dataset_format(input_data):
    """Build the SQuAD-style fields for a single MRQA example.

    Args:
        input_data: mapping with the keys 'detected_answers' (which holds a
            'char_spans' list whose items each carry a 'start' list),
            'answers', 'context', 'qid' and 'question'.

    Returns:
        A dict with the keys 'answers', 'context', 'id' and 'question'.
        'answers' holds 'answer_start' (the first start offset of every
        detected answer span) and 'text' (the 'answers' value, unchanged).

    NOTE(review): 'answer_start' has one entry per detected answer while
    'text' is the full 'answers' list, so the two lists can differ in
    length — confirm this is intended.
    """
    # Only the first occurrence of each detected answer is kept.
    first_starts = []
    for char_span in input_data['detected_answers']['char_spans']:
        first_starts.append(char_span['start'][0])

    squad_answers = {
        "answer_start": first_starts,
        "text": input_data['answers'],
    }
    return {
        "answers": squad_answers,
        "context": input_data['context'],
        "id": input_data['qid'],
        "question": input_data['question'],
    }

# Apply the conversion to the whole dataset. The original 'detected_answers'
# and 'qid' columns are still present afterwards; a later cell removes them.
converted_data = mymrqa.map(convert_dataset_format)
# Print both objects to compare the dataset after and before the conversion.
print(converted_data, mymrqa)

In [None]:
# Drop the MRQA-specific columns now that 'id' and 'answers' carry that data.
converted_data = converted_data.remove_columns(['detected_answers', 'qid'])

In [None]:
# Show the converted dataset to check the remaining columns.
converted_data

In [None]:
# Print the first training example as a sanity check of the converted layout.
print(converted_data['train'][0])

In [None]:
# uncomment to publish
#converted_data.push_to_hub("mrqa-squadded")

# Sample the dataset

In [None]:
from datasets import load_dataset

# Shuffle with a fixed seed so that the "first N rows" selection in the next
# cell is a reproducible random sample.
mrqa = load_dataset("enriquesaou/mrqa-squadded").shuffle(seed=27)
# Show the shuffled object as the cell output.
mrqa

In [None]:
# Keep only the first N rows of each split (the dataset was shuffled when it
# was loaded, so this amounts to a random sample).
for split_name, n_rows in (('train', 15000), ('validation', 1500), ('test', 500)):
    mrqa[split_name] = mrqa[split_name].select(range(n_rows))

In [None]:
# Show the sampled dataset to confirm the new split sizes.
mrqa

In [None]:
# uncomment to publish the sample
#mrqa.push_to_hub("mrqa-squadded-sample")

# Context token length stats

In [None]:
import matplotlib.pyplot as plt

def plot_data_lengths(tok_dataset):
    """Print and plot the distribution of tokenized example lengths.

    Args:
        tok_dataset: iterable of examples, each exposing an 'input_ids'
            sequence whose length is measured.

    Prints the number of examples, then the full list of lengths, and shows
    a histogram of those lengths. Returns nothing.
    """
    label_size = 12

    lengths = []
    for example in tok_dataset:
        lengths.append(len(example['input_ids']))

    print(len(lengths))
    print(lengths)

    plt.figure(figsize=(10, 6))
    plt.hist(lengths)

    # Axis titles and tick labels all share one font size.
    plt.xlabel('Input length (# of tokens)', fontsize=label_size)
    plt.ylabel('# of examples', fontsize=label_size)
    plt.xticks(fontsize=label_size)
    plt.yticks(fontsize=label_size)

    plt.show()

In [None]:
from transformers import AutoTokenizer

# Tokenizer used to measure prompt lengths in this section.
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")

In [None]:
from datasets import load_dataset

# Load the 'enriquesaou/mrqa-squadded-sample' dataset; `mrqa` is rebound to it.
mrqa = load_dataset("enriquesaou/mrqa-squadded-sample")

In [None]:
def tokenize_prompt(data_point):
    """Tokenize one example as instruction + context + question + answer.

    Args:
        data_point: mapping with 'context', 'question' and
            'answers' -> 'text' (a sequence; only its first item is used).

    Returns:
        The object returned by the module-level `tokenizer`, with an extra
        "labels" entry holding a copy of its "input_ids".
    """
    prompt = (
        "Answer the question extracting from the context below.\nContext: "
        + data_point['context']
        + "\nQuestion: "
        + data_point['question']
        + "\nAnswer: "
    )
    # The first reference answer is appended directly after "Answer: ".
    tokenized = tokenizer(prompt + data_point['answers']['text'][0])
    # Labels are a separate copy, not an alias of the input ids.
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

# Tokenize the dataset. The original columns (names taken from the train
# split) are removed, so only what tokenize_prompt returns is kept.
tokenized_dataset = mrqa.map(tokenize_prompt,
                             remove_columns=mrqa['train'].column_names)

In [None]:
# Plot the length distribution of each split in turn. A plain loop is used
# instead of a tuple of calls: the function returns None, so a trailing tuple
# expression would make the cell echo `(None, None, None)` after the plots.
for split_name in ('train', 'validation', 'test'):
    plot_data_lengths(tokenized_dataset[split_name])

# Answer token length stats

In [None]:
from transformers import AutoTokenizer

# Tokenizer used to measure answer lengths in this section (same checkpoint
# name as in the context-length section).
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")

In [None]:
from datasets import load_dataset
import matplotlib.pyplot as plt

# Token-length statistics of the reference answers in the sampled test split.
dataset = load_dataset('enriquesaou/mrqa-squadded-sample', split='test')

# answerlen maps "number of tokens" -> "how many answers have that length".
answerlen = {}
for example in dataset:
    # All answer strings of one example are tokenized in a single call; each
    # element of 'input_ids' is then counted as one answer.
    for token_ids in tokenizer(example['answers']['text'])['input_ids']:
        n_tokens = len(token_ids)
        answerlen[n_tokens] = answerlen.get(n_tokens, 0) + 1

# Totals derived from the histogram: answer count and summed token count.
nans = sum(answerlen.values())
anslen = sum(n_tokens * count for n_tokens, count in answerlen.items())

print(f"Average ans len {anslen/nans}")
print()

# Bars are drawn in ascending order of token count.
sorted_data = dict(sorted(answerlen.items()))
keys = list(sorted_data)
values = list(sorted_data.values())

plt.figure(figsize=(10, 5))
plt.bar(keys, values)
plt.xlabel('Answer length (# of tokens)', fontsize=20)
plt.ylabel('# of answers', fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
# The x-axis is limited to the 0-30 token range.
plt.xlim(0,30)
plt.show()

In [None]:
from datasets import load_dataset
import matplotlib.pyplot as plt

# Token-length statistics of the reference answers in the sampled validation
# split.
dataset = load_dataset('enriquesaou/mrqa-squadded-sample', split='validation')

anslen, nans = 0, 0   # running token total and running answer count
answerlen = {}        # token count -> number of answers with that length
for example in dataset:
    encoded = tokenizer(example['answers']['text'])
    # Each element of 'input_ids' is counted as one answer.
    for answer in encoded['input_ids']:
        size = len(answer)
        nans += 1
        anslen += size
        answerlen[size] = answerlen.get(size, 0) + 1

print(f"Average ans len {anslen/nans}")
print()

# Order the histogram by ascending token count before plotting.
keys = sorted(answerlen)
values = [answerlen[key] for key in keys]
sorted_data = dict(zip(keys, values))

plt.figure(figsize=(10, 5))
plt.bar(keys, values)
plt.xlabel('Answer length (# of tokens)', fontsize=20)
plt.ylabel('# of answers', fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
# The x-axis is limited to the 0-30 token range.
plt.xlim(0,30)
plt.show()

# Subset stats

In [None]:
from datasets import load_dataset
# Load the 'enriquesaou/mrqa-squadded' dataset; no split is requested, and the
# next cell iterates over its splits.
dataset = load_dataset('enriquesaou/mrqa-squadded')

In [None]:
# Count how many examples each subset contributes, summed over all splits.
# Only the 'subset' column is read (as pie_subsets does), rather than iterating
# over complete examples just to look at one field.
subsets = {}
for split in dataset:
    for subset_name in dataset[split]['subset']:
        subsets[subset_name] = subsets.get(subset_name, 0) + 1

print(subsets)

In [None]:
# Show the loaded dataset as the cell output.
dataset

# Pie plots of the subset distribution
Note: these plots were not included in the final work.

In [None]:
def pie_subsets(dataset):
    """Print the per-subset example counts and show them as a pie chart.

    Args:
        dataset: object whose 'subset' entry is an iterable of subset names.

    The counts are printed as a dict sorted by subset name. Each slice gets a
    colour from a fixed six-colour palette according to its position in that
    sorted order (the palette wraps around beyond six subsets).
    """
    tally = {}
    for name in dataset['subset']:
        tally[name] = tally.get(name, 0) + 1
    subset_counts = {name: tally[name] for name in sorted(tally)}
    print(subset_counts)

    import matplotlib.pyplot as plt

    # colors!
    palette = ['#8c564b', '#2ca02c', '#d62728', '#1f77b4', '#ff7f0e', '#9467bd']

    labels, sizes, chart_colors = [], [], []
    for position, (name, count) in enumerate(subset_counts.items()):
        labels.append(name)
        sizes.append(count)
        chart_colors.append(palette[position % len(palette)])

    fig, ax = plt.subplots()
    ax.pie(sizes, labels=labels, colors=chart_colors, autopct='%1.1f%%', startangle=90)
    ax.axis('equal')  # circle!

    plt.show()

In [None]:
from datasets import load_dataset

# Pie charts for the train split: the sample first, then the 'mrqa' dataset.
for dataset_name in ('enriquesaou/mrqa-squadded-sample', 'mrqa'):
    dataset = load_dataset(dataset_name, split='train')
    pie_subsets(dataset)

In [None]:
# Pie charts for the validation split: the sample first, then the 'mrqa' dataset.
for dataset_name in ('enriquesaou/mrqa-squadded-sample', 'mrqa'):
    dataset = load_dataset(dataset_name, split='validation')
    pie_subsets(dataset)

In [None]:
# Pie charts for the test split: the sample first, then the 'mrqa' dataset.
for dataset_name in ('enriquesaou/mrqa-squadded-sample', 'mrqa'):
    dataset = load_dataset(dataset_name, split='test')
    pie_subsets(dataset)

# Bar plots of the subset distribution

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def _count_subsets(dataset):
    """Return a {subset name: number of examples} dict for ``dataset['subset']``."""
    counts = {}
    for subset in dataset['subset']:
        counts[subset] = counts.get(subset, 0) + 1
    return counts


def _subset_percentages(counts, labels):
    """Return each label's share of ``counts`` in percent (zeros if ``counts`` is empty)."""
    total = sum(counts.values())
    if total == 0:
        # An empty dataset has no distribution; avoid dividing by zero.
        return [0.0 for _ in labels]
    return [counts.get(label, 0) / total * 100 for label in labels]


def grouped_bar_subsets(dataset1, dataset2, dataset1name='Sample', dataset2name='Original'):
    """Show a grouped bar chart comparing the subset distribution of two datasets.

    Args:
        dataset1: object whose 'subset' entry is an iterable of subset names.
        dataset2: second such object.
        dataset1name: legend label for the bars of ``dataset1``.
        dataset2name: legend label for the bars of ``dataset2``.

    For every subset present in either dataset, two adjacent bars show the
    percentage of examples belonging to it in each dataset. A subset missing
    from one dataset is drawn as 0 %, and so is every subset of an empty
    dataset. Returns nothing.
    """
    counts1 = _count_subsets(dataset1)
    counts2 = _count_subsets(dataset2)

    # Union of both label sets, sorted so the two series share one x-axis.
    labels = sorted(set(counts1) | set(counts2))

    percentages1 = _subset_percentages(counts1, labels)
    percentages2 = _subset_percentages(counts2, labels)

    x = np.arange(len(labels)) * 2  # increase spacing between the labels
    width = 0.6  # width of the bars

    fig, ax = plt.subplots(figsize=(20, 10))  # figure size for better spacing
    ax.bar(x - width / 2, percentages1, width, label=dataset1name, color='#1f77b4')
    ax.bar(x + width / 2, percentages2, width, label=dataset2name, color='#ff7f0e')

    ax.set_xlabel('Subsets', fontsize=20)
    ax.set_ylabel('Percentage distribution', fontsize=20)
    ax.set_xticks(x)
    ax.set_xticklabels(labels, fontsize=20)
    # Only the y tick font size is changed. Assigning fixed tick labels without
    # fixing the tick positions could detach the labels from the real values.
    ax.tick_params(axis='y', labelsize=20)
    ax.legend(fontsize=20)

    plt.show()

In [None]:
from datasets import load_dataset

# Compare the subset distribution of the sample with that of the 'mrqa'
# dataset on the train split.
mydata = load_dataset('enriquesaou/mrqa-squadded-sample', split='train')
ogdata = load_dataset('mrqa', split='train')
grouped_bar_subsets(mydata, ogdata)

In [None]:
from datasets import load_dataset

# Compare the subset distribution of the sample with that of the 'mrqa'
# dataset on the validation split.
mydata = load_dataset('enriquesaou/mrqa-squadded-sample', split='validation')
ogdata = load_dataset('mrqa', split='validation')
grouped_bar_subsets(mydata, ogdata)

In [None]:
from datasets import load_dataset

# Compare the subset distribution of the sample with that of the 'mrqa'
# dataset on the test split.
mydata = load_dataset('enriquesaou/mrqa-squadded-sample', split='test')
ogdata = load_dataset('mrqa', split='test')
grouped_bar_subsets(mydata, ogdata)