In [1]:
import pandas as pd
from datasets import load_dataset

In [2]:
# download data from huggingface
newsqa = load_dataset("lucadiliello/newsqa")
newsqa["train"].to_parquet("data/raw/newsqa_train.parquet")
newsqa["validation"].to_parquet("data/raw/newsqa_validation.parquet")

Creating parquet from Arrow format:   0%|          | 0/75 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

13234462

In [3]:
# load data
newsqa_train = pd.read_parquet("data/raw/newsqa_train.parquet")
newsqa_validation = pd.read_parquet("data/raw/newsqa_validation.parquet")
newsqa = pd.concat([newsqa_train, newsqa_validation])

# format answers and drop irrelevant columns
newsqa["answers"] = newsqa["answers"].apply(lambda x: x[0])
newsqa = newsqa.drop(columns=["key", "labels"])
newsqa.columns = ["context", "questions", "answers"]
newsqa

Unnamed: 0,context,questions,answers
0,"NEW DELHI, India (CNN) -- A high court in nort...",What was the amount of children murdered?,19
1,"NEW DELHI, India (CNN) -- A high court in nort...",When was Pandher sentenced to death?,February.
2,"NEW DELHI, India (CNN) -- A high court in nort...",The court aquitted Moninder Singh Pandher of w...,rape and murder
3,"NEW DELHI, India (CNN) -- A high court in nort...",who was acquitted,Moninder Singh Pandher
4,"NEW DELHI, India (CNN) -- A high court in nort...",who was sentenced,Moninder Singh Pandher
...,...,...,...
4207,WASHINGTON (CNN) -- President Obama on Friday ...,What will the new system include?,give detainees greater latitude in selecting l...
4208,NEW YORK (CNN) -- John and Elizabeth Calvert e...,who are growing?,friends
4209,NEW YORK (CNN) -- John and Elizabeth Calvert e...,Where did the couple live?,Hilton Head Island
4210,NEW YORK (CNN) -- John and Elizabeth Calvert e...,when was the last seen of John and Elizabeth?,"March 3,"


In [4]:
# group by context
newsqa = newsqa.groupby("context").agg(
    {
        "questions": lambda x: list(x),
        "answers": lambda x: list(x),
    }
)
newsqa = newsqa.reset_index()
newsqa

Unnamed: 0,context,questions,answers
0,"""Everything can be improved."" -- Ross Lovegrov...","[Who is Ross Lovegrove?, Where is his work hel...","[Designer, in permanent collections of various..."
1,"'SINDH KALAY', England (CNN) -- The aroma of f...","[Where were they being deployed to?, What does...","[Afghanistan's Helmand province, how the peopl..."
2,"(AOL Autos) -- At the 2009 Detroit Auto Show,...","[What debuts at the 2009 Detroit Auto Show?, W...","[electric vehicles, Chrysler, Mercedes-Benz, T..."
3,(AOL Autos) -- Buying a used car may seem lik...,"[What does Texas have?, What will the average ...","[has more used vehicles than any other state, ..."
4,(AOL Autos) -- Collecting cars is an expensiv...,"[What can compete with the Mercedes SL Class?,...","[Cadillac XLR-V, hatchback,, hatchback,, editi..."
...,...,...,...
11470,"Yazoo City, Mississippi (CNN) -- Rescue crews ...","[Where did two sisters die?, What day were tor...","[inside a mobile home, Choctaw County, Saturda..."
11471,"You wanted to know more about greenwashing, an...",[Who is answering the questions on greenwashin...,"[Scot Case,, green, Scot Case,, green products..."
11472,"ZURICH, Switzerland (CNN) -- As I watched Cris...",[Who won the FIFA Player of the Year for 2008 ...,"[Cristiano Ronaldo, Cristiano Ronaldo, Cristia..."
11473,iReporter Doranne Lim chronicles the flooding ...,"[Did the earthquake cause the tsunami?, What l...","[waves triggered by an 8.0, Tsunami, Pasig Cit..."


In [5]:
# create the question-answer column
newsqa["questions_answers"] = newsqa.apply(
    lambda x: " | ".join(
        [
            f"question: {question}, answer: {answer}"
            for question, answer in zip(x["questions"], x["answers"])
        ]
    ),
    axis=1,
)
newsqa 

Unnamed: 0,context,questions,answers,questions_answers
0,"""Everything can be improved."" -- Ross Lovegrov...","[Who is Ross Lovegrove?, Where is his work hel...","[Designer, in permanent collections of various...","question: Who is Ross Lovegrove?, answer: Desi..."
1,"'SINDH KALAY', England (CNN) -- The aroma of f...","[Where were they being deployed to?, What does...","[Afghanistan's Helmand province, how the peopl...","question: Where were they being deployed to?, ..."
2,"(AOL Autos) -- At the 2009 Detroit Auto Show,...","[What debuts at the 2009 Detroit Auto Show?, W...","[electric vehicles, Chrysler, Mercedes-Benz, T...",question: What debuts at the 2009 Detroit Auto...
3,(AOL Autos) -- Buying a used car may seem lik...,"[What does Texas have?, What will the average ...","[has more used vehicles than any other state, ...","question: What does Texas have?, answer: has m..."
4,(AOL Autos) -- Collecting cars is an expensiv...,"[What can compete with the Mercedes SL Class?,...","[Cadillac XLR-V, hatchback,, hatchback,, editi...",question: What can compete with the Mercedes S...
...,...,...,...,...
11470,"Yazoo City, Mississippi (CNN) -- Rescue crews ...","[Where did two sisters die?, What day were tor...","[inside a mobile home, Choctaw County, Saturda...","question: Where did two sisters die?, answer: ..."
11471,"You wanted to know more about greenwashing, an...",[Who is answering the questions on greenwashin...,"[Scot Case,, green, Scot Case,, green products...",question: Who is answering the questions on gr...
11472,"ZURICH, Switzerland (CNN) -- As I watched Cris...",[Who won the FIFA Player of the Year for 2008 ...,"[Cristiano Ronaldo, Cristiano Ronaldo, Cristia...",question: Who won the FIFA Player of the Year ...
11473,iReporter Doranne Lim chronicles the flooding ...,"[Did the earthquake cause the tsunami?, What l...","[waves triggered by an 8.0, Tsunami, Pasig Cit...",question: Did the earthquake cause the tsunami...


In [6]:
# generate train, validation and test sets
len_newsqa = len(newsqa)
val_split = 0.05
test_split = 0.05

index_val = int(len_newsqa * (1 - val_split - test_split))
index_test = int(len_newsqa * (1 - test_split))

newsqa_train = newsqa.iloc[:index_val]
newsqa_validation = newsqa.iloc[index_val:index_test]
newsqa_test = newsqa.iloc[index_test:]

In [7]:
# save data
newsqa_train.to_parquet("data/processed/newsqa_train.parquet")
newsqa_validation.to_parquet("data/processed/newsqa_validation.parquet")
newsqa_test.to_parquet("data/processed/newsqa_test.parquet")