# Google natural questions (scraped)
This notebooks aims to answer the following research question: **Which natural questions do users make when making dictionary-based queries**

In [1]:
!pip3 install gsutil



In [2]:
from tools.datasets import *
import fnmatch
import os

In [3]:
googlenlq_dir = os.path.join(DATA_FOLDER, "GoogleNLQ")
os.makedirs(googlenlq_dir, exist_ok=True)
# Be wary! This will download the full dataset of 41 GBs.
!gsutil -m cp -R gs://natural_questions/v1.0 $googlenlq_dir

Copying gs://natural_questions/v1.0/LICENSE.txt...
Copying gs://natural_questions/v1.0/README.txt...                               
Copying gs://natural_questions/v1.0/dev/nq-dev-00.jsonl.gz...                   
Copying gs://natural_questions/v1.0/dev/nq-dev-01.jsonl.gz...                   
Copying gs://natural_questions/v1.0/dev/nq-dev-02.jsonl.gz...                   
Copying gs://natural_questions/v1.0/dev/nq-dev-03.jsonl.gz...                   
Copying gs://natural_questions/v1.0/dev/nq-dev-04.jsonl.gz...                   
Copying gs://natural_questions/v1.0/sample/nq-dev-sample.jsonl.gz...            
Copying gs://natural_questions/v1.0/sample/nq-train-sample.jsonl.gz...          
Copying gs://natural_questions/v1.0/train/nq-train-00.jsonl.gz...               
Copying gs://natural_questions/v1.0/train/nq-train-01.jsonl.gz...               
Copying gs://natural_questions/v1.0/train/nq-train-02.jsonl.gz...
Copying gs://natural_questions/v1.0/train/nq-train-04.jsonl.gz...
Copying

In [4]:
import pyspark
from pyspark.sql import SparkSession

sc = pyspark.SparkContext()
spark = SparkSession(sc)

In [69]:
from pyspark.sql.functions import explode, lower

In [66]:
# load the first 15 slices only

googlenlq_filename = [f"data/GoogleNLQ/v1.0/train/nq-train-%.2d.jsonl.gz" % i for i in range(15)]

nlq_df = spark.read.json(googlenlq_filename)

In [26]:
nlq_df.printSchema()

root
 |-- annotations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotation_id: decimal(20,0) (nullable = true)
 |    |    |-- long_answer: struct (nullable = true)
 |    |    |    |-- candidate_index: long (nullable = true)
 |    |    |    |-- end_byte: long (nullable = true)
 |    |    |    |-- end_token: long (nullable = true)
 |    |    |    |-- start_byte: long (nullable = true)
 |    |    |    |-- start_token: long (nullable = true)
 |    |    |-- short_answers: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- end_byte: long (nullable = true)
 |    |    |    |    |-- end_token: long (nullable = true)
 |    |    |    |    |-- start_byte: long (nullable = true)
 |    |    |    |    |-- start_token: long (nullable = true)
 |    |    |-- yes_no_answer: string (nullable = true)
 |-- document_html: string (nullable = true)
 |-- document_title: string (nullable = true)
 |-- document_to

In [27]:
nlq_df.createOrReplaceTempView("NLQ")

res = spark.sql("SELECT document_title, question_text from NLQ LIMIT 100")

In [28]:
res.show(truncate=False)

+------------------------------------------+-----------------------------------------------------------------------------+
|document_title                            |question_text                                                                |
+------------------------------------------+-----------------------------------------------------------------------------+
|United States twenty-dollar bill          |who appeared in the $20 bill in 1875                                         |
|All-India Muslim League                   |who presided over the first session of muslim league at dhaka                |
|I Need Love                               |ll cool j in the back of my mind                                             |
|Gospel of Matthew                         |who is generally accepted to be the author of the gospel of matthew          |
|Fuller House (TV series)                  |why are there only nine episodes of fuller house season 3                    |
|2018 FIFA World

In [29]:
#nlq_df.select(explode("annotations")).select("col.*").select("long_answer.candidate_index").limit(10).collect()
nlq_df.select("question_text", explode("long_answer_candidates")) \
      .select("question_text", "col.start_token", "col.end_token") \
      .limit(10) \
      .collect()
      

[Row(question_text='who appeared in the $20 bill in 1875', start_token=22, end_token=151),
 Row(question_text='who appeared in the $20 bill in 1875', start_token=25, end_token=33),
 Row(question_text='who appeared in the $20 bill in 1875', start_token=33, end_token=41),
 Row(question_text='who appeared in the $20 bill in 1875', start_token=41, end_token=50),
 Row(question_text='who appeared in the $20 bill in 1875', start_token=50, end_token=59),
 Row(question_text='who appeared in the $20 bill in 1875', start_token=59, end_token=70),
 Row(question_text='who appeared in the $20 bill in 1875', start_token=70, end_token=84),
 Row(question_text='who appeared in the $20 bill in 1875', start_token=84, end_token=96),
 Row(question_text='who appeared in the $20 bill in 1875', start_token=96, end_token=101),
 Row(question_text='who appeared in the $20 bill in 1875', start_token=105, end_token=114)]

In [30]:
ris = nlq_df.select("question_text").limit(1)

In [31]:
r = ris.collect()

In [32]:
r[0].asDict()["question_text"]

'who appeared in the $20 bill in 1875'

In [33]:
sampled_questions = nlq_df.select("question_text") \
                          .filter(nlq_df.question_text.contains("how do you say")) \
                          .sample(withReplacement=False, fraction=1.).collect()

In [34]:
for question in sampled_questions:
    print(question)

Row(question_text='how do you say alice in wonderland in spanish')
Row(question_text='how do you say horse meat in french')
Row(question_text='how do you say bless you in french when someone sneezes')
Row(question_text='how do you say blue in all languages')
Row(question_text='how do you say great britain in french')
Row(question_text='how do you say son in law in korean')
Row(question_text='how do you say bless you in italian')
Row(question_text='how do you say i like to play video games in french')


In [37]:
from tools.datasets import *
from tools.sparql_wrapper import SPARQLDataProviders

class WikidataQuery(SPARQLDataProviders):
    """Wrapper to make SPARQL queries to Wikidata"""
    def __init__(self):
        self.sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
        
    @property
    def prefix(self):
        return ""
wikidata_sparql = WikidataQuery()

language_list = wikidata_sparql.run_query("""
SELECT DISTINCT ?lang

WHERE
{
    ?langEntity wdt:P31 wd:Q34770;
                  wdt:P1098 ?num.
    ?langEntity rdfs:label ?lang.
    FILTER(LANG(?lang) = "en").
}

ORDER BY DESC(?num)
LIMIT 30
""")

In [58]:
from functools import reduce

most_common_languages = set(language_list["lang.value"].str.lower().array)

how_do_you_say = {"how do you say", "how does one say",
                      "what is the translation of",
                      "how does one translate"}

in_lang = {"in " + lang for lang in most_common_languages }.union({'into ' + lang for lang in most_common_languages})

keywords = {'translate', 'definition', 'mean', 'meaning',
              'singular', 'plural', 'conjugation', 'conjugate', 'declinate',
              'noun', 'verb', 'adjective', 'pronoun', 'comparative',
              'superlative', 'irregular', 'definition', 'synonyms', 'language',
              'linguistic'}

search_terms = reduce(lambda a, b: a.union(b), [most_common_languages, how_do_you_say,
                                                         in_lang, keywords])

search_terms_regex = "|".join(search_terms)

In [73]:
def contains(column):
    return any([expression in column.question_text for expression in search_terms])

returned_rows = nlq_df.select("question_text") \
      .where(lower(nlq_df.question_text).rlike(search_terms_regex)) \
      .limit(100) \
      .collect()


In [74]:
returned_rows

[Row(question_text='when did the us start fighting germany in ww2'),
 Row(question_text='who created the dothraki language on game of thrones'),
 Row(question_text='the man who set up the first spanish colony in the new world was'),
 Row(question_text='see no evil hear no evil speak no evil skulls meaning'),
 Row(question_text='the word theatre comes from greek and literally means seeing place'),
 Row(question_text='what is the meaning of kinetic molecular theory'),
 Row(question_text='who won the english football cup in 1949'),
 Row(question_text='who is going to get eliminated in bigg boss telugu'),
 Row(question_text='english colonies in north america established a form of blank based on elections'),
 Row(question_text='what is the longest english word in which no letter is repeated'),
 Row(question_text='what is the meaning of love me like you do in hindi'),
 Row(question_text='what is the meaning of llc in a company'),
 Row(question_text='took a pill in ibiza meaning of song'),
 R