# Google natural questions (scraped)
This notebook aims to answer the following research question: **Which natural questions do users ask when making dictionary-based queries?**

In [1]:
# Install the gsutil command-line tool used by the (commented-out) dataset download command further down.
!pip3 install gsutil



In [3]:
from tools.datasets import *
import fnmatch
import os

In [4]:
# Folder that holds the Google Natural Questions files (created on first run).
googlenlq_dir = os.path.join(DATA_FOLDER, "GoogleNLQ")
os.makedirs(googlenlq_dir, exist_ok=True)
# Be wary! Uncommenting the next line downloads the full dataset (about 41 GB).
#!gsutil -m cp -R gs://natural_questions/v1.0 $googlenlq_dir

# Collect every file matching the dev-split pattern. Sorting makes the choice
# deterministic, because os.listdir returns entries in arbitrary order.
googlenlq_candidates = sorted(fnmatch.filter(os.listdir(googlenlq_dir), "*dev-all.jsonl"))
if not googlenlq_candidates:
    # Fail with an actionable message instead of a NameError on the print below.
    raise FileNotFoundError(
        "No '*dev-all.jsonl' file found in " + googlenlq_dir
        + "; download the dataset first (see the gsutil command in this cell)."
    )
googlenlq_filename = os.path.join(googlenlq_dir, googlenlq_candidates[0])
print("Found googlenlq dataset at " + googlenlq_filename)

Found googlenlq dataset at data/GoogleNLQ/v1.0-simplified_nq-dev-all.jsonl


In [5]:
import pyspark
from pyspark.sql import SparkSession

# getOrCreate() reuses the SparkContext that is already running when this cell is
# executed again; calling the SparkContext() constructor a second time in the same
# kernel raises an error, which made the cell non-re-runnable.
sc = pyspark.SparkContext.getOrCreate()
# Session wrapper around the context; used below for reading JSON and running SQL.
spark = SparkSession(sc)

In [6]:
from pyspark.sql.functions import explode, expr

In [7]:
# Load the dataset file located earlier into a Spark DataFrame. No explicit schema is
# passed here, so the column structure comes from the file contents.
nlq_df = spark.read.json(googlenlq_filename)

In [8]:
# Print the column tree of nlq_df to see which fields are available for querying.
nlq_df.printSchema()

root
 |-- annotations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotation_id: decimal(20,0) (nullable = true)
 |    |    |-- long_answer: struct (nullable = true)
 |    |    |    |-- candidate_index: long (nullable = true)
 |    |    |    |-- end_byte: long (nullable = true)
 |    |    |    |-- end_token: long (nullable = true)
 |    |    |    |-- start_byte: long (nullable = true)
 |    |    |    |-- start_token: long (nullable = true)
 |    |    |-- short_answers: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- end_byte: long (nullable = true)
 |    |    |    |    |-- end_token: long (nullable = true)
 |    |    |    |    |-- start_byte: long (nullable = true)
 |    |    |    |    |-- start_token: long (nullable = true)
 |    |    |-- yes_no_answer: string (nullable = true)
 |-- document_html: string (nullable = true)
 |-- document_title: string (nullable = true)
 |-- document_to

In [9]:
# Register the DataFrame under the SQL name "NLQ" so it can be queried with spark.sql.
nlq_df.createOrReplaceTempView("NLQ")

# Preview query: page title and question text for the first 100 rows.
preview_query = "SELECT document_title, question_text from NLQ LIMIT 100"
res = spark.sql(preview_query)

In [10]:
# Display the query result as a text table; truncate=False prints each value in full
# instead of shortening long strings.
res.show(truncate=False)

+---------------------------------------------+-------------------------------------------------------------------------+
|document_title                               |question_text                                                            |
+---------------------------------------------+-------------------------------------------------------------------------+
|Therefore sign                               |what do the 3 dots mean in math                                          |
|Watchman (law enforcement)                   |when was the writ watch invented by who                                  |
|Photograph (Ringo Starr song)                |who wrote the song photograph by ringo starr                             |
|Super Bowl 50 halftime show                  |who is playing the halftime show at super bowl 2016                      |
|Matt Lanter                                  |star wars the clone wars anakin voice actor                              |
|A Whiter Shade of Pale 

In [11]:
# Alternative kept for reference (candidate index of each annotation's long answer):
#nlq_df.select(explode("annotations")).select("col.*").select("long_answer.candidate_index").limit(10).collect()

# One row per long-answer candidate, each paired with the question it belongs to.
# explode() puts the candidate struct in a column named "col".
exploded_candidates = nlq_df.select("question_text", explode("long_answer_candidates"))
# Keep only the token span of every candidate.
candidate_token_spans = exploded_candidates.select("question_text", "col.start_token", "col.end_token")
# Bring the first ten spans to the driver for inspection.
candidate_token_spans.limit(10).collect()
      

[Row(question_text='what do the 3 dots mean in math', start_token=14, end_token=808),
 Row(question_text='what do the 3 dots mean in math', start_token=15, end_token=20),
 Row(question_text='what do the 3 dots mean in math', start_token=20, end_token=26),
 Row(question_text='what do the 3 dots mean in math', start_token=26, end_token=197),
 Row(question_text='what do the 3 dots mean in math', start_token=28, end_token=195),
 Row(question_text='what do the 3 dots mean in math', start_token=29, end_token=34),
 Row(question_text='what do the 3 dots mean in math', start_token=34, end_token=43),
 Row(question_text='what do the 3 dots mean in math', start_token=43, end_token=58),
 Row(question_text='what do the 3 dots mean in math', start_token=58, end_token=66),
 Row(question_text='what do the 3 dots mean in math', start_token=66, end_token=76)]

In [12]:
# Build a DataFrame limited to a single row that holds only the question text.
ris = nlq_df.select("question_text").limit(1)

In [13]:
# Materialise the one-row DataFrame on the driver as a Python list of Row objects.
r = ris.collect()

In [14]:
# Convert the first collected Row to a dict and read its question text.
r[0].asDict()["question_text"]

'what do the 3 dots mean in math'

In [15]:
# Collect the questions whose text contains the phrase "how do you say".
# NOTE(review): fraction=1. asks sample() for 100% of the filtered rows and no seed
# is set -- confirm whether a smaller, seeded sample was intended.
how_do_you_say = nlq_df.question_text.contains("how do you say")
matching_questions = nlq_df.select("question_text").filter(how_do_you_say)
sampled_questions = matching_questions.sample(withReplacement=False, fraction=1.).collect()

In [16]:
# Show the collected Row objects for the "how do you say" questions.
print(sampled_questions)

[Row(question_text='how do you say evil eye in greek'), Row(question_text='how do you say zest for life in french')]


In [18]:
from tools.datasets import *

from SPARQLWrapper import SPARQLWrapper, JSON
# Point the project's wikidata_sparql helper at the public Wikidata SPARQL endpoint
# and request JSON-formatted results.
wikidata_sparql.sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
wikidata_sparql.sparql.setReturnFormat(JSON)

# Fetch up to 30 distinct English labels, ordered by ?num descending.
# NOTE(review): wd:Q34770 / wdt:P1098 are presumably "language" / "number of speakers",
# i.e. this should yield the most widely spoken languages -- confirm against Wikidata.
# NOTE(review): the return type of run_query is not visible here; a later cell indexes
# it with ["lang.value"] and uses .str, so it looks like a pandas DataFrame -- confirm
# in tools.datasets.
language_list = wikidata_sparql.run_query("""
SELECT DISTINCT ?lang

WHERE
{
    ?langEntity wdt:P31 wd:Q34770;
                  wdt:P1098 ?num.
    ?langEntity rdfs:label ?lang.
    FILTER(LANG(?lang) = "en").
}

ORDER BY DESC(?num)
LIMIT 30
""")

In [23]:
# Lower-case the language labels returned by the Wikidata query and keep them in a
# set for fast membership tests.
lowercased_labels = language_list["lang.value"].str.lower()
most_common_languages = set(lowercased_labels.array)

def contains(column, languages=None):
    """Tell whether a row's question mentions one of the known language names.

    Args:
        column: a row-like object exposing a ``question_text`` attribute
            (despite the name, ``rdd.filter`` passes a whole row, not a column).
        languages: optional iterable of lower-case language names to look for;
            defaults to the module-level ``most_common_languages``.

    Returns:
        True when any language name occurs in the question as a whole word or
        as a whole multi-word phrase, False otherwise (including when the
        question text is missing or empty).
    """
    if languages is None:
        languages = most_common_languages

    question = column.question_text
    if not question:
        # Null or empty question text: nothing to match (previously an AttributeError).
        return False

    tokens = question.lower().split()
    if not tokens:
        return False

    # Pad with spaces so that names only match on word boundaries; this keeps the
    # original whole-token behaviour for single-word names and additionally lets
    # multi-word names such as "mandarin chinese" match.
    haystack = " " + " ".join(tokens) + " "
    for name in languages:
        if not isinstance(name, str):
            # Skip non-string entries (e.g. missing labels) instead of crashing.
            continue
        needle = " ".join(name.split())
        if needle and (" " + needle + " ") in haystack:
            return True
    return False

# Drop to the RDD API so the plain-Python predicate contains() can be applied to each row.
question_rows = nlq_df.select("question_text").rdd
# Keep the rows for which contains() returns True and bring them to the driver.
question_rows.filter(contains).collect()

[Row(question_text='at one point during the first world war the french prime minister'),
 Row(question_text='who continues to exist according to the zhou chinese'),
 Row(question_text='which is not true about the chinese horse'),
 Row(question_text='what was the ancient chinese umbrella used for'),
 Row(question_text="what's the difference between peanuts and spanish peanuts"),
 Row(question_text='all tables are to be numbered using arabic numerals'),
 Row(question_text='why does one ok rock sing in english'),
 Row(question_text='who were the major leaders of the french and indian war'),
 Row(question_text='list of all spices in english and hindi'),
 Row(question_text='what document produced during the french revolution was ignored by the committee of public safety'),
 Row(question_text='what is the meaning of safiya in urdu'),
 Row(question_text='who translated the play neel darpan into english'),
 Row(question_text='what is the means of strange in hindi'),
 Row(question_text='english