# Pipeline 1

In [1]:
from pipeline import Falcon2Linker, SerialAnnotator, T5Converter
import time
from pprint import pprint

main_virtuoso_noreranking


In [2]:
# Instantiate the three pipeline stages once; the cells and helper functions
# further down (`pipe`, `pipe_batch`) use these module-level objects.
linker = Falcon2Linker()
annotator = SerialAnnotator()
converter = T5Converter()

In [3]:
# Sample question used by the single-utterance walkthrough cell that follows.
utterance = "Who is the wife of Barack Obama"

In [4]:
# Walk the sample utterance through each pipeline stage and show every
# intermediate result. Stage labels use print() rather than pprint(), because
# pprint() on a plain string emits its repr (quotes included).
# Disabled alternative linking call kept for reference:
#   process_text_E_R(utterance, rules) with rules = [1,2,3,4,5,8,9,10,12,13,14]
linked = linker.link(utterance)
print("Linked")
pprint(linked)

# The linker result is unpacked as keyword arguments for the annotator.
print("Annotated")
annotated = annotator.annotate(**linked)
pprint(annotated)

# The annotator result is likewise unpacked as keyword arguments for the converter.
print("Converted")
converted = converter.preprocess_inputs(**annotated)
pprint(converted)

recieved utterance
0 : ['Who is the wife of Barack Obama']
['Who is the wife of Barack Obama', [['<http://www.wikidata.org/entity/P26>', 'wife'], ['<http://www.wikidata.org/entity/P2848>', 'wife'], ['<http://www.wikidata.org/entity/P140>', 'wife'], ['<http://www.wikidata.org/entity/P451>', 'wife'], ['<http://www.wikidata.org/entity/P600>', 'wife']], [['<http://www.wikidata.org/entity/Q76>', 'Barack obama'], ['<http://www.wikidata.org/entity/Q649593>', 'Barack obama'], ['<http://www.wikidata.org/entity/Q4808526>', 'Barack obama'], ['<http://www.wikidata.org/entity/Q4858106>', 'Barack obama'], ['<http://www.wikidata.org/entity/Q643049>', 'Barack obama']], 0, 0, 0, 0]
'Linked'
{'ents': [{'id': 'Q76',
           'prefix': 'wd:',
           'uri': 'http://www.wikidata.org/entity/Q76'},
          {'id': 'Q649593',
           'prefix': 'wd:',
           'uri': 'http://www.wikidata.org/entity/Q649593'},
          {'id': 'Q4808526',
           'prefix': 'wd:',
           'uri': 'http://www.wiki

In [4]:
def pipe(utterance, wikisparql):
    """Run one utterance through linking, annotation and conversion.

    Args:
        utterance: Passed to ``linker.link``.
        wikisparql: Forwarded to ``converter.preprocess`` as the ``wikisparql``
            keyword argument.

    Returns:
        A ``(linked, annotated, converted)`` tuple holding the result of each
        stage, in pipeline order.
    """
    link_result = linker.link(utterance)
    # Each stage's result is splatted as keyword arguments into the next stage.
    annotation = annotator.annotate(**link_result)
    conversion = converter.preprocess(**annotation, wikisparql=wikisparql)
    return link_result, annotation, conversion

In [5]:
def pipe_batch(utterances, wikisparqls):
    """Run a batch of utterances through the pipeline and print stage timings.

    Linking and annotation are performed as batch calls; conversion is done
    item by item so that a failure on one item does not abort the batch.

    Args:
        utterances: Passed to ``linker.link_batch``.
        wikisparqls: Indexable collection; element ``i`` is forwarded to
            ``converter.preprocess`` as ``wikisparql`` for the ``i``-th
            annotated item.

    Returns:
        A list of ``[linked, annotated, converted]`` triples, one per item
        whose conversion succeeded. Items whose conversion raised are reported
        on stdout and left out.

    Raises:
        AssertionError: If the annotator returns a different number of items
            than the linker produced.
    """
    link_started = time.time()
    linked = linker.link_batch(utterances)
    link_elapsed = time.time() - link_started

    anno_started = time.time()
    annotated = annotator.batch_annotate(linked)
    anno_finished = time.time()
    assert len(linked) == len(annotated)
    annotator_time = anno_finished - anno_started

    converter_time = 0
    batched = []
    for idx, single_annotated in enumerate(annotated):
        conv_started = time.time()
        try:
            converted = converter.preprocess(**single_annotated, wikisparql=wikisparqls[idx])
        except Exception as err:
            # Best-effort: report the failure and move on to the next item.
            print("[Converter Error]:", err)
            continue

        # Only successful conversions contribute to the converter timing.
        converter_time += time.time() - conv_started
        batched.append([linked[idx], single_annotated, converted])

    print("Link batch time:", link_elapsed)
    print("Anno batch time:", annotator_time)
    print("Conv batch time:", converter_time)
    return batched

## Data

In [6]:
from pathlib import Path
import pandas as pd
import json

# Training data file, resolved relative to the working directory:
# ../t5-for-sparql/data/lcquad2/train.json
data_path = Path("..") / "t5-for-sparql" / "data" / "lcquad2" / "train.json"

In [22]:
from pprint import pprint
# A paraphrased question is only used when its length lies strictly above the
# minimum and at or below the maximum; otherwise the original question is used.
MIN_PARAPHRASE_LEN = 2
MAX_PARAPHRASE_LEN = 2000

with open(data_path, encoding="utf-8") as f:
    data_dict = json.load(f)

count = 0  # number of entries whose original `question` is missing

for item in data_dict:
    paraphrased_question = item["paraphrased_question"]
    question = item["question"]
    if paraphrased_question is None:
        raise ValueError(f"Entry uid={item.get('uid')!r} has no paraphrased_question")
    if question is None:
        count += 1
    if item["sparql_wikidata"] is None:
        raise ValueError(f"Entry uid={item.get('uid')!r} has no sparql_wikidata")

    # `paraphrased_question` may also be an empty list in the data; the lower
    # bound rejects that case too. When the paraphrase is unusable and the
    # original question is missing, `new_question` becomes None and the row is
    # removed by the dropna() below instead of crashing on len(None).
    paraphrase_usable = MIN_PARAPHRASE_LEN < len(paraphrased_question) <= MAX_PARAPHRASE_LEN
    item["new_question"] = paraphrased_question if paraphrase_usable else question

print("Missing question count:", count)

df = pd.DataFrame.from_dict(data_dict)
# Drop rows without any usable question text ...
df = df.dropna(axis=0, subset=['new_question'])
# ... and rows whose original question is a placeholder or missing.
df = df[~df.question.isin(['n/a', None, 'none', 'None'])]
n_entries = len(df)
print("Number of rows:", n_entries)

Missing question count: 56
Number of rows: 24067


In [27]:
# Sanity check: after cleaning, every remaining row has an original question.
assert df["question"].notna().all()

Unnamed: 0,NNQT_question,uid,subgraph,template_index,question,sparql_wikidata,sparql_dbpedia18,template,answer,template_id,paraphrased_question,new_question


In [11]:
# Show rows at positions 35-44 (all columns) for a manual inspection.
df.iloc[35:45]

Unnamed: 0,NNQT_question,uid,subgraph,template_index,question,sparql_wikidata,sparql_dbpedia18,template,answer,template_id,paraphrased_question,new_question
35,What is the {human} for {student of} of {Ivan ...,20145,simple question right,491,Who is the person that was a student of Ivan P...,select distinct ?obj where { wd:Q42985 wdt:P1...,select distinct ?obj where { ?statement <http:...,<S P ?O ; ?O instanceOf Type>,[],1,Who is the person Ivan Pavlov's student?,Who is the person Ivan Pavlov's student?
36,Does the {clock speed} of the {Watara Supervis...,18264,boolean with filter,282,is the clock speed of the Watara Supervision s...,ASK WHERE { wd:Q732683 wdt:P2149 ?obj filter(?...,ASK { ?statement1 <http://www.w3.org/1999/02/2...,ASK ?sbj ?pred ?obj filter ?obj = num,[],3,is the clock speed of the Watara Supervision l...,is the clock speed of the Watara Supervision l...
37,What is the {sovereign state} for {diplomatic ...,19896,simple question right,242,What country is the current leader of the Afri...,select distinct ?obj where { wd:Q7159 wdt:P53...,select distinct ?obj where { ?statement <http:...,<S P ?O ; ?O instanceOf Type>,[],1,What country leads the African Union?,What country leads the African Union?
38,What is {works for} of {accused} of {Mariposa ...,16037,left-subgraph,22,Who works for the accused Mariposa Folk Festiv...,SELECT ?answer WHERE { wd:Q54554872 wdt:P710 ?...,SELECT ?answer WHERE { ?statement1 <http://www...,C RCD xD . xD RDE ?E,[],5,The accused Mariposa Folk Festival in 1974 emp...,The accused Mariposa Folk Festival in 1974 emp...
39,What is the {neighborhood} for {shares border ...,22651,simple question right,3712,,select distinct ?obj where { wd:Q65 wdt:P47 ?...,select distinct ?obj where { ?statement <http:...,<S P ?O ; ?O instanceOf Type>,[],1,N / A N / A,N / A N / A
40,What is {sister city} of {born in} of {Zakhar ...,15426,left-subgraph,29905,What sister city was born in of Zakhar Oskotsky?,SELECT ?answer WHERE { wd:Q4338004 wdt:P19 ?X ...,SELECT ?answer WHERE { ?statement1 <http://www...,C RCD xD . xD RDE ?E,[],5,Which sister city in Zakhar Oskotsky was born?,Which sister city in Zakhar Oskotsky was born?
41,Give me {fantastique genre} that starts with {...,24275,string matching simple contains word,1045,Tell me the name of a fantastique genre that s...,SELECT DISTINCT ?sbj ?sbj_label WHERE { ?sbj w...,SELECT DISTINCT ?sbj ?sbj_label { ?statement1 ...,<?S P O ; ?S instanceOf Type ; starts with ch...,[],2,Let me know the title of a fantastique sort th...,Let me know the title of a fantastique sort th...
42,What is the {medal} for {award received} of {A...,29959,simple question right,1550,What is the medal Angela Lansbury recieved?,select distinct ?obj where { wd:Q206856 wdt:P...,select distinct ?obj where { ?statement <http:...,<S P ?O ; ?O instanceOf Type>,[],1,[],What is the medal Angela Lansbury recieved?
43,What is {musical score by} of {Missa Solemnis}...,12032,right-subgraph,178,What is the musical score by Missa Solemnis th...,SELECT ?answer WHERE { wd:Q723790 wdt:P86 ?ans...,SELECT ?answer WHERE { ?statement1 <http://www...,E REF ?F . ?F RFG G,[],1,What is the musical rating by means of Missa S...,What is the musical rating by means of Missa S...
44,what is the {point in time} for {Robert De Nir...,6819,statement_property,3264,When did Robert De Niro reside in Marbletown?,SELECT ?value WHERE { wd:Q36949 p:P551 ?s . ?s...,select distinct ?value where {\n?statement <h...,(E pred ?Obj ) prop value,[],statement_property_1,When did Robert De Nirolive in Marbletown?,When did Robert De Nirolive in Marbletown?


In [32]:
# Print the `new_question` text of the first five rows.
for preview_question in df["new_question"].head(5):
    print(preview_question)

What is Delta Air Line's periodical literature mouthpiece?
What is the name of Ranavalona I's husband's child?
Are Jeff Bridges and Lane Chandler both photographers?
What range are the papers at the Monique Genonceaux about?
Which is the operating income for Qantas?


In [None]:
# Number of dataframe rows consumed between two pipeline batch runs.
BATCH_SIZE = 50
# Directory receiving one checkpoint file per processed batch; created up
# front so the first write does not fail on a fresh checkout.
checkpoint_dir = Path("..") / "t5-for-sparql" / "falcon_links" / "2"
checkpoint_dir.mkdir(parents=True, exist_ok=True)

responses = []  # accumulated [linked, annotated, converted] triples
batch_qns = []
batch_ans = []
truncated_data = df[:]
total_len = len(truncated_data)
last = 0
cutoff = 0  # rows at positions below this value are skipped
for i, (_, row) in enumerate(truncated_data.iterrows()):
    if i < cutoff:
        continue
    question = row["new_question"]
    answer = row["sparql_wikidata"]
    # Only queue rows that have both a non-trivial question and a query.
    if (question and answer) and len(question) >= 2:
        batch_qns.append(question)
        batch_ans.append(answer)
    # Flush every BATCH_SIZE rows and once more on the final row.
    if ((i + 1) % BATCH_SIZE) == 0 or i == total_len - 1:
        print("[Pipeline1]:", f"Linking {last}-{i}")
        # Errors from pipe_batch propagate unchanged and stop the run.
        responses.extend(pipe_batch(batch_qns, batch_ans))
        batch_qns = []
        batch_ans = []
        last = i
        # Each checkpoint holds everything collected so far, not only the
        # latest batch.
        with open(checkpoint_dir / f"link_{i}.json", "w") as f:
            json.dump(responses, f, indent=2, separators=(',', ':'))

[reRank_relations]: wHAT PROFESSIONAL DEGREE  CONTAINS THE WORD MASTER IN THE NAME 
SPARQL Requests Made: 277 
SPARQL Requests Total Time: 200.77274179458618 
SPARQL Avg Time per Qn: 0.7248113422187227


In [None]:
# qns = ['What is the operating income for Qantas?', 'What is Mary Lou Rettons International Olympic Committee athlete ID.']
# l = linker.link_batch(qns)
# print(l)

In [None]:
# Write all collected pipeline results to a single JSON file in the working directory.
with open("home.json", mode="w") as out_file:
    json.dump(responses, fp=out_file, indent=2, separators=(',', ': '))