# Pipeline 1

In [None]:
from pipeline import Falcon2Linker, SerialAnnotator, T5Converter
import time
from pprint import pprint

In [None]:
# Build the three pipeline stages once; the cells below reuse these instances.
linker, annotator, converter = Falcon2Linker(), SerialAnnotator(), T5Converter()

In [None]:
# Sample question fed through the stages in the demo cell that follows.
utterance = "Who is the wife of Barack Obama"

In [None]:
# Run the sample utterance through the three stages one at a time and
# pretty-print each intermediate result.
linked = linker.link(utterance)
# Alternative linking call, left disabled:
# rules = [1,2,3,4,5,8,9,10,12,13,14]
# linked = process_text_E_R(utterance, rules)
pprint("Linked")
pprint(linked)

pprint("Annotated")
# The linker output is unpacked as keyword arguments for the annotator.
annotated = annotator.annotate(**linked)
pprint(annotated)

pprint("Converted")
# NOTE(review): this cell calls converter.preprocess_inputs, whereas pipe()
# and pipe_batch() call converter.preprocess(..., wikisparql=...) — confirm
# the difference is intended.
converted = converter.preprocess_inputs(**annotated)
pprint(converted)

In [None]:
def pipe(utterance, wikisparql):
    """Run a single utterance through link -> annotate -> convert.

    The output of each stage is unpacked as keyword arguments into the next
    one; ``wikisparql`` is forwarded to the converter as an extra keyword.

    Returns:
        A ``(linked, annotated, converted)`` tuple holding each stage's output.
    """
    link_result = linker.link(utterance)
    annotation = annotator.annotate(**link_result)
    conversion = converter.preprocess(**annotation, wikisparql=wikisparql)
    return link_result, annotation, conversion

In [None]:
def pipe_batch(utterances, wikisparqls):
    """Link a batch of utterances, then annotate and convert each result.

    Linking is done in one ``linker.link_batch`` call; annotation and
    conversion run per item. An item whose annotation or conversion raises is
    reported on stdout and skipped, so one bad item does not abort the batch.
    Wall-clock time spent in each stage is printed before returning.

    Args:
        utterances: Questions passed to ``linker.link_batch``.
        wikisparqls: Queries indexed by position in the linker output.
            NOTE(review): this assumes ``link_batch`` returns one result per
            utterance, in input order — confirm against the linker.

    Returns:
        A list of ``[linked, annotated, converted]`` lists, one per item that
        passed both stages.
    """
    batched = []

    link_batch_start = time.time()
    linked = linker.link_batch(utterances)
    link_batch_end = time.time()

    annotator_time = 0
    converter_time = 0
    for i, single_linked in enumerate(linked):
        started = time.time()
        try:
            # The call itself must sit inside the try; otherwise the handler
            # below can never fire and a failure propagates to the caller.
            annotated = annotator.annotate(**single_linked)
        except Exception as err:
            print("[Annotator Error]:", err)
            continue
        annotator_time += time.time() - started

        started = time.time()
        try:
            converted = converter.preprocess(**annotated, wikisparql=wikisparqls[i])
        except Exception as err:
            print("[Converter Error]:", err)
            continue
        converter_time += time.time() - started

        batched.append([single_linked, annotated, converted])

    print("Link batch time:", link_batch_end - link_batch_start)
    print("Anno batch time:", annotator_time)
    print("Conv batch time:", converter_time)
    return batched

## Data

In [None]:
from pathlib import Path
import pandas as pd
import json

# Path to train.json under ../t5-for-sparql/data/lcquad2; the leading ".."
# is resolved against the current working directory.
data_path = Path("..") / "t5-for-sparql" / "data" / "lcquad2" / "train.json"

In [None]:
# Load the JSON records from data_path.
with open(data_path) as f:
  data_dict = json.load(f)

# Add a "new_question" field to every record: the paraphrase when it is
# longer than two characters/elements, otherwise the original question.
for item in data_dict:
  paraphrased_question = item["paraphrased_question"]
  question = item["question"]
  if len(paraphrased_question) > 2:
    item["new_question"] = paraphrased_question
  else:
    item["new_question"] = question

# Tabular view of the records, previewed as the cell output.
df = pd.DataFrame.from_dict(data_dict)
df.head()

In [None]:
# Drive the whole dataset through pipe_batch in chunks of 50 records.
# After every chunk, the results accumulated so far are written to a
# checkpoint file named after the index of the last record seen.
responses = []
batch_qns = []
batch_ans = []
truncated_data = data_dict
total_len = len(truncated_data)
last = 0  # index of the first record in the chunk currently being collected
for i, data in enumerate(truncated_data):
    print("Pipeline iter", i)
    # NOTE(review): this reads "question", not the "new_question" field added
    # when the data was loaded — confirm which one is meant to be linked.
    question = data["question"]
    answer = data["sparql_wikidata"]
    # Questions shorter than two characters are left out of the batch.
    if len(question) >= 2:
        batch_qns.append(question)
        batch_ans.append(answer)
    # Flush on every 50th record and on the final record.
    if ((i + 1) % 50) == 0 or i == total_len - 1:
        print("[Pipeline1]:", f"Linking {last}-{i}")
        # Errors from pipe_batch propagate unchanged; no wrapper is needed.
        responses.extend(pipe_batch(batch_qns, batch_ans))
        batch_qns = []
        batch_ans = []
        # The next chunk starts after the record just processed, so the
        # logged ranges no longer overlap by one.
        last = i + 1
        with open(f"../t5-for-sparql/falcon_links/2/link_{i}.json", "w") as f:
            json.dump(responses, f, indent=2, separators=(',',':'))

In [None]:
# qns = ['What is the operating income for Qantas?', 'What is Mary Lou Rettons International Olympic Committee athlete ID.']
# l = linker.link_batch(qns)
# print(l)

In [None]:
# Write every collected response to home.json in the working directory.
with open("home.json", "w") as out_file:
  json.dump(responses, out_file, indent=2, separators=(',', ': '))