In [None]:
import pandas as pd 
from ast import literal_eval
import json 
from string import Template
import random 
import tqdm.notebook as tqdm 

sent_templates = json.loads(open("src/resources/sentence_templates.json").read())
kb = pd.read_csv("data/generated_knowledge/all_extracted_knowledge.csv")
kb = kb.query("causal_system != 'error' or knowledge != 'error' ")
kb["num_relations"] = kb["knowledge"].apply(lambda x: len(literal_eval(x)))
kb = kb.query("num_relations > 0")

In [None]:
# Linearize triples to generate a description
updated_rows = []

for i, row in tqdm.tqdm(kb.iterrows(), total=len(kb)):

    row = row.to_dict()

    triples = literal_eval(row["knowledge"])
    sents = []
    for triple in triples:
        triple = [k.strip() for k in triple.split(";")]
        
        if triple[1] not in sent_templates:
            continue

        template = Template(random.choice(sent_templates[triple[1]]))
        generated_sent = template.substitute({"head":triple[0], "tail": triple[2] })
        sents.append(generated_sent)
    desc = f"The causal system of {row['causal_system']}. {' '.join(sents)}"
    
    row["paragraph"] = desc
    updated_rows.append(row)    

df = pd.DataFrame(updated_rows)

In [None]:
# Generate MC questions
all_facts = []
for i, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    triples = literal_eval(row["knowledge"])
    for triple in triples:
        triple = [k.strip() for k in triple.split(";")]
        try:
            all_facts.append({"head": triple[0], "tail": triple[2], "relation": triple[1]})
        except:
            continue

facts = pd.DataFrame(all_facts)

In [None]:
facts = facts.drop_duplicates()
entities = list(set(facts["head"].values) | set(facts["tail"].values))
len(entities)

In [None]:
import json 
import random 
from string import Template

question_templates = json.loads(open("src/resources/synthetic_qa_templates.json").read())

synthetic_set = []
for i, row in tqdm.tqdm(df.iterrows(), total=len(df)):

    triples = literal_eval(row["knowledge"])
    for triple in triples:
        try:
            triple = [k.strip() for k in triple.split(";")]
            head, relation, tail = triple[0], triple[1], triple[2]
        except:
            continue

        if triple[1] not in question_templates:
            continue

        mask_entity = random.choice(["head", "tail"])
        answer_options = {"head": head, "tail": tail}
        question_template = Template(random.choice(question_templates[triple[1]][mask_entity]))

        question = question_template.substitute({"head": head, "tail": tail})
        options = random.sample(set(entities).difference(answer_options[mask_entity]), 3)
        masked_paragraph = row["paragraph"].replace(answer_options[mask_entity], f"[MASK]")

        correct_answer = random.choice(["a", "b", "c", "d"])
        if correct_answer == "a":
            output = f"a) {answer_options[mask_entity]}"
            label = "a"
            options = f"a) {answer_options[mask_entity]} b) {options[0]} c) {options[1]} d) {options[2]}"
        elif correct_answer == "b":
            output = f"b) {answer_options[mask_entity]}"
            label = "b"
            options = f"a) {options[0]} b) {answer_options[mask_entity]} c) {options[1]} d) {options[2]}"
        elif correct_answer == "c":
            output = f"c) {answer_options[mask_entity]}"
            label = "c"
            options = f"a) {options[0]} b) {options[1]} c) {answer_options[mask_entity]} d) {options[2]}"
        elif correct_answer == "d":
            output = f"d) {answer_options[mask_entity]}"
            label = "d"
            options = f"a) {options[0]} b) {options[1]} c) {options[2]} d) {answer_options[mask_entity]}"

        input = f"question: {question} options: {options} context: {masked_paragraph}"

        synthetic_set.append(
            {
                "uid": row['uid'],
                "input": input,
                "output": output,
                "label": label,
                "question": question,
                "options": options,
            }
        )


In [None]:
qa_df = pd.DataFrame(synthetic_set)

In [None]:
for i, r in qa_df.sample(5).iterrows():
    print(r["input"])
    print(r["output"])
    print(r["label"])
    print("----------")

In [None]:
qa_df.to_csv("data/generated_knowledge/synthetic_qa.csv", index=False)

In [None]:
import pandas as pd 
df = pd.read_csv("data/generated_knowledge/synthetic_qa.csv")

In [None]:
df

In [None]:
from datasets import Dataset

ds = Dataset.from_pandas(df)
ds  = ds.train_test_split(test_size=0.10, seed=42, shuffle=True)
ds.save_to_disk("data/pretraining/synthetic_qa")

# Prepare external for synthetic qa

In [7]:
import pandas as pd 
import datasets 

sent_templates = {
    "cause-effect": [
        "$head can lead to $tail.",
        "sometimes $head can result in $tail.",
        "$head may cause $tail.",
        "$tail can sometimes be a consequence of $head.",
        "due to $head, $tail can occur."
    ]
}

data = datasets.load_from_disk("data/pretraining/external_kb")

train = data["train"].to_pandas()
test = data["test"].to_pandas()


df = pd.concat([train, test]).dropna().drop_duplicates()
entities = list(set(df["head"].tolist() + df["tail"].tolist()))

In [9]:
import random 
import tqdm.notebook as tqdm

sqa_input, sqa_label, sqa_output = [], [], []

for i,row in tqdm.tqdm(df.iterrows(), total=len(df)):
    input = row["clean_input"]
    
    concepts = [row["head"], row["tail"]]
    concept_candidate = random.choice(concepts)
    
    valid_entites = list(set(entities).difference(concepts))

    options = random.sample(valid_entites, 3)
    correct_answer = random.choice(["a", "b", "c", "d"])

    if correct_answer == "a":
        output = f"a) {concept_candidate}"
        label = "a)"
        options = f"a) {concept_candidate} b) {options[0]} c) {options[1]} d) {options[2]}"
    elif correct_answer == "b":
        output = f"b) {concept_candidate}"
        label = "b)"
        options = f"a) {options[0]} b) {concept_candidate} c) {options[1]} d) {options[2]}"
    elif correct_answer == "c":
        output = f"c) {concept_candidate}"
        label = "c)"
        options = f"a) {options[0]} b) {options[1]} c) {concept_candidate} d) {options[2]}"
    elif correct_answer == "d":
        output = f"d) {concept_candidate}"
        label = "d)"
        options = f"a) {options[0]} b) {options[1]} c) {options[2]} d) {concept_candidate}"
    
    
    question_text = input.replace(concept_candidate, "[MASKED]")
    if concept_candidate == row["head"]:
        question_text += " What is the most plausible cause?"
    else:
        question_text += " What is the most plausible effect?"
        
    
    final_input = f"question: {question_text} options: {options}"
    
    sqa_input.append(input)
    sqa_label.append(label)
    sqa_output.append(output)

  0%|          | 0/208081 [00:00<?, ?it/s]

In [10]:
df["sqa_input"] = sqa_input
df["sqa_label"] = sqa_label
df["sqa_output"] = sqa_output

In [11]:
from datasets import Dataset

ds = Dataset.from_pandas(df)
ds  = ds.train_test_split(test_size=0.10, seed=42, shuffle=True)

In [15]:
ds = ds.remove_columns("Unnamed: 0")

In [16]:
ds

DatasetDict({
    train: Dataset({
        features: ['head', 'relation', 'tail', 'source', 'sentence', 'clean_input', 'mlm_input', 'mlm_label', 'concept_input', 'concept_label', 'sqa_input', 'sqa_label', 'sqa_output', '__index_level_0__'],
        num_rows: 187272
    })
    test: Dataset({
        features: ['head', 'relation', 'tail', 'source', 'sentence', 'clean_input', 'mlm_input', 'mlm_label', 'concept_input', 'concept_label', 'sqa_input', 'sqa_label', 'sqa_output', '__index_level_0__'],
        num_rows: 20809
    })
})

In [None]:
ds.save_to_disk("data/pretraining/external_kb")