### Requirements: a `wiki-pages` folder containing the FEVER Wikipedia data dump, and the `train.jsonl` file from the FEVER website

In [None]:
import numpy as np
import pandas as pd
import itertools
import re

In [None]:
# Load the FEVER training claims (one JSON object per line) and discard the
# "id" column straight away.
df = pd.read_json("train.jsonl", lines=True).drop(columns=["id"])

In [None]:
# Flag every claim whose first evidence group does not contain exactly one
# item. The whole column is built in one pass and assigned at once instead of
# writing one cell at a time through df.loc inside iterrows(), which is much
# slower and leaves the column missing altogether when the frame is empty.
df["skip"] = [len(evidence[0]) != 1 for evidence in df.evidence]

In [None]:
# Collapse each claim's nested evidence into a single flat list of items:
# flagged rows keep their entire first group, all other rows keep the leading
# item of every group.
df["evidence"] = pd.Series(
    [
        groups[0] if flagged == True else [group[0] for group in groups]
        for groups, flagged in zip(df.evidence, df.skip)
    ],
    index=df.index,
    dtype=object,
)

In [None]:
def simple_evidence(evi):
    """Strip the first two fields from every evidence record.

    Args:
        evi: Iterable of sliceable evidence records.

    Returns:
        A new list holding ``record[2:]`` for each record, in input order.
    """
    return [record[2:] for record in evi]


# Store the trimmed evidence under "evi", then remove the raw "evidence"
# column together with the temporary "skip" flag.
df["evi"] = df["evidence"].apply(simple_evidence)
df = df.drop(columns=["evidence", "skip"])

In [None]:
# Over every claim that is not labelled "NOT ENOUGH INFO", collect:
#   evidence_set    - the set of referenced page ids (first evidence field)
#   evi_number_set  - per stringified page id, the set of referenced sentence
#                     numbers (second evidence field, converted to int)
evidence_set = set()
evi_number_set = {}
for i, row in df.iterrows():
    if row.label == "NOT ENOUGH INFO":
        continue
    for e in row.evi:
        evidence_set.add(e[0])
        # Look up and insert under the same stringified key. Testing the raw
        # e[0] for membership while inserting under its string form would
        # overwrite the set on every occurrence of a non-string id.
        evi_number_set.setdefault(f"{e[0]}", set()).add(int(f"{e[1]}"))

In [None]:
# Number of distinct page ids referenced as evidence.
len(evidence_set)

In [None]:
# Persist the simplified claims as JSON Lines (one record per line).
file = "simplified.jsonl"
out = df.to_json(orient='records', lines=True)
with open(file, mode='w') as f:
    f.write(out)

In [None]:
# Read the simplified claims back from simplified.jsonl into df.
df = pd.read_json("simplified.jsonl", lines=True)

In [None]:
# Display the simplified claims table.
df

In [None]:
def convert_to_sentences(line):
    """Extract cleaned sentence texts from a newline/tab-delimited string.

    ``line`` is split on newlines and whatever follows the final newline is
    ignored. Within each remaining entry only the second tab-separated field
    is used as the sentence text. Runs of characters other than ASCII
    letters, digits and spaces are replaced by a space, and whitespace is
    then collapsed to single spaces with no leading or trailing space.

    Args:
        line: The raw multi-line string to split.

    Returns:
        A list with one cleaned sentence per entry, in order. An entry that
        has no second tab-separated field contributes an empty string, so
        the positions of the other sentences are not shifted.
    """
    sentences = []
    for entry in line.split("\n")[:-1]:
        fields = entry.split("\t")
        # Entries without a tab have no sentence field; use a placeholder
        # instead of raising IndexError.
        raw = fields[1] if len(fields) > 1 else ""
        cleaned = re.sub(r'[^A-Za-z0-9 ]+', ' ', raw)
        sentences.append(" ".join(cleaned.split()))
    return sentences

def match_start(data, evidence_ids=None):
    """Select the evidence ids that share a first character with any id in ``data``.

    Args:
        data: Iterable of id strings. Empty ids are ignored, since they have
            no first character.
        evidence_ids: Optional iterable of candidate evidence ids. Defaults
            to the module-level ``evidence_set``.

    Returns:
        The set of candidate ids whose first character equals the first
        character of at least one id in ``data``. Empty or ``None``
        candidates are never returned.
    """
    if evidence_ids is None:
        evidence_ids = evidence_set
    initials = {page_id[0] for page_id in data if page_id}
    return {e for e in evidence_ids if e and e[0] in initials}

def remove_foreign_ids(line):
    """Blank out a string that contains no ASCII letter, digit or space.

    Args:
        line: The string to inspect.

    Returns:
        ``line`` unchanged when at least one ASCII letter, digit or space is
        present in it; otherwise the empty string.
    """
    kept = re.sub(r'[^A-Za-z0-9 ]+', '', line)
    return line if kept != "" else ""

def remove_foreign_words(line):
    """Delete every character that is not an ASCII letter, digit or space.

    Args:
        line: The string to clean.

    Returns:
        ``line`` with all other characters removed (nothing is inserted in
        their place).
    """
    disallowed = re.compile(r'[^A-Za-z0-9 ]+')
    return disallowed.sub('', line)

In [None]:
# Build simplified_wiki.jsonl from the wiki shards wiki-001 ... wiki-109.
# From every shard, keep all pages whose id is returned by match_start() plus
# a random sample of up to 110 of the remaining pages, then append them to the
# output file.
# NOTE: the output is opened in append mode, so re-running this cell without
# deleting simplified_wiki.jsonl first duplicates its contents.
file = "simplified_wiki.jsonl"
for i in range(1, 110):
    page = f"wiki-{i:03d}"  # zero-padded shard name, e.g. wiki-007
    print(page)
    data = pd.read_json(f"wiki-pages/{page}.jsonl", lines=True)

    # The first record of the first shard is left out of the id matching
    # (presumably because its id is empty -- TODO confirm against the dump).
    matching_ids = match_start(data.id[1:] if i == 1 else data.id)

    is_match = data.id.isin(matching_ids)
    matching_data = data[is_match]
    # .copy() so the column edits below act on an independent frame rather
    # than on a filtered slice of the shard.
    data = data[~is_match].copy()

    # Drop pages whose id is blanked out by remove_foreign_ids().
    data["ids"] = data.id.apply(remove_foreign_ids)
    data = data[~(data.ids == "")]
    data = data.drop(["id"], axis="columns")
    data = data.rename(columns={"ids": "id"})
    # Cap the sample size: sampling more rows than exist raises ValueError.
    data = data.sample(n=min(110, len(data)))

    data = pd.concat([matching_data, data], ignore_index=True)
    data["sentences"] = data.lines.apply(convert_to_sentences)
    data["text1"] = data.text.apply(remove_foreign_words)
    data = data.drop(["lines", "text"], axis="columns")
    data = data.rename(columns={"text1": "text"})

    out1 = data.to_json(orient='records', lines=True)
    # Each appended chunk must end with a newline, otherwise the last record
    # of this shard and the first record of the next share one line.
    if out1 and not out1.endswith("\n"):
        out1 += "\n"
    with open(file, 'a') as f:
        f.write(out1)

In [None]:
# Load the collected wiki pages, order them by page id and renumber the index
# from zero.
data = (
    pd.read_json("simplified_wiki.jsonl", lines=True)
    .sort_values("id")
    .reset_index(drop=True)
)

In [None]:
# Peek at the cleaned text of the first three pages.
data.text[0:3]

In [None]:
# Peek at the sentence lists of the first three pages.
data.sentences[0:3]

In [None]:
# Distinct page ids present in the simplified wiki data, and their count.
s = set(data.id.unique())
len(s)

In [None]:
# Page ids that are both referenced as evidence and present in the wiki data.
common = s.intersection(evidence_set)
len(common)

In [None]:
# Split the referenced page ids into those missing from the wiki data
# (unmatched_evidence) and those present (available_evidence, which ends up
# equal to `common`).
unmatched_evidence = evidence_set - common
available_evidence = evidence_set - unmatched_evidence
len(available_evidence)

In [None]:
# Display the referenced page ids that were not found in the wiki data.
unmatched_evidence

In [None]:
def populate_evidences(evi, sentences_by_id=None):
    """Attach the sentence text to every evidence reference that can be resolved.

    Args:
        evi: Iterable of ``[page_id, sentence_index]`` pairs.
        sentences_by_id: Optional mapping from page id to that page's list of
            sentences. When omitted, each page is looked up in the
            module-level ``data`` frame and the first row whose ``id``
            matches is used. Passing a precomputed mapping avoids scanning
            the frame for every reference.

    Returns:
        A list of ``[page_id, sentence_index, sentence]`` triples in input
        order. References whose page is unknown, or whose sentence index lies
        beyond the page's sentence list, are left out.
    """
    resolved = []
    for e in evi:
        page_id, sentence_index = e[0], e[1]
        if sentences_by_id is not None:
            sentences = sentences_by_id.get(page_id)
        else:
            hits = data.loc[data.id == page_id, "sentences"]
            sentences = hits.values[0] if len(hits) else None
        if sentences is None:
            continue
        try:
            sentence = sentences[sentence_index]
        except IndexError:
            # The reference points past the available sentences; skip it
            # rather than aborting the whole run.
            continue
        resolved.append([page_id, sentence_index, sentence])
    return resolved

In [None]:
# For every claim, resolve its evidence references to sentence text and store
# the result in a new "evidences" column.
df["evidences"] = df.evi.apply(populate_evidences)

In [None]:
# Display the populated evidences column.
df.evidences

In [None]:
# Claims that are not labelled NOT ENOUGH INFO yet ended up with an empty
# evidences list; show which labels they carry.
df1 = df[(df['evidences'].str.len() == 0) & (df['label'] != "NOT ENOUGH INFO")]
df1.label.unique()

In [None]:
# Number of such claims.
len(df1.label.values)

In [None]:
# Keep a claim when its evidences list is not empty or when it is labelled
# NOT ENOUGH INFO; everything else is dropped.
has_no_evidence = df['evidences'].str.len() == 0
is_not_enough_info = df['label'] == "NOT ENOUGH INFO"
populated = df[~has_no_evidence | is_not_enough_info]

In [None]:
# Remove the helper columns and renumber the remaining rows from zero.
populated = (
    populated
    .drop(columns=["evi", "verifiable"])
    .reset_index(drop=True)
)

In [None]:
# Display the final table of populated claims.
populated

In [None]:
# Persist the populated claims as JSON Lines (one record per line).
out = populated.to_json(orient='records', lines=True)
file = "populated_samples.jsonl"
with open(file, mode='w') as f:
    f.write(out)

In [None]:
## test
# Read populated_samples.jsonl back in and display it to check the file that
# was just written.
populated = pd.read_json("populated_samples.jsonl", lines=True)
populated

In [None]:
# Inspect the evidences of the first reloaded claim.
populated.evidences[0]