In [5]:
!tail -2 /local/fever-common/data/fever-data/train.jsonl

{"id": 13114, "verifiable": "VERIFIABLE", "label": "SUPPORTS", "claim": "J. R. R. Tolkien created Gimli.", "evidence": [[[28359, 34669, "Gimli_-LRB-Middle-earth-RRB-", 0]], [[28359, 34670, "Gimli_-LRB-Middle-earth-RRB-", 1]]]}
{"id": 152180, "verifiable": "VERIFIABLE", "label": "SUPPORTS", "claim": "Susan Sarandon is an award winner.", "evidence": [[[176133, 189101, "Susan_Sarandon", 1]], [[176133, 189102, "Susan_Sarandon", 2]], [[176133, 189103, "Susan_Sarandon", 8]]]}


In [26]:
!pip install -r requirements.txt

Collecting allennlp
  Downloading allennlp-2.5.0-py3-none-any.whl (681 kB)
[K     |████████████████████████████████| 681 kB 5.3 MB/s eta 0:00:01
[?25hCollecting fever-scorer
  Downloading fever-scorer-2.0.39.tar.gz (3.9 kB)
Collecting fever-drqa
  Downloading fever-drqa-1.0.13.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 1.9 MB/s eta 0:00:011
[?25hCollecting tensorboardX>=1.2
  Downloading tensorboardX-2.4-py2.py3-none-any.whl (124 kB)
[K     |████████████████████████████████| 124 kB 10.4 MB/s eta 0:00:01
Collecting spacy<3.1,>=2.1.0
  Downloading spacy-3.0.6-cp38-cp38-manylinux2014_x86_64.whl (13.0 MB)
[K     |████████████████████████████████| 13.0 MB 11.0 MB/s eta 0:00:01    |████████████████████▎           | 8.2 MB 11.0 MB/s eta 0:00:01
[?25hCollecting overrides==3.1.0
  Downloading overrides-3.1.0.tar.gz (11 kB)
Collecting jsonnet>=0.10.0
  Downloading jsonnet-0.17.0.tar.gz (259 kB)
[K     |████████████████████████████████| 259 kB 4.8 MB/s eta 0:00:01
Coll

In [6]:

class LabelSchema:
    """Two-way lookup between case-insensitive label names and integer ids."""

    def __init__(self,labels):
        """Give every label the id equal to its position in ``labels``.

        Fills ``self.labels`` (normalised name -> id) and ``self.idx``
        (id -> normalised name).
        """
        self.labels = {}
        self.idx = {}
        for position, raw_name in enumerate(labels):
            normalised = self.preprocess(raw_name)
            self.labels[normalised] = position
            self.idx[position] = normalised

    def get_id(self,label):
        """Return the id registered for ``label``, or ``None`` if it is unknown."""
        key = self.preprocess(label)
        return self.labels.get(key)

    def preprocess(self,item):
        """Normalise a label name by lower-casing it."""
        lowered = item.lower()
        return lowered

class FEVERLabelSchema(LabelSchema):
    """Label schema with the three FEVER classes, in id order 0, 1, 2."""

    def __init__(self):
        fever_labels = ["supports", "refutes", "not enough info"]
        super().__init__(fever_labels)

def nltk_tokenizer(text):
    """Tokenise ``text`` with ``word_tokenize`` and re-join the tokens with single spaces."""
    # NOTE(review): `word_tokenize` is not imported in any visible cell; presumably
    # nltk.tokenize.word_tokenize -- confirm it is in scope before calling this.
    tokens = word_tokenize(text)
    return " ".join(tokens)

class training_line_formatter():
    """Turn raw FEVER JSON records into flat ``claim / evidence / label`` dicts."""

    def __init__(self):
        # Build the schema once; previously it was rebuilt for every record.
        self.label_schema = FEVERLabelSchema()

    def format(self, lines):
        """Format every record in ``lines`` and return the results as one flat list.

        ``format_line`` may return ``None`` (record is dropped), a single
        dict (appended) or a list of dicts (all appended).
        """
        formatted = []
        for line in tqdm(lines):
            fl = self.format_line(line)
            if fl is not None:
                if isinstance(fl,list):
                    formatted.extend(fl)
                else:
                    formatted.append(fl)
        return formatted

    def format_line(self, line):
        """Format one record.

        Evidence is a list of ``(page, line_index)`` pairs taken, in order of
        preference, from ``predicted_sentences``, ``predicted_pages`` (line
        index set to -1) or the annotated ``evidence`` groups.

        Returns a dict with keys ``claim``, ``evidence``, ``label`` (schema id,
        or ``None`` when the label is missing/unknown) and ``label_text``.
        """
        # get the label, i.e. SUPPORTS etc.; fall back to the "verifiable" field.
        # .get() keeps unlabeled records from raising KeyError.
        annotation = line.get("label")
        if annotation is None:
            annotation = line.get("verifiable")
        pages = []
        if 'predicted_sentences' in line:
            pages.extend([(ev[0], ev[1]) for ev in line["predicted_sentences"]])
        elif 'predicted_pages' in line:
            # page-level prediction only: -1 marks "no specific line"
            pages.extend([(ev[0], -1) for ev in line["predicted_pages"]])
        else:
            # these are the human annotated evidence available in the original training file;
            # each entry looks like [annotation_id, evidence_id, page, line_index]
            for evidence_group in line["evidence"]:
                pages.extend([(ev[2], ev[3]) for ev in evidence_group])
        label_id = None if annotation is None else self.label_schema.get_id(annotation)
        return {"claim": line["claim"], "evidence": pages, "label": label_id,
                "label_text": annotation}

In [7]:
class Reader:
    """Base reader: opens a text file and passes the open handle to ``process``."""

    def __init__(self,encoding="utf-8"):
        """Remember the text encoding used when opening files."""
        self.enc = encoding

    def read(self,file):
        """Open ``file`` for reading and return whatever ``process`` returns."""
        handle = open(file, mode="r", encoding=self.enc)
        with handle:
            return self.process(handle)

    def process(self,f):
        """Hook for subclasses; the base implementation returns ``None``."""
        return None

class JSONLineReader(Reader):
    """Reader that parses a JSON Lines file into a list of Python objects."""

    def process(self,fp):
        """Parse every non-blank line of ``fp`` as JSON and return the list.

        Blank or whitespace-only lines are skipped instead of crashing
        ``json.loads`` on an empty string.
        """
        data = []
        # readlines() keeps the total known so tqdm can show a full progress bar
        for line in tqdm(fp.readlines()):
            line = line.strip()
            if not line:
                continue
            data.append(json.loads(line))
        return data

In [8]:
import json
from tqdm import tqdm
# Load the training split from the working directory as a list of parsed JSON records.
jlr = JSONLineReader()
split = 'train'
working_dir = 'working/data/'
# k only contributes the "p{k}" suffix of the file name below
k = 5
training_data_file = working_dir + "training/{0}.ns.pages.p{1}.jsonl".format(split, k)
data = jlr.read(training_data_file)

100%|██████████| 145449/145449 [00:02<00:00, 70226.60it/s]


In [9]:
formatter = training_line_formatter()
formatted_train_data = formatter.format(data)

100%|██████████| 145449/145449 [00:01<00:00, 102817.91it/s]


In [11]:
formatted_train_data[:2]

[{'claim': 'Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.',
  'evidence': [('Nikolaj_Coster-Waldau', 7), ('Fox_Broadcasting_Company', 0)],
  'label': 0,
  'label_text': 'SUPPORTS'},
 {'claim': 'Roman Atwood is a content creator.',
  'evidence': [('Roman_Atwood', 1), ('Roman_Atwood', 3)],
  'label': 0,
  'label_text': 'SUPPORTS'}]

In [151]:
[d for d in formatted_train_data if d['label_text'] == 'NOT ENOUGH INFO'][:5]

[{'claim': 'System of a Down briefly disbanded in limbo.',
  'evidence': [('In_Limbo', -1)],
  'label': 2,
  'label_text': 'NOT ENOUGH INFO'},
 {'claim': 'Beautiful reached number two on the Billboard Hot 100 in 2003.',
  'evidence': [('Ricky_Martin_singles_discography', -1)],
  'label': 2,
  'label_text': 'NOT ENOUGH INFO'},
 {'claim': 'Neal Schon was named in 1954.',
  'evidence': [('Double_Eclipse', -1)],
  'label': 2,
  'label_text': 'NOT ENOUGH INFO'},
 {'claim': 'John Wick: Chapter 2 was theatrically released in the Oregon.',
  'evidence': [('John_Wick_-LRB-disambiguation-RRB-', -1)],
  'label': 2,
  'label_text': 'NOT ENOUGH INFO'},
 {'claim': 'Afghanistan is the source of the Kushan dynasty.',
  'evidence': [('Kanishka_-LRB-name-RRB-', -1)],
  'label': 2,
  'label_text': 'NOT ENOUGH INFO'}]

#### RANDOMS, NEARESTP approach
TODO: Sample evidence sentences for the 'NOT ENOUGH INFO' class, either at random from the nearest matching document (NEARESTP) or at random from the whole Wiki corpus (RANDOMS).

In [12]:
import json
from tqdm import tqdm
# Load and format the paper_dev split the same way as the training split.
jlr = JSONLineReader()
split = 'paper_dev'
working_dir = 'working/data/'
# k only contributes the "p{k}" suffix of the file name below
k = 5
dev_data_file = working_dir + "training/{0}.ns.pages.p{1}.jsonl".format(split, k)
dev_data = jlr.read(dev_data_file)

formatter = training_line_formatter()
formatted_dev_data = formatter.format(dev_data)

# NOTE(review): formatter.format() already drops None results, so this filter
# looks redundant -- confirm before removing.
dev_data_formatted = []
dev_data_formatted.extend(filter(lambda record: record is not None, formatted_dev_data))
dev_data_formatted[:2]

100%|██████████| 9999/9999 [00:00<00:00, 84545.60it/s]
100%|██████████| 9999/9999 [00:00<00:00, 98831.02it/s]


[{'claim': 'Colin Kaepernick became a starting quarterback during the 49ers 63rd season in the National Football League.',
  'evidence': [('Colin_Kaepernick', -1)],
  'label': 2,
  'label_text': 'NOT ENOUGH INFO'},
 {'claim': 'Tilda Swinton is a vegan.',
  'evidence': [('Swinton_-LRB-surname-RRB-', -1)],
  'label': 2,
  'label_text': 'NOT ENOUGH INFO'}]

#### Build the tokenizer

In [177]:
import random
import os

class SimpleRandom():
    """Process-wide seeded random number source.

    The seed comes from the ``RANDOM_SEED`` environment variable
    (default 12459). ``get_instance`` lazily creates one shared instance.
    """
    instance = None

    def __init__(self,seed):
        """Create an independent generator seeded with ``seed``."""
        self.seed = seed
        self.random = random.Random(seed)

    def next_rand(self,a,b):
        """Return a random integer N with ``a <= N <= b`` (both ends inclusive)."""
        return self.random.randint(a,b)

    @staticmethod
    def get_instance():
        """Return the shared instance, creating it on first use."""
        if SimpleRandom.instance is None:
            SimpleRandom.instance = SimpleRandom(SimpleRandom.get_seed())
        return SimpleRandom.instance

    @staticmethod
    def get_seed():
        """Return the seed from ``RANDOM_SEED`` as an int (12459 if unset)."""
        return int(os.getenv("RANDOM_SEED", 12459))

    @staticmethod
    def set_seeds():
        """Seed ``random`` and, when installed, numpy and torch (incl. CUDA).

        The libraries are imported here because the surrounding notebook never
        imports torch and defines no ``gpu()`` helper, so the old body raised
        NameError. The shared instance from ``get_instance`` is not re-seeded.
        """
        seed = SimpleRandom.get_seed()
        random.seed(seed)

        try:
            import numpy as np
        except ImportError:
            np = None
        if np is not None:
            np.random.seed(seed)

        try:
            import torch
        except ImportError:
            torch = None
        if torch is not None:
            torch.manual_seed(seed)
            if torch.cuda.is_available():
                torch.cuda.manual_seed_all(seed)

In [163]:
# Key under which each formatted record stores its (page, line_index) pairs.
ename = "evidence"

def unique_in_order(items):
    """Return ``items`` without duplicates, keeping first-seen order.

    Used instead of ``set()`` because the iteration order of a set of strings
    changes between interpreter runs, which made outputs non-reproducible.
    """
    return list(dict.fromkeys(items))

def claims(data):
    """Return the claim text of every record in ``data``."""
    return [datum["claim"] for datum in data]

def body_ids(data):
    """Return, per record, the list of page ids of its evidence entries."""
    return [[d[0] for d in datum[ename] ] for datum in data]

def flatten(l):
    """Flatten one level of nesting: a list of lists becomes a flat list."""
    return [item for sublist in l for item in sublist]

def bodies(data):
    """Return ``database.get_doc_text`` for every distinct page id in ``data``."""
    return [database.get_doc_text(doc_id) for doc_id in unique_in_order(flatten(body_ids(data)))]

def texts(data):
    """Return, per record, its distinct evidence lines joined by single spaces."""
    return [" ".join(unique_in_order(instance)) for instance in body_lines(data)]

def body_lines(data):
    """Return, per record, the sentence text of each evidence entry."""
    return [[get_doc_line(d[0],d[1]) for d in datum[ename] ] for datum in data]

def get_doc_line(doc,line):
    """Return one sentence of document ``doc``.

    The stored document is newline-separated rows of tab-separated fields,
    with the sentence text in the second field.

    * ``line > -1``: return the sentence on that row (annotated evidence).
    * otherwise: return a randomly chosen non-empty sentence of the document
      (NearestP sampling for 'not enough info' claims), drawn from the shared
      ``SimpleRandom`` instance.

    Raises:
        ValueError: if the document is not in the database, or it has no
            non-empty sentence to sample from.
    """
    lines = database.get_doc_lines(doc)
    if lines is None:
        # previously surfaced as an opaque AttributeError on None.split
        raise ValueError("Document {!r} not found in the database".format(doc))
    ### if this is from annotated evidences
    if line > -1:
        return lines.split("\n")[line].split("\t")[1]
    ### if this is from not enough info evidences, NearestP method, to sample "a" sentence
    # split each row once instead of three times
    rows = [row.split("\t") for row in lines.split("\n")]
    non_empty_lines = [cols[1] for cols in rows if len(cols) > 1 and len(cols[1].strip())]
    if not non_empty_lines:
        raise ValueError("Document {!r} has no non-empty lines to sample".format(doc))
    return non_empty_lines[SimpleRandom.get_instance().next_rand(0,len(non_empty_lines)-1)]

In [166]:
[d for d in formatted_train_data if d['label_text'] == 'NOT ENOUGH INFO'][:2]

[{'claim': 'System of a Down briefly disbanded in limbo.',
  'evidence': [('In_Limbo', -1)],
  'label': 2,
  'label_text': 'NOT ENOUGH INFO'},
 {'claim': 'Beautiful reached number two on the Billboard Hot 100 in 2003.',
  'evidence': [('Ricky_Martin_singles_discography', -1)],
  'label': 2,
  'label_text': 'NOT ENOUGH INFO'}]

In [159]:
cl = formatted_train_data[0]['claim']
ev = formatted_train_data[0]['evidence']
[(d[0],d[1]) for d in ev]

[('Nikolaj_Coster-Waldau', 7), ('Fox_Broadcasting_Company', 0)]

In [160]:
get_doc_line('Nikolaj_Coster-Waldau', 7)

'He then played Detective John Amsterdam in the short-lived Fox television series New Amsterdam -LRB- 2008 -RRB- , as well as appearing as Frank Pike in the 2009 Fox television film Virtuality , originally intended as a pilot .'

In [173]:
lines = database.get_doc_lines('Ricky_Martin_singles_discography')
lines

"0\tPuerto Rican singer Ricky Martin has released seventy-nine Spanish and English-language singles .\tRicky Martin\tRicky Martin (1999 album)\tsingles\tsingle (music)\n1\tIn 1984 , thirteen-year-old Martin became a member of the Puerto Rican boy band Menudo .\tMenudo\tMenudo (band)\n2\tAfter recording eleven albums with the group , he left Menudo in 1989 , hoping to rest and evaluate his career path .\tMenudo\tMenudo (band)\n3\tIn 1990 , he was signed to Sony Discos , the Sony Music Entertainment 's Latin imprint .\tSony Discos\tSony Music Latin\tSony Music Entertainment\tSony Music Entertainment\n4\tMartin released his debut solo album , the Spanish-language Ricky Martin , in November 1991 .\tRicky Martin\tRicky Martin (1999 album)\n5\tIt included hit singles : `` Fuego Contra Fuego '' , `` El Amor de Mi Vida '' and `` Vuelo '' .\tsingles\tsingle (music)\tFuego Contra Fuego\tFuego Contra Fuego\tEl Amor de Mi Vida\tEl Amor de Mi Vida (song)\tVuelo\tVuelo (song)\tVida\tVida (Ricky Mart

In [180]:
non_empty_lines = [line.split("\t")[1] for line in lines.split("\n") if len(line.split("\t"))>1 and len(line.split("\t")[1].strip())]
len(non_empty_lines)

76

In [182]:
non_empty_lines[SimpleRandom.get_instance().next_rand(0,len(non_empty_lines)-1)]

"`` Vida '' , recorded for the 2014 FIFA World Cup , reached top ten in Spain and Mexico , and also on the US Hot Latin Songs ."

In [183]:
[get_doc_line(d[0],d[1]) for d in formatted_train_data[0]["evidence"] ]

['He then played Detective John Amsterdam in the short-lived Fox television series New Amsterdam -LRB- 2008 -RRB- , as well as appearing as Frank Pike in the 2009 Fox television film Virtuality , originally intended as a pilot .',
 'The Fox Broadcasting Company -LRB- often shortened to Fox and stylized as FOX -RRB- is an American English language commercial broadcast television network that is owned by the Fox Entertainment Group subsidiary of 21st Century Fox .']

In [28]:
import unicodedata
import re
def unicode_to_ascii(s):
    """Strip combining marks (category 'Mn') from the NFD form of ``s``."""
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            continue
        kept.append(ch)
    return ''.join(kept)

def preprocess(w):
    """Normalise a sentence and wrap it in ``[START]`` / ``[END]`` markers.

    Lower-cases and trims the text, removes accents, pads ``? . ! , ¿`` with
    spaces, and replaces every other run of non-letters with a single space.
    Example: "he is a boy." -> "[START] he is a boy . [END]"
    Reference: https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    """
    text = unicode_to_ascii(w.lower().strip())
    # applied in order: pad punctuation, collapse spaces/quotes, drop other symbols
    substitutions = (
        (r"([?.!,¿])", r" \1 "),
        (r'[" "]+', " "),
        (r"[^a-zA-Z?.!,¿]+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    text = text.strip()
    # start/end tokens tell the model where the sentence begins and stops
    return '[START] ' + text + ' [END]'

In [29]:
from drqa.retriever import DocDB, utils
class FeverDocDB(DocDB):
    """DocDB extension with accessors for the ``lines`` column of ``documents``."""

    def __init__(self,path=None):
        """Open the database at ``path`` via the parent ``DocDB`` constructor."""
        super().__init__(path)

    def get_doc_lines(self, doc_id):
        """Fetch the ``lines`` column of the doc for 'doc_id'.

        Returns ``None`` when no row matches the normalised id.
        """
        cursor = self.connection.cursor()
        try:
            cursor.execute(
                "SELECT lines FROM documents WHERE id = ?",
                (utils.normalize(doc_id),)
            )
            result = cursor.fetchone()
        finally:
            # close the cursor even if the query raises
            cursor.close()
        return result if result is None else result[0]

    def get_non_empty_doc_ids(self):
        """Fetch the ids of all docs whose trimmed ``text`` column is non-empty."""
        cursor = self.connection.cursor()
        try:
            cursor.execute("SELECT id FROM documents WHERE length(trim(text)) > 0")
            results = [r[0] for r in cursor.fetchall()]
        finally:
            cursor.close()
        return results
# Module-level DB handle used by get_doc_line / bodies and the data generators.
# NOTE(review): absolute, machine-specific path -- consider making it configurable.
database_path = '/local/fever-common/data/fever/fever.db'
database = FeverDocDB(database_path)

In [184]:
# Preview the "claim + evidence sentences" text for the first two training examples.
# The loop variable is deliberately not called `data`: that name holds the raw
# training records loaded earlier and must not be overwritten at module level.
for example in formatted_train_data[:2]:
    claim = preprocess(example["claim"])
    evidence_lines = [get_doc_line(d[0],d[1]) for d in example["evidence"] ]
    parts = [claim, " ".join(evidence_lines)]
    print (" ".join(parts))

[START] nikolaj coster waldau worked with the fox broadcasting company . [END] He then played Detective John Amsterdam in the short-lived Fox television series New Amsterdam -LRB- 2008 -RRB- , as well as appearing as Frank Pike in the 2009 Fox television film Virtuality , originally intended as a pilot . The Fox Broadcasting Company -LRB- often shortened to Fox and stylized as FOX -RRB- is an American English language commercial broadcast television network that is owned by the Fox Entertainment Group subsidiary of 21st Century Fox .
[START] roman atwood is a content creator . [END] He is best known for his vlogs , where he posts updates about his life on a daily basis . He also has another YouTube channel called `` RomanAtwood '' , where he posts pranks .


In [153]:
import numpy as np
import tensorflow as tf
def get_data_generator():
    """Yield one string per training example: preprocessed claim + full evidence pages.

    Each distinct evidence page is fetched once with ``database.get_doc_text``
    and pages are joined in first-seen order, so the text is reproducible
    across runs (a ``set`` gave an order that varied with string hashing).
    """
    for data in formatted_train_data:
        claim = preprocess(data["claim"])
        # dict.fromkeys de-duplicates while preserving order
        page_ids = dict.fromkeys(e[0] for e in data["evidence"])
        page_texts = [database.get_doc_text(page_id) for page_id in page_ids]
        parts = [claim, " ".join(page_texts)]
        yield " ".join(parts)
        
def get_dataset():
    """Wrap ``get_data_generator`` in a ``tf.data.Dataset`` of scalar strings."""
    string_scalar = tf.TensorSpec(shape=(), dtype=tf.string)
    return tf.data.Dataset.from_generator(
        lambda: get_data_generator(),
        output_signature=string_scalar)

In [87]:
for d in get_dataset().take(1):
    print(d)

tf.Tensor(b"[START] nikolaj coster waldau worked with the fox broadcasting company . [END] Nikolaj Coster-Waldau -LRB- -LSB- ne\xc9\xa1\xcc\x8aola\xc9\xaa\xcc\xaf k\xca\xb0\xca\x8csd\xcc\xa5\xc9\x90 \xcb\x88\xca\x8bald\xcc\xa5\xc9\x91\xca\x8a\xcc\xaf -RSB- ; born 27 July 1970 -RRB- is a Danish actor , producer and screenwriter . He graduated from Danish National School of Theatre in Copenhagen in 1993 . Coster-Waldau 's breakthrough performance in Denmark was his role in the film Nightwatch -LRB- 1994 -RRB- . Since then he has appeared in numerous films in his native Scandinavia and Europe in general , including Headhunters -LRB- 2011 -RRB- and A Thousand Times Good Night -LRB- 2013 -RRB- .   In the United States , his debut film role was in the war film Black Hawk Down -LRB- 2001 -RRB- , playing Medal of Honor recipient Gary Gordon . He then played Detective John Amsterdam in the short-lived Fox television series New Amsterdam -LRB- 2008 -RRB- , as well as appearing as Frank Pike in t

In [85]:
ds = get_dataset()
batch_size=32
ds = ds.shuffle(3200).batch(batch_size, drop_remainder=True)

In [34]:
for d in ds.take(1):
    print(d)

tf.Tensor(
[b"[START] liv tyler modeled as an infant . [END] Liv is a Norwegian mostly female given name derived from the Old Norse `` hl\xc3\xadf '' , which means `` shelter '' or `` protection '' ; in modern Norwegian , Swedish , and Danish it is also homophonous with the word `` liv '' meaning `` life . ''   In Norse mythology , L\xc3\xadf and L\xc3\xadf\xc3\xberasir -LRB- Old Norse masculine name from l\xc3\xadf and \xc3\xberasir -RRB- , were two humans foretold to survive Ragnar\xc3\xb6k and to repopulate the world .   Sometimes Liv can be a shortened version of Olivia .   Liv may refer to :  Gerd-Liv Valla -LRB- born 1948 -RRB- , leader of the Norwegian Confederation of Trade Unions  Jacob Liv Borch Sverdrup -LRB- 1775 -- 1841 -RRB- , Norwegian educator and farmer  Liv Aasen -LRB- born 1928 -RRB- , Norwegian politician for the Labour Party  Liv Andersen -LRB- 1919 -- 1997 -RRB- , Norwegian politician for the Labour Party  Liv Arnesen -LRB- born 1953 -RRB- , Norwegian cross-countr

In [15]:
#pip install tensorflow_text

Collecting tensorflow_text
  Downloading tensorflow_text-2.5.0-cp38-cp38-manylinux1_x86_64.whl (4.3 MB)
[K     |████████████████████████████████| 4.3 MB 2.9 MB/s eta 0:00:01
[?25hCollecting tensorflow<2.6,>=2.5.0
  Downloading tensorflow-2.5.0-cp38-cp38-manylinux2010_x86_64.whl (454.4 MB)
[K     |████████████████████████████████| 454.4 MB 79 kB/s  eta 0:00:011   |█                               | 14.9 MB 4.8 MB/s eta 0:01:31     |██▎                             | 32.7 MB 6.6 MB/s eta 0:01:04     |████▍                           | 63.0 MB 2.5 MB/s eta 0:02:34     |█████████▏                      | 130.5 MB 12.6 MB/s eta 0:00:26     |█████████▎                      | 131.2 MB 12.6 MB/s eta 0:00:26     |█████████████████████▏          | 301.3 MB 911 kB/s eta 0:02:49     |█████████████████████▉          | 309.9 MB 10.4 MB/s eta 0:00:14     |█████████████████████████▉      | 367.6 MB 4.4 MB/s eta 0:00:20     |██████████████████████████▏     | 372.1 MB 11.0 MB/s eta 0:00:08
[?25hCollecti

In [16]:
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

In [17]:
# Settings for learning a WordPiece vocabulary from the dataset.
bert_tokenizer_params=dict(lower_case=True)
# These four tokens head the generated vocabulary, in this order.
reserved_tokens=["[PAD]", "[UNK]", "[START]", "[END]"]
bert_vocab_args = dict(
    # The target vocabulary size
    vocab_size = 8000,
    # Reserved tokens that must be included in the vocabulary
    reserved_tokens=reserved_tokens,
    # Arguments for `text.BertTokenizer`
    bert_tokenizer_params=bert_tokenizer_params,
    # Arguments for `wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn`
    learn_params={},
)

In [60]:
%%time
# Learn the WordPiece vocabulary (slow: roughly 11 minutes in the recorded run).
# NOTE(review): `ds` was already shuffled and batched (32, drop_remainder=True)
# in an earlier cell, so this batches batches -- confirm that is intended.
pt_vocab = bert_vocab.bert_vocab_from_dataset(
    ds.batch(1000).prefetch(2),
    **bert_vocab_args
)

CPU times: user 11min 3s, sys: 37.1 s, total: 11min 40s
Wall time: 10min 52s


In [61]:
print(pt_vocab[:10])
print(pt_vocab[100:110])

['[PAD]', '[UNK]', '[START]', '[END]', '!', '"', '#', '$', '%', '&']
['ɕ', 'ɖ', 'ə', 'ɚ', 'ɛ', 'ɜ', 'ɝ', 'ɟ', 'ɡ', 'ɣ']


In [62]:
def write_vocab_file(filepath, vocab):
    """Write ``vocab`` to ``filepath``, one token per line, encoded as UTF-8.

    The encoding is set explicitly because the vocabulary contains non-ASCII
    tokens; the platform default could otherwise fail or garble them.
    """
    print("Saving vocab file to {}".format(filepath))
    with open(filepath, 'w', encoding='utf-8') as f:
        for token in vocab:
            print(token, file=f)

In [63]:
vocab_file_out = 'working/data/fever_vocab.txt'
write_vocab_file(vocab_file_out, pt_vocab)

Saving vocab file to working/data/fever_vocab.txt


In [64]:
print(len(pt_vocab))

7860


In [4]:
!wc -l working/data/fever_vocab.txt

7860 working/data/fever_vocab.txt


#### Load the vocab and initialize the tokenizer

In [18]:
import tensorflow_text as text
bert_tokenizer_params=dict(lower_case=True)
vocab_file_out = 'working/data/fever_vocab.txt'
pt_tokenizer = text.BertTokenizer(vocab_file_out, **bert_tokenizer_params)

In [35]:
for d in get_dataset().take(1):
    print(d)

tf.Tensor(b"[START] nikolaj coster waldau worked with the fox broadcasting company . [END] Nikolaj Coster-Waldau -LRB- -LSB- ne\xc9\xa1\xcc\x8aola\xc9\xaa\xcc\xaf k\xca\xb0\xca\x8csd\xcc\xa5\xc9\x90 \xcb\x88\xca\x8bald\xcc\xa5\xc9\x91\xca\x8a\xcc\xaf -RSB- ; born 27 July 1970 -RRB- is a Danish actor , producer and screenwriter . He graduated from Danish National School of Theatre in Copenhagen in 1993 . Coster-Waldau 's breakthrough performance in Denmark was his role in the film Nightwatch -LRB- 1994 -RRB- . Since then he has appeared in numerous films in his native Scandinavia and Europe in general , including Headhunters -LRB- 2011 -RRB- and A Thousand Times Good Night -LRB- 2013 -RRB- .   In the United States , his debut film role was in the war film Black Hawk Down -LRB- 2001 -RRB- , playing Medal of Honor recipient Gary Gordon . He then played Detective John Amsterdam in the short-lived Fox television series New Amsterdam -LRB- 2008 -RRB- , as well as appearing as Frank Pike in t

In [47]:
from sklearn import preprocessing

# Encode the textual training labels as integer class indices.
# NOTE(review): judging by the printed one-hot rows, the encoder's indices do not
# match the FEVERLabelSchema ids stored under 'label' (SUPPORTS is 0 there but
# index 2 here) -- decode predictions with `le`, not with the schema.
labels = [d['label_text'] for d in formatted_train_data]
le = preprocessing.LabelEncoder()
le.fit(labels)
labels_enc = le.transform(labels)

In [49]:
# One-hot encode the integer training labels into an (n_examples, 3) array.
# The width 3 is hard-coded to the number of FEVER classes.
train_labels = np.zeros(shape=(len(labels_enc),3))
for idx, val in enumerate(labels_enc):
    train_labels[idx][val]=1
print("A peek a the reshaped labels:")
print(train_labels[:5])
# NOTE: "features" below actually reports the type of the encoded labels.
print("The datatypes of the training dataset, features={}, labels={}".format(type(labels_enc), type(train_labels)))

A peek a the reshaped labels:
[[0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]]
The datatypes of the training dataset, features=<class 'numpy.ndarray'>, labels=<class 'numpy.ndarray'>


In [52]:
# Convert the one-hot training labels to an int32 tensor and slice it into a
# dataset with one (3,) label vector per example.
# The reshape targets the array's own shape, so it does not change the layout.
lbls = tf.reshape(tf.convert_to_tensor(train_labels, dtype=tf.int32), (train_labels.shape))
lbls_ds = tf.data.Dataset.from_tensor_slices(lbls)
lbls_ds

<TensorSliceDataset shapes: (3,), types: tf.int32>

In [186]:
import numpy as np
import tensorflow as tf
def get_train_data_generator():
    """Yield ``(claim, evidence_text)`` string pairs for the training examples.

    The claim is run through ``preprocess``; the evidence text is the
    space-joined result of ``get_doc_line`` for every evidence entry.
    """
    for record in formatted_train_data:
        claim_text = preprocess(record["claim"])
        evidence_sentences = []
        for entry in record["evidence"]:
            evidence_sentences.append(get_doc_line(entry[0], entry[1]))
        yield claim_text, " ".join(evidence_sentences)
        
def get_train_dataset():
    """Wrap ``get_train_data_generator`` in a dataset of shape-(2,) string tensors.

    Element 0 is the claim and element 1 is the evidence text.
    """
    pair_of_strings = tf.TensorSpec(shape=(2, ), dtype=tf.string)
    return tf.data.Dataset.from_generator(
        lambda: get_train_data_generator(),
        output_signature=pair_of_strings)

In [187]:
raw_ds = get_train_dataset()

In [188]:
for d,e in raw_ds.take(1):
    print(e)
    print(".....\n\n")
    print(d)

tf.Tensor(b'He then played Detective John Amsterdam in the short-lived Fox television series New Amsterdam -LRB- 2008 -RRB- , as well as appearing as Frank Pike in the 2009 Fox television film Virtuality , originally intended as a pilot . The Fox Broadcasting Company -LRB- often shortened to Fox and stylized as FOX -RRB- is an American English language commercial broadcast television network that is owned by the Fox Entertainment Group subsidiary of 21st Century Fox .', shape=(), dtype=string)
.....


tf.Tensor(b'[START] nikolaj coster waldau worked with the fox broadcasting company . [END]', shape=(), dtype=string)


In [189]:
raw_ds_enc_labls = tf.data.Dataset.zip((raw_ds, lbls_ds))
print(raw_ds_enc_labls.element_spec)

(TensorSpec(shape=(2,), dtype=tf.string, name=None), TensorSpec(shape=(3,), dtype=tf.int32, name=None))


#### Dev dataset

In [190]:
import numpy as np
import tensorflow as tf
def get_dev_data_generator():
    """Yield ``(claim, evidence_text)`` string pairs for the dev examples.

    The claim is run through ``preprocess``; the evidence text is the
    space-joined result of ``get_doc_line`` for every evidence entry.
    """
    for record in dev_data_formatted:
        claim_text = preprocess(record["claim"])
        evidence_sentences = []
        for entry in record["evidence"]:
            evidence_sentences.append(get_doc_line(entry[0], entry[1]))
        yield claim_text, " ".join(evidence_sentences)
        
def get_dev_dataset():
    """Wrap ``get_dev_data_generator`` in a dataset of shape-(2,) string tensors.

    Element 0 is the claim and element 1 is the evidence text.
    """
    pair_of_strings = tf.TensorSpec(shape=(2, ), dtype=tf.string)
    return tf.data.Dataset.from_generator(
        lambda: get_dev_data_generator(),
        output_signature=pair_of_strings)

In [191]:
dev_ds = get_dev_dataset()

In [192]:
# Encode the dev labels with the encoder fitted on the training labels.
# NOTE(review): this overwrites `labels` and `labels_enc`, which previously held
# the training labels -- re-running the training label cells afterwards needs care.
labels = [d['label_text'] for d in dev_data_formatted]
labels_enc = le.transform(labels)

In [193]:
# One-hot encode the integer dev labels into an (n_examples, 3) array.
# The width 3 is hard-coded to the number of FEVER classes.
dev_labels = np.zeros(shape=(len(labels_enc),3))
for idx, val in enumerate(labels_enc):
    dev_labels[idx][val]=1
print("A peek a the reshaped labels:")
print(dev_labels[:5])
# NOTE(review): the message says "training dataset" but this cell reports the dev
# labels, and "features" is actually the type of the encoded labels.
print("The datatypes of the training dataset, features={}, labels={}".format(type(labels_enc), type(dev_labels)))

A peek a the reshaped labels:
[[1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]]
The datatypes of the training dataset, features=<class 'numpy.ndarray'>, labels=<class 'numpy.ndarray'>


In [194]:
# Convert the one-hot dev labels to an int32 tensor and slice it into a dataset
# with one (3,) label vector per example.
# NOTE(review): reuses the name `lbls`, replacing the training label tensor.
lbls = tf.reshape(tf.convert_to_tensor(dev_labels, dtype=tf.int32), (dev_labels.shape))
print(lbls)
dev_lbls_ds = tf.data.Dataset.from_tensor_slices(lbls)
dev_lbls_ds

tf.Tensor(
[[1 0 0]
 [1 0 0]
 [0 0 1]
 ...
 [1 0 0]
 [1 0 0]
 [1 0 0]], shape=(9999, 3), dtype=int32)


<TensorSliceDataset shapes: (3,), types: tf.int32>

In [195]:
dev_ds_enc_labls = tf.data.Dataset.zip((dev_ds, dev_lbls_ds))
print(dev_ds_enc_labls.element_spec)

(TensorSpec(shape=(2,), dtype=tf.string, name=None), TensorSpec(shape=(3,), dtype=tf.int32, name=None))


#### Build the sequences

In [196]:
for d, e in raw_ds_enc_labls.take(1):
    print(d[0])
    print("........\n\n")
    print(d[1])
    print("........\n\n")
    print(e)

tf.Tensor(b'[START] nikolaj coster waldau worked with the fox broadcasting company . [END]', shape=(), dtype=string)
........


tf.Tensor(b'He then played Detective John Amsterdam in the short-lived Fox television series New Amsterdam -LRB- 2008 -RRB- , as well as appearing as Frank Pike in the 2009 Fox television film Virtuality , originally intended as a pilot . The Fox Broadcasting Company -LRB- often shortened to Fox and stylized as FOX -RRB- is an American English language commercial broadcast television network that is owned by the Fox Entertainment Group subsidiary of 21st Century Fox .', shape=(), dtype=string)
........


tf.Tensor([0 0 1], shape=(3,), dtype=int32)


In [197]:
# Tokenise, pad, shuffle and batch the training pairs into
# ((hypothesis_ids, evidence_ids), one_hot_label) elements.
BATCH_SIZE = 64
MAX_SEQ_LEN = 60
BUFFER_SIZE = 32000
def tokenize_and_pad(text, max_len):
    """Tokenise ``text`` with ``pt_tokenizer`` and return a vector of ``max_len`` ids.

    NOTE(review): assumes the tokenizer output for a single string has a
    leading axis of size 1, so that row 0 of the dense tensor is the whole
    token sequence -- confirm against the tokenizer's output shape.
    """
    # merge the ragged dimensions from axis 1 onwards into one
    segment = pt_tokenizer.tokenize(text).merge_dims(1, -1)
    # densify to a fixed second dimension of max_len
    inp = segment.to_tensor(shape=[None, max_len])
    return inp[0]

# NOTE(review): three separate map() calls over the same source; presumably the
# underlying generator is traversed once per branch when zipped -- a single map
# producing ((h, e), y) may be cheaper.
h = raw_ds_enc_labls.map(lambda x, y: tokenize_and_pad(x[0], MAX_SEQ_LEN))
e = raw_ds_enc_labls.map(lambda x, y: tokenize_and_pad(x[1], MAX_SEQ_LEN))
l = raw_ds_enc_labls.map(lambda x, y: y)
print(h)
print(e)
f = tf.data.Dataset.zip((h,e))
d = tf.data.Dataset.zip((f,l))
# drop_remainder=True keeps every batch at exactly BATCH_SIZE examples
dataset = d.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)
print(dataset.element_spec)

<MapDataset shapes: (60,), types: tf.int64>
<MapDataset shapes: (60,), types: tf.int64>
<BatchDataset shapes: (((64, 60), (64, 60)), (64, 3)), types: ((tf.int64, tf.int64), tf.int32)>
((TensorSpec(shape=(64, 60), dtype=tf.int64, name=None), TensorSpec(shape=(64, 60), dtype=tf.int64, name=None)), TensorSpec(shape=(64, 3), dtype=tf.int32, name=None))


In [198]:
# Same pipeline as the training cell, applied to the dev pairs.
# NOTE(review): this cell redefines BATCH_SIZE, MAX_SEQ_LEN, BUFFER_SIZE and
# tokenize_and_pad identically, and reuses the names h, e, l, f, d.
BATCH_SIZE = 64
MAX_SEQ_LEN = 60
BUFFER_SIZE = 32000
def tokenize_and_pad(text, max_len):
    """Tokenise ``text`` with ``pt_tokenizer`` and return a vector of ``max_len`` ids.

    NOTE(review): assumes the tokenizer output for a single string has a
    leading axis of size 1, so that row 0 of the dense tensor is the whole
    token sequence -- confirm against the tokenizer's output shape.
    """
    # merge the ragged dimensions from axis 1 onwards into one
    segment = pt_tokenizer.tokenize(text).merge_dims(1, -1)
    # densify to a fixed second dimension of max_len
    inp = segment.to_tensor(shape=[None, max_len])
    return inp[0]

h = dev_ds_enc_labls.map(lambda x, y: tokenize_and_pad(x[0], MAX_SEQ_LEN))
e = dev_ds_enc_labls.map(lambda x, y: tokenize_and_pad(x[1], MAX_SEQ_LEN))
l = dev_ds_enc_labls.map(lambda x, y: y)
print(h)
print(e)
f = tf.data.Dataset.zip((h,e))
d = tf.data.Dataset.zip((f,l))
# NOTE(review): validation data is shuffled and the last partial batch dropped;
# neither is needed for evaluation -- confirm intended.
dataset_dev = d.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset_dev)
print(dataset_dev.element_spec)

<MapDataset shapes: (60,), types: tf.int64>
<MapDataset shapes: (60,), types: tf.int64>
<BatchDataset shapes: (((64, 60), (64, 60)), (64, 3)), types: ((tf.int64, tf.int64), tf.int32)>
((TensorSpec(shape=(64, 60), dtype=tf.int64, name=None), TensorSpec(shape=(64, 60), dtype=tf.int64, name=None)), TensorSpec(shape=(64, 3), dtype=tf.int32, name=None))


In [199]:
from tensorflow import keras

In [200]:
### Simple model
# Two-input classifier: hypothesis and evidence each get their own embedding and
# bidirectional LSTM; the two encodings are concatenated and fed to a small
# dense head with a 3-way softmax.
# NOTE(review): vocab_size is hard-coded to the 8000 target, while the vocabulary
# file written earlier has 7860 entries -- consider deriving it from the file.
vocab_size= 8000
dim = 100
inp1 = keras.Input(shape=(None, ), name = "hypothesis")
inp2 = keras.Input(shape=(None, ), name = "evidence")
# NOTE(review): no mask_zero on the embeddings, so padded positions are fed to
# the LSTMs like real tokens -- confirm intended.
embedding_hyp_layer = tf.keras.layers.Embedding(
        input_dim=vocab_size+1,
        output_dim=dim)
embedding_evi_layer = tf.keras.layers.Embedding(
        input_dim=vocab_size+1,
        output_dim=dim)
x_hyp = embedding_hyp_layer(inp1)
x_evi = embedding_evi_layer(inp2)
lstm_layer1 = tf.keras.layers.Bidirectional(tf.keras.layers.RNN(tf.keras.layers.LSTMCell(dim)))(x_hyp)
lstm_layer2 = tf.keras.layers.Bidirectional(tf.keras.layers.RNN(tf.keras.layers.LSTMCell(dim)))(x_evi)
w = keras.layers.concatenate([lstm_layer1, lstm_layer2], axis = 1)
x1 = tf.keras.layers.Dense(16, activation='relu')(w)
x2 = tf.keras.layers.Dropout(0.1)(x1)
output = tf.keras.layers.Dense(3, activation='softmax')(x2)
model = keras.Model(inputs=[inp1, inp2], outputs=output)
# categorical_crossentropy matches the one-hot label vectors built above
model.compile(loss='categorical_crossentropy',
          optimizer=tf.keras.optimizers.Adam(), 
          metrics=['accuracy'])
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
hypothesis (InputLayer)         [(None, None)]       0                                            
__________________________________________________________________________________________________
evidence (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 100)    800100      hypothesis[0][0]                 
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, None, 100)    800100      evidence[0][0]                   
____________________________________________________________________________________________

In [201]:
rm -rf tmp/checkpoint_*

In [202]:
# Training callbacks: save the weights of the best epoch and stop early.
checkpoint_filepath = 'tmp/checkpoint_fever_nli'
# Saves weights only, and only when val_accuracy improves (mode='max').
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

# NOTE(review): early stopping watches val_loss while the checkpoint watches
# val_accuracy -- confirm the two different criteria are intended.
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=4)

In [203]:
history = model.fit(dataset, epochs = 12, validation_data=dataset_dev, callbacks=[stop_early, model_checkpoint_callback])

Epoch 1/12
Epoch 2/12
Epoch 3/12

KeyboardInterrupt: 