<a href="https://colab.research.google.com/github/google-research/fool-me-twice/blob/master/notebooks/lmi_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##### Copyright 2020 The Google AI Language Team Authors

Licensed under the Apache License, Version 2.0 (the "License");

In [0]:
# Copyright 2019 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Uses the LMI (Local Mutual Information) method of https://www.aclweb.org/anthology/D19-1341 to compare FEVER and FM2 "artefacts" (bigrams that are strongly associated with one label) at the bigram level.

LMI for a bigram $b$ and a label $l$ is defined as $LMI(b, l) = p(b, l)\cdot \log\left(\frac{p(l\mid b)}{p(l)}\right)$, where the probabilities are estimated from empirical counts, computed separately for the train set and for the dev set.

In [None]:
import collections
import json
import requests
import math

import nltk

# Tokenizer data required by nltk.word_tokenize. Newer NLTK releases load the
# 'punkt_tab' package instead of the pickled 'punkt' models, so both are
# fetched; on an older NLTK the unknown package id only logs an error and
# returns False, it does not raise.
nltk.download('punkt')
nltk.download('punkt_tab')
# NOTE(review): 'words' is not referenced by any cell visible here; kept for
# backward compatibility (and so the cell's final output is unchanged).
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [None]:
def fetch_dataset(url_path, timeout=60):
  """Downloads a JSON-lines file and parses it into a list of examples.

  Args:
    url_path: URL of a UTF-8 encoded file holding one JSON value per line.
    timeout: Seconds to wait for the server (connect / read inactivity)
      before giving up; forwarded to `requests.get`.

  Returns:
    A list with one parsed JSON value per non-blank line, in file order.

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  response = requests.get(url_path, timeout=timeout)
  # Fail loudly on e.g. a 404 instead of trying to JSON-parse the error page.
  response.raise_for_status()

  examples = []
  # Split on '\n' only: str.splitlines() would also break on characters such
  # as U+2028 that may legally appear unescaped inside JSON strings.
  for line in response.content.decode('utf-8').split('\n'):
    # Skip blank lines (trailing newline, stray '\r', whitespace-only lines).
    if not line.strip():
      continue
    examples.append(json.loads(line))
  return examples

# FM2 splits are read straight from the GitHub repository. Raw-content URLs
# have the form raw.githubusercontent.com/<owner>/<repo>/<ref>/<path>; the
# '/blob/' segment only belongs in github.com HTML page URLs and makes the raw
# host look for a ref named 'blob'.
fm2_dev_dataset = fetch_dataset('https://raw.githubusercontent.com/google-research/fool-me-twice/master/dataset/dev.jsonl')
fm2_train_dataset = fetch_dataset('https://raw.githubusercontent.com/google-research/fool-me-twice/master/dataset/train.jsonl')

# FEVER splits are read from the project's Cloud Storage bucket.
# NOTE(review): presumably stored with the same 'text' / 'label' fields as FM2,
# since compute_lmi reads both datasets the same way -- confirm.
fever_dev_dataset = fetch_dataset('https://storage.googleapis.com/fool-me-twice-media/data/fever/dev.jsonl')
fever_train_dataset = fetch_dataset('https://storage.googleapis.com/fool-me-twice-media/data/fever/train.jsonl')

In [None]:
def safe_log(x):
  """Natural logarithm of `x`, using the convention that the log of 0 is 0.

  Returning 0 for a zero argument lets callers evaluate terms of the form
  `p * log(q)` without special-casing `q == 0`.

  Args:
    x: A number; any value other than zero is passed to `math.log`.

  Returns:
    The integer 0 when `x` equals zero, otherwise `math.log(x)`.
  """
  return math.log(x) if x != 0 else 0

def compute_lmi(examples):
  """Computes the LMI score of every bigram for every label.

  Each example's text is lower-cased and tokenized with `nltk.word_tokenize`;
  adjacent token pairs form the bigrams. With `count` denoting occurrence
  counts and `N` the number of examples, the estimates used are:

    p(b, l)  = count(b, l) / N
    p(l | b) = count(b, l) / count(b)
    p(l)     = count(l) / N

  and `LMI(b, l) = p(b, l) * safe_log(p(l | b) / p(l))`.

  Args:
    examples: Iterable of dicts, each with a 'text' string and a 'label'.

  Returns:
    A dict mapping each label to a dict that maps every bigram seen anywhere
    in `examples` (a 2-tuple of tokens) to its LMI score for that label.
  """
  bigram_counts = collections.Counter()
  joint_counts = collections.Counter()
  label_counts = collections.Counter()
  total_claims = 0

  for claim in examples:
    total_claims += 1
    words = nltk.word_tokenize(claim['text'].lower())
    claim_label = claim['label']
    label_counts[claim_label] += 1
    # Pair every token with its successor to enumerate the bigrams.
    for pair in zip(words, words[1:]):
      bigram_counts[pair] += 1
      joint_counts[(pair, claim_label)] += 1

  scores = {}
  for label, label_count in label_counts.items():
    prior = label_count / total_claims
    per_bigram = {}
    for pair, pair_count in bigram_counts.items():
      # A Counter yields 0 for (bigram, label) pairs that never co-occurred.
      joint = joint_counts[(pair, label)]
      per_bigram[pair] = (joint / total_claims) * safe_log(
          (joint / pair_count) / prior)
    scores[label] = per_bigram

  return scores

In [None]:
# Per-label bigram LMI tables for the FM2 dev and train splits.
fm2_dev_lmi = compute_lmi(fm2_dev_dataset)
fm2_train_lmi = compute_lmi(fm2_train_dataset)

In [None]:
# Per-label bigram LMI tables for the FEVER dev and train splits.
fever_dev_lmi = compute_lmi(fever_dev_dataset)
fever_train_lmi = compute_lmi(fever_train_dataset)

In [None]:
def compare_dev_and_train(train_data, dev_data, n=10):
  """Prints, per label, the top-`n` LMI bigrams of train and dev side by side.

  For every label in `train_data` the `n` highest-scoring bigrams of each
  split are merged in descending score order and printed as
  LaTeX-tabular-style rows: the bigram, the train score and the dev score,
  separated by '&' and terminated by a double backslash. Scores are scaled by
  10**5 and truncated to integers. 'NONE' marks a bigram that is not among
  the top `n` of that split. A label that is missing from `dev_data` is
  treated as having no dev bigrams.

  Args:
    train_data: Dict mapping label -> {bigram: LMI score} for the train split.
    dev_data: Dict of the same shape for the dev split.
    n: Number of top-scoring bigrams taken from each split.
  """

  def top_n(scores):
    # Stable sort: bigrams with equal scores keep their original order.
    return sorted(scores.items(), key=lambda item: item[1], reverse=True)[:n]

  def format_score(score):
    return 'NONE' if score is None else int(score * 10**5)

  def print_row(bigram, train_score, dev_score):
    print(f'{" ".join(bigram)} & ${format_score(train_score)}$ & ${format_score(dev_score)}$ \\\\')

  def pop_bigram(bigram, ranked):
    # Returns `ranked` without `bigram`, plus its score (None if absent).
    remaining = [(b, s) for (b, s) in ranked if b != bigram]
    score = next((s for (b, s) in ranked if b == bigram), None)
    return remaining, score

  for label in train_data:
    highest_train = top_n(train_data[label])
    # Tolerate labels that never occur in the dev split.
    highest_dev = top_n(dev_data[label] if label in dev_data else {})

    print(f'{label}')

    # Repeatedly emit whichever list currently has the higher-scoring head
    # (train wins ties), dropping that bigram from both lists.
    while highest_train and highest_dev:
      next_bigram = max(highest_train[0], highest_dev[0], key=lambda x: x[1])[0]
      highest_train, train_score = pop_bigram(next_bigram, highest_train)
      highest_dev, dev_score = pop_bigram(next_bigram, highest_dev)
      print_row(next_bigram, train_score, dev_score)

    # At most one of the two lists is still non-empty at this point.
    for bigram, train_score in highest_train:
      print_row(bigram, train_score, None)
    for bigram, dev_score in highest_dev:
      print_row(bigram, None, dev_score)

    print()

In [None]:
# Print the merged top-bigram rows (train vs. dev score) for FEVER, one
# section per label.
print('FEVER')
compare_dev_and_train(train_data=fever_train_lmi, dev_data=fever_dev_lmi)

FEVER
SUPPORTS
is a & $482$ & $625$ \\
in the & $343$ & $499$ \\
in a & $481$ & $437$ \\
is an & $362$ & $455$ \\
a film & $297$ & $428$ \\
was in & $402$ & $NONE$ \\
an american & $280$ & $375$ \\
a person & $302$ & $353$ \\
was a & $319$ & $285$ \\
there is & $NONE$ & $302$ \\
of the & $NONE$ & $289$ \\
starred in & $254$ & $NONE$ \\

REFUTES
is not & $1420$ & $938$ \\
is only & $622$ & $938$ \\
did not & $859$ & $528$ \\
not a & $775$ & $481$ \\
was not & $729$ & $NONE$ \\
incapable of & $721$ & $710$ \\
only a & $455$ & $717$ \\
is incapable & $474$ & $551$ \\
was only & $NONE$ & $536$ \\
has only & $447$ & $NONE$ \\
yet to & $420$ & $384$ \\
of being & $NONE$ & $385$ \\



In [None]:
# Print the merged top-bigram rows (train vs. dev score) for FM2, one
# section per label.
print('FM2')
compare_dev_and_train(train_data=fm2_train_lmi, dev_data=fm2_dev_lmi)

FM2
SUPPORTS
in the & $354$ & $1286$ \\
of a & $NONE$ & $464$ \\
in which & $NONE$ & $461$ \\
one of & $418$ & $NONE$ \\
book of & $NONE$ & $403$ \\
has been & $365$ & $NONE$ \\
, the & $NONE$ & $356$ \\
in a & $NONE$ & $352$ \\
the novel & $NONE$ & $351$ \\
and his & $NONE$ & $351$ \\
the united & $NONE$ & $348$ \\
the scarlet & $NONE$ & $345$ \\
more than & $268$ & $NONE$ \\
to be & $228$ & $NONE$ \\
of the & $222$ & $NONE$ \\
to the & $173$ & $NONE$ \\
with a & $166$ & $NONE$ \\
from the & $154$ & $NONE$ \\
years . & $138$ & $NONE$ \\

REFUTES
by a & $NONE$ & $562$ \\
mad , & $NONE$ & $502$ \\
, mad & $NONE$ & $502$ \\
on the & $NONE$ & $473$ \\
innocent iii & $NONE$ & $467$ \\
statue of & $NONE$ & $426$ \\
for his & $NONE$ & $407$ \\
pope innocent & $NONE$ & $407$ \\
mary , & $NONE$ & $365$ \\
queen of & $NONE$ & $365$ \\
the second & $338$ & $NONE$ \\
is a & $312$ & $NONE$ \\
was a & $307$ & $NONE$ \\
was the & $306$ & $NONE$ \\
is the & $233$ & $NONE$ \\
of his & $200$ & $NONE$ \