<a href="https://colab.research.google.com/github/giuliofortini/NLP_SQuAD_Project/blob/gpt/SQUAD_evaluating_questions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
# Colab-only setup: mount Google Drive under /content/drive.
import json
from google.colab import drive
drive.mount('/content/drive')

# NOTE(review): the install is unpinned; the saved output of this cell reports
# NLTK 3.5, so a fresh run may resolve to a different release -- consider pinning.
!pip install nltk --upgrade
import nltk

# Runtime data downloads. 'wordnet' backs the synonym lookup in get_synonims();
# 'punkt' is presumably what nltk.word_tokenize (used in normalize) needs.
nltk.download('wordnet')
nltk.download('punkt') # if necessary...

import string

from nltk.translate import nist_score, bleu_score, meteor_score

# NOTE(review): get_synonims() refers to a bare `wordnet` name that none of the
# imports in this cell bind.

# demo
#!/usr/bin/python
!pip install transformers
import transformers
from transformers import BertTokenizer
import tensorflow as tf
# Only let TensorFlow log at ERROR level.
tf.get_logger().setLevel('ERROR')
transformers.logging.set_verbosity_error() # suppress tokenizer sentences' length warnings

import numpy as np
import pandas as pd
#from utils import create_output_dict, preprocess_df, print_squad_sample, from_df_to_model_dict
#from model import build_model
# NOTE(review): json, tensorflow and the nltk.translate names below are already
# imported earlier in this cell; the repeats are harmless but redundant.
import json
import sys

import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.translate import bleu_score, nist_score, meteor_score



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Requirement already up-to-date: nltk in /usr/local/lib/python3.6/dist-packages (3.5)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!




# Demo


# Metrics

## Utils

In [None]:
def paragraph(text, max_width=80, indent=0):
  """Word-wrap `text` into lines and prefix each line with `indent` spaces.

  Each line is cut at the last space found at an index <= `max_width` of the
  remaining text, and is stripped of surrounding whitespace before the
  indentation is added (so `indent` is not counted against `max_width`).

  A word longer than `max_width` is never split: it is kept whole on its own
  line and simply overflows. (The previous recursive version walked past index
  0 in that case, which led to a wrong cut, an IndexError or unbounded
  recursion.)

  Args:
    text: the string to wrap.
    max_width: maximum index at which a line break may be placed.
    indent: number of spaces prepended to every produced line.

  Returns:
    The wrapped text, lines joined by "\n".
  """
  pad = " " * indent
  lines = []
  rest = text
  while len(rest) > max_width:
    # Last space within the allowed width. Index 0 is excluded on purpose:
    # cutting there would yield an empty line and make no progress.
    cut = rest.rfind(" ", 1, max_width + 1)
    if cut == -1:
      # Over-long word: break at the first space after it instead.
      cut = rest.find(" ", max(max_width + 1, 1))
      if cut == -1:
        break  # no break point left; the remainder becomes the last line
    lines.append(pad + rest[:cut].strip())
    rest = rest[cut:]
  lines.append(pad + rest.strip())
  return "\n".join(lines)

In [None]:
def remove_tags(text):
  """Delete the "[CTX]", "[QS]" and "[QE]" markers from `text`, then trim it.

  The markers are removed one after another in that order, and the result is
  stripped of leading/trailing whitespace.
  """
  cleaned = text
  for tag in ("[CTX]", "[QS]", "[QE]"):
    cleaned = cleaned.replace(tag, "")
  return cleaned.strip()

In [None]:
# Single Porter stemmer instance shared by stem_tokens().
stemmer = nltk.stem.porter.PorterStemmer()
# Table for str.translate(): maps the code point of every character in
# string.punctuation to None, i.e. the character is deleted.
remove_punctuation_map = {ord(symbol): None for symbol in string.punctuation}

def stem_tokens(tokens):
    """Return the stem of every token, in input order, using the shared `stemmer`."""
    stems = []
    for token in tokens:
        stems.append(stemmer.stem(token))
    return stems

def normalize(text):
    """Remove punctuation, lowercase and stem `text`; returns the list of stems.

    Steps, in order: lowercase the text, delete every character listed in
    `remove_punctuation_map`, tokenize with `nltk.word_tokenize`, and stem each
    token via `stem_tokens`.
    """
    lowered = text.lower()
    without_punctuation = lowered.translate(remove_punctuation_map)
    tokens = nltk.word_tokenize(without_punctuation)
    return stem_tokens(tokens)

def cosine_sim(text1, text2):
    """Return the TF-IDF similarity between `text1` and `text2`.

    A TfidfVectorizer (tokenized with `normalize`) is fitted on just the two
    texts; the result is the off-diagonal entry of the 2x2 product of the
    TF-IDF matrix with its transpose. With the vectorizer's default row
    normalisation this dot product is the cosine similarity.

    Args:
        text1: first text.
        text2: second text.

    Returns:
        The similarity score as a float-like scalar.
    """
    vectorizer = TfidfVectorizer(tokenizer=normalize)
    tfidf = vectorizer.fit_transform([text1, text2])
    # `@` is a matrix product for both sparse matrices and sparse arrays, and
    # .toarray() replaces the `.A` shorthand that recent SciPy releases dropped.
    similarity = (tfidf @ tfidf.T).toarray()
    return similarity[0, 1]

def jaccard(a, b):
  """Return the share of `b`'s normalized tokens that also occur in `a`.

  Both texts are reduced to sets of stems with `normalize`. The score is
  |A & B| / |B|, so it is asymmetric: it is divided by the size of `b`'s token
  set, not by the size of the union as in the textbook Jaccard index.

  Args:
    a: reference text.
    b: text whose tokens are looked up in `a`.

  Returns:
    A float in [0, 1]; 0.0 when `b` has no tokens after normalization
    (previously this case raised ZeroDivisionError).
  """
  set_a = set(normalize(a))
  set_b = set(normalize(b))
  if not set_b:
    # Nothing to match (e.g. empty or punctuation-only text).
    return 0.0
  return len(set_a & set_b) / len(set_b)

## Samples

{'bring_out', 'issue', 'print', 'publish', 'put_out', 'release', 'write'}

In [None]:
# Index intended for picking a (context, questions) pair out of a `questions`
# mapping; only the commented-out line below uses it.
sample_id = 42
#sample_context, sample_questions = list(questions.items())[sample_id]

# Hand-written context with questions ranging from on-topic to unrelated.
sample_context = "Bidirectional Encoder Representations from Transformers (BERT) is a Transformer-based machine learning technique for natural language processing (NLP) pre-training developed by Google. BERT was created and published in 2018 by Jacob Devlin and his colleagues from Google."
sample_questions = [
    "What is BERT?",
    "What is bert based on?",
    "Who released BERT in last two years?",
    "Who published BERT in last two years?",
    "What company did the creator of bert work for?",
    "What does BERT stand for?",
    "When did Dante Alighieri die?",
    "Where is Alma Mater Studiorum Located?",
    "Why have I been sitting here from an hour?",
]

# Show the context (tags removed, wrapped, indented by two spaces) and then
# the questions as a dashed list.
wrapped_context = paragraph(remove_tags(sample_context), indent=2)
print(f"Sample context:\n{wrapped_context}\n\nSample questions:")
for q in sample_questions:
  print("-", q)

Sample context:
  Bidirectional Encoder Representations from Transformers (BERT) is a
  Transformer-based machine learning technique for natural language processing
  (NLP) pre-training developed by Google. BERT was created and published in 2018
  by Jacob Devlin and his colleagues from Google.

Sample questions:
- What is BERT?
- What is bert based on?
- Who released BERT in last two years?
- Who published BERT in last two years?
- What company did the creator of bert work for?
- What does BERT stand for?
- When did Dante Alighieri die?
- Where is Alma Mater Studiorum Located?
- Why have I been sitting here from an hour?


# Metrics

In [None]:
def get_synonims(word):
  """Collect the WordNet lemma names of every synset of `word`.

  Args:
    word: the word to look up.

  Returns:
    A set of lemma-name strings as WordNet reports them. When WordNet has no
    synset for `word`, the set `{word}` is returned so the caller always gets
    at least one candidate.
  """
  # Imported here because no module-level import in this notebook binds
  # `wordnet`; without this the lookup fails with NameError on a fresh kernel.
  from nltk.corpus import wordnet

  syn_set = set()
  for syn in wordnet.synsets(word):
    syn_set.update(lemma.name() for lemma in syn.lemmas())
  # The old check `syn_set != []` compared a set with a list and was always
  # true, so the fallback to {word} could never trigger.
  return syn_set or {word}

def reformulate_sentence(sentence, verbose=False):
  """Build variants of `sentence` by swapping one word at a time for a synonym.

  The sentence is split on single spaces. For every non-empty token, each
  synonym from `get_synonims` (lowercased, underscores turned into spaces)
  replaces that token wherever it appears as a whole token. Replacement is done
  per token rather than per substring, so swapping "is" no longer rewrites the
  "is" inside "this".

  Args:
    sentence: the sentence to reformulate.
    verbose: when True, print each word being processed and every variant.

  Returns:
    A list whose first element is the original sentence, followed by all
    generated variants (duplicates are not removed).
  """
  alternatives = [sentence]
  tokens = sentence.split(" ")
  for word in tokens:
    if not word:
      # Consecutive spaces produce empty tokens; there is nothing to replace.
      continue
    if verbose: print(f"\n{word}...........")
    for syn in get_synonims(word):
      replacement = syn.lower().replace("_", " ")
      altern = " ".join(replacement if token == word else token for token in tokens)
      if verbose: print(altern)
      alternatives.append(altern)
  return alternatives

def proposed(reference, sentence, return_alternative=False, verbose=False):
  """Score `sentence` against `reference`, taking the best over its reformulations.

  Every variant produced by `reformulate_sentence` is scored with `jaccard`
  against `reference`; the maximum (score, variant) pair is selected, with
  ties on the score resolved by tuple comparison on the variant string.

  Args:
    reference: text the sentence is compared with.
    sentence: sentence to score.
    return_alternative: when True, return the (score, variant) pair instead
      of the score alone.
    verbose: when True, print the sentence and every scored variant.

  Returns:
    The best score, or the best (score, variant) tuple.
  """
  if verbose: print("\n", sentence)
  scored = []
  for candidate in reformulate_sentence(sentence):
    value = jaccard(reference, candidate)
    if verbose: print(value, candidate)
    scored.append((value, candidate))

  best = max(scored)
  return best if return_alternative else best[0]

In [None]:
# Quick check of the proposed metric on a lowercase, unpunctuated question;
# as the cell's last expression, the returned score is shown as its output.
proposed(sample_context, "what is bert")

0.6666666666666666

In [None]:
# Print the wrapped context, then a table with one row per sample question and
# one column per metric (each rounded to 3 decimals).
print(paragraph(sample_context), "\n\n")
print("{:<10}{:<10}{:<10}{:10}{:<10}\n{:.>80}".format("Jaccard", "Cosine", "METEOR", "Proposed", "Question", ""))
row_template = "{:<10}{:<10}{:<10}{:<10}{}"
for question in sample_questions:
  # NOTE(review): raw strings are passed here; newer NLTK releases may expect
  # pre-tokenized input for single_meteor_score -- confirm for the installed version.
  meteor = meteor_score.single_meteor_score(sample_context, question)
  rounded_scores = (
      round(jaccard(sample_context, question), 3),
      round(cosine_sim(sample_context, question), 3),
      round(meteor, 3),
      round(proposed(sample_context, question), 3),
  )
  print(row_template.format(*rounded_scores, question))

Bidirectional Encoder Representations from Transformers (BERT) is a
Transformer-based machine learning technique for natural language processing
(NLP) pre-training developed by Google. BERT was created and published in 2018
by Jacob Devlin and his colleagues from Google. 


Jaccard   Cosine    METEOR    Proposed  Question  
................................................................................
0.667     0.162     0.015     0.667     What is BERT?
0.4       0.115     0.03      0.4       What is bert based on?
0.286     0.094     0.03      0.429     Who released BERT in last two years?
0.429     0.131     0.045     0.429     Who published BERT in last two years?
0.222     0.081     0.015     0.333     What company did the creator of bert work for?
0.4       0.115     0.015     0.4       What does BERT stand for?
0.0       0.0       0.0       0.0       When did Dante Alighieri die?
0.167     0.032     0.015     0.167     Where is Alma Mater Studiorum Located?
0.111     0.052    

## Proposed

# Evaluation on the Test Set