In [1]:
from collections import Counter
import string
import re
import argparse
import json
import sys
import numpy as np
import nltk
import random
import math
import os
import pickle
from tqdm import tqdm

In [2]:
import spacy
# Blank English spaCy pipeline; the code visible in this notebook only uses
# its tokenizer (`nlp.tokenizer`, called from `word_tokenize`).
nlp = spacy.blank("en")

In [3]:
def pickler(path, pkl_name, obj):
    """Serialize ``obj`` with pickle into the file ``pkl_name`` inside ``path``.

    Args:
        path: Directory in which the pickle file is written.
        pkl_name: File name of the pickle inside ``path``.
        obj: Object to serialize.

    The file is opened in binary write mode, so an existing file with the
    same name is overwritten. ``pickle.HIGHEST_PROTOCOL`` is used.
    """
    target = os.path.join(path, pkl_name)
    with open(target, mode='wb') as out_file:
        pickle.dump(obj, out_file, protocol=pickle.HIGHEST_PROTOCOL)

def unpickler(path, pkl_name):
    """Load and return the object pickled in the file ``pkl_name`` inside ``path``.

    Args:
        path: Directory containing the pickle file.
        pkl_name: File name of the pickle inside ``path``.

    Returns:
        The deserialized object.

    Note:
        ``pickle.load`` can execute arbitrary code; only use this on files
        that come from a trusted source.
    """
    source = os.path.join(path, pkl_name)
    with open(source, mode='rb') as in_file:
        return pickle.load(in_file)

In [4]:
# Run configuration.
# TRAINING selects which dataset JSON is read (train vs. dev) and which
# output pickle name is used.
TRAINING = False

# Directory for output pickles (presumably passed to `pickler` later on;
# it is not used in the cells visible here).
out_pkl_path = "./"

# Number of previous turns kept as history (presumably the value passed to
# `get_prev_ids`; it is not used in the cells visible here).
context_history_size = 2

# Directory holding the CoQA JSON files. Set the COQA_DATA_DIR environment
# variable to run on another machine; the default keeps the original location.
coqa_data_dir = os.environ.get("COQA_DATA_DIR", "/home/bhargav/data/coqa")

if TRAINING:
    file_path = os.path.join(coqa_data_dir, "coqa-train-v1.0.json")
    out_pkl_name = "dataset_formatted_train.pkl"
else:
    file_path = os.path.join(coqa_data_dir, "coqa-dev-v1.0.json")
    out_pkl_name = "dataset_formatted_dev.pkl"

In [5]:
def normalize(text):
    """Return a cleaned, lower-cased version of ``text``.

    ``text`` is first converted with ``str``. The substitutions below are
    then applied in order, and the result is lower-cased and stripped of
    leading/trailing whitespace.

    Args:
        text: Any object; its ``str()`` form is normalized.

    Returns:
        The normalized string.
    """
    substitutions = (
        # Replace quotes, brackets, slashes, newlines and similar symbols
        # (including "!") with a single space each.
        (r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " "),
        # Collapse runs of space characters (spaces only, not tabs).
        (r"[ ]+", " "),
        # No-op in practice: every "!" was already replaced by the first
        # pattern. Kept so the pipeline matches the original step for step.
        (r"\!+", "!"),
        # Collapse repeated commas and repeated question marks.
        (r"\,+", ","),
        (r"\?+", "?"),
    )
    cleaned = str(text)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.lower().strip()

In [6]:
def word_tokenize(text):
    """Normalize ``text`` and split it into a list of token strings.

    The text is passed through ``normalize`` and then through the spaCy
    tokenizer ``nlp.tokenizer``; tokens whose text is exactly a single
    space are dropped.

    Args:
        text: Input accepted by ``normalize``.

    Returns:
        List of token strings.
    """
    tokens = []
    for token in nlp.tokenizer(normalize(text)):
        if token.text == " ":
            continue
        tokens.append(token.text)
    return tokens

In [7]:
def sent_tokenize(text):
    """Split ``text`` into sentences and tokenize each sentence into words.

    Sentences come from ``nltk.sent_tokenize`` applied to the raw ``text``;
    each sentence is then tokenized with ``word_tokenize`` (which also
    normalizes it).

    Args:
        text: Paragraph string.

    Returns:
        List of sentences, each a list of token strings.
    """
    return [word_tokenize(sentence) for sentence in nltk.sent_tokenize(text)]

In [8]:
def score_overlap(sent, ans):
    """Score how well the tokens of ``sent`` cover the tokens of ``ans``.

    Args:
        sent: Iterable of hashable tokens for the candidate sentence.
        ans: Iterable of hashable tokens for the answer/span.

    Returns:
        ``100`` minus the number of distinct ``ans`` tokens that do not
        occur in ``sent``; a higher score means better coverage.
    """
    sentence_vocab = set(sent)
    uncovered = {token for token in set(ans) if token not in sentence_vocab}
    return 100 - len(uncovered)

In [9]:
def find_matching_sentence(passage, span):
    """Return a one-hot list marking the sentence of ``passage`` that best matches ``span``.

    Every sentence is scored with ``score_overlap(sentence, span)``; the
    highest-scoring sentence gets a ``1`` and all others ``0``. On ties the
    first highest-scoring sentence wins.

    Args:
        passage: Sequence of sentences, each an iterable of tokens.
        span: Iterable of tokens to locate in the passage.

    Returns:
        List of ``len(passage)`` integers with exactly one ``1``, or an
        empty list when ``passage`` is empty.
    """
    if len(passage) == 0:
        # argmax raises ValueError on an empty sequence; an empty passage
        # simply has no sentence to mark.
        return []
    matching_scores = [score_overlap(sent, span) for sent in passage]
    best_match_index = int(np.argmax(matching_scores))
    matching_sentence = [0] * len(passage)
    matching_sentence[best_match_index] = 1
    return matching_sentence

def get_prev_ids(current_index, context_history_size, turn_id):
    """Return the indices of the ``context_history_size`` entries preceding ``current_index``.

    The result starts as
    ``[current_index - context_history_size, ..., current_index - 1]``. The
    first ``context_history_size - turn_id + 1`` positions are then
    overwritten with ``0``, so slots that reach back before the first turn
    (``turn_id`` is 1-based) hold ``0`` instead of an index.

    Args:
        current_index: Index of the current entry.
        context_history_size: Number of previous entries to return.
        turn_id: Turn number of the current entry; must not be 0.

    Returns:
        List of ``context_history_size`` integers.
    """
    ids = list(range(current_index - context_history_size, current_index))
    assert turn_id != 0
    # Number of leading history slots that have no real previous turn.
    slots_without_history = context_history_size - turn_id + 1
    slot = 0
    while slot < slots_without_history:
        ids[slot] = 0
        slot += 1
    return ids

def get_rationales(dataset_in):
    """Collect the normalized rationale text of every question in the dataset.

    For each entry of ``dataset_in['data']`` and each index ``i`` over that
    entry's ``'questions'`` list, ``answers[i]['span_text']`` is passed
    through ``normalize`` and appended to the result. A tqdm progress bar
    is shown over the entries.

    Args:
        dataset_in: Dict with a ``'data'`` list whose items have
            ``'questions'`` and ``'answers'`` lists.

    Returns:
        Flat list of normalized rationale strings, in dataset order.
    """
    rationales = []
    for passage in tqdm(dataset_in['data']):
        question_count = len(passage['questions'])
        rationales.extend(
            normalize(passage['answers'][turn]['span_text'])
            for turn in range(question_count)
        )
    return rationales

In [10]:
# Read the JSON file selected in the configuration cell (`file_path`) into memory.
with open(file_path, encoding='utf8') as file:
    dataset_original = json.load(file)

In [11]:
# One normalized rationale string (the answer's `span_text`) per question in the loaded dataset.
rationales = get_rationales(dataset_original)

100%|██████████| 500/500 [00:00<00:00, 3136.06it/s]


In [12]:
# Number of sentences nltk finds in each rationale.
# NOTE(review): the rationales were already passed through `normalize`
# (lower-cased, some punctuation replaced by spaces) before this sentence
# split — confirm that this is intended, since it may change where nltk
# places sentence boundaries.
num_sentences_in_rationale = [len(nltk.sent_tokenize(r)) for r in rationales]

In [13]:
# Summary of sentences per rationale: minimum, mean, maximum.
print(min(num_sentences_in_rationale))
print(np.mean(num_sentences_in_rationale))
print(max(num_sentences_in_rationale))

1
1.0771639734435676
25


In [14]:
# Index of the rationale with the most sentences (the first one on ties).
np.argmax(num_sentences_in_rationale)

222

In [15]:
# Show the sentences of the rationale with the most sentences. The index is
# computed rather than copied by hand from an earlier output, so this cell
# stays correct when the dataset or split (TRAINING) changes.
longest_rationale_index = int(np.argmax(num_sentences_in_rationale))
nltk.sent_tokenize(rationales[longest_rationale_index])

['hapter xxii northward, along the leeward coast of malaita, the _ariel_ worked her leisurely way, threading the colour riotous lagoon that lay between the shore reefs and outer reefs, daring passages so narrow and coral patched that captain winters averred each day added a thousand grey hairs to his head, and dropping anchor off every walled inlet of the outer reef and every mangrove swamp of the mainland that looked promising of cannibal life.',
 'for harley and villa kennan were in no hurry.',
 'so long as the way was interesting, they dared not how long it proved from anywhere to anywhere.',
 'during this time jerry learned a new name for himself or, rather, an entire series of names for himself.',
 "this was because of an aversion on harley kennan's part against renaming a named thing.",
 'a name he must have had, he argued to villa.',
 'haggin must have named him before he sailed on the _arangi_.',
 'therefore, nameless he must be until we get back to tulagi and find out his real

In [16]:
# 0/1 indicator per rationale: 1 when it spans more than one sentence.
sentence_counts = [int(n > 1) for n in num_sentences_in_rationale]

# fraction of rationales with more than 1 sentence

In [17]:
# Mean of the 0/1 indicators, i.e. the share of rationales with more than one sentence.
# Raises ZeroDivisionError when `sentence_counts` is empty.
sum(sentence_counts)/len(sentence_counts)

0.053363397219090565