In [1]:
# Load the autoreload extension and set it to mode 2 so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload  
%autoreload 2 
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [2]:
import torch 
from tqdm import tqdm
import json
from datasets import load_dataset
from sentence_transformers import CrossEncoder

In [3]:
# Load the 'auto_acl' configuration of the 'wiki_auto' dataset; later cells
# read its 'full' split for (normal_sentence, simple_sentence) pairs.
dataset = load_dataset('wiki_auto', 'auto_acl')

Reusing dataset wiki_auto (/Users/garylai/.cache/huggingface/datasets/wiki_auto/auto_acl/1.0.0/5ffdd9fc62422d29bd02675fb9606f77c1251ee17169ac10b143ce07ef2f4db8)
100%|██████████| 1/1 [00:00<00:00, 13.45it/s]


In [4]:
# Cross-encoder used to score a (normal, simple) sentence pair. In this notebook
# it is only referenced by the commented-out scoring loop in the next cell.
model = CrossEncoder('cross-encoder/stsb-roberta-large')

In [5]:
# get paraphrase similarity scores (disabled; the next cell loads precomputed scores from ./scores.pt instead)
# scores = []
# for sample in tqdm(zip(dataset['full'][:500]['normal_sentence'], dataset['full'][:500]['simple_sentence'])):
#     normal_sentence, simple_sentence = sample
#     score = model.predict([normal_sentence, simple_sentence])
#     scores.append(score)

In [5]:
# Load the precomputed similarity scores from disk instead of recomputing them.
# To rebuild from a fresh scoring run instead:
# score_tensor = torch.tensor(scores)
# NOTE(review): torch.load unpickles the file, so only load a scores.pt you trust.
score_tensor = torch.load("./scores.pt")

In [6]:
# score_tensor.shape[0]

In [6]:
# Rank every score: k is the size of the first dimension, so topk returns all
# scores (largest first) together with their original positions.
# NOTE(review): assumes score_tensor is 1-D, one score per dataset row - confirm.
topk_scores, topk_indices = torch.topk(score_tensor, score_tensor.shape[0])

In [7]:
# utility: map each original row index to its similarity score.
# .tolist() converts each tensor to plain Python ints/floats in a single pass,
# which avoids a per-element tensor -> scalar conversion inside a Python loop
# and does not leave loop variables behind in the notebook namespace.
index_to_score = dict(zip(topk_indices.tolist(), topk_scores.tolist()))

In [8]:
def filter_indices(difference=80, similarity_threshold=0.8,
                   indices=None, scores=None, records=None):
    """
    Select the indices of sentence pairs that are similar enough and whose
    normal sentence is sufficiently longer than its simple counterpart.

    Args: 
        - difference: normal sentence must be at least `difference` characters longer to be selected
        - similarity_threshold: normal sentence and simple sentence must be at least this similarity and above
        - indices: iterable of candidate row indices (anything `int()` accepts);
          defaults to the notebook-level `topk_indices`
        - scores: mapping from row index to similarity score;
          defaults to the notebook-level `index_to_score`
        - records: collection indexable by row index whose items expose
          'normal_sentence' and 'simple_sentence'; defaults to `dataset['full']`

    Returns:
        A list of int indices that pass both checks, in the order they
        appear in `indices`.
    """
    # Fall back to the notebook globals only when the caller did not supply data,
    # so existing calls such as filter_indices(difference=..., similarity_threshold=...)
    # behave exactly as before.
    if indices is None:
        indices = topk_indices
    if scores is None:
        scores = index_to_score
    if records is None:
        records = dataset['full']

    filtered_indices = []
    for i in indices:
        index = int(i)
        # check two sentences are sufficiently similar
        # (cheap dictionary lookup first, so rejected rows are never fetched)
        if scores[index] < similarity_threshold:
            continue
        # fetch the row once instead of once per field
        row = records[index]
        # check normal sentence is sufficiently longer
        if len(row['normal_sentence']) - len(row['simple_sentence']) < difference:
            continue
        filtered_indices.append(index)

    return filtered_indices

In [9]:
# Keep pairs whose normal sentence is at least 80 characters longer than the
# simple one and whose similarity score is at least 0.86.
filtered_indices = filter_indices(difference=80, similarity_threshold=0.86)

In [10]:
# Number of pairs that passed both the length and the similarity filter.
len(filtered_indices)

1790

In [11]:
# Build a dataset containing only the selected rows of the 'full' split.
# Later cells rely on row i of filtered_ds corresponding to filtered_indices[i].
filtered_ds = dataset['full'].select(filtered_indices)

In [12]:
def clean_sentence(sentence):
    """Return `sentence` with bracket placeholders restored and newlines removed.

    "-LRB-" becomes "(", "-RRB-" becomes ")", and every newline character is
    dropped. Text without any of these substrings is returned unchanged.
    """
    substitutions = (
        ("-LRB-", "("),
        ("-RRB-", ")"),
        ("\n", ""),
    )

    result = sentence
    for token, substitute in substitutions:
        # str.replace is a no-op when `token` is absent, so no membership test is needed
        result = result.replace(token, substitute)
    return result

# print(clean_sentence(filtered_ds['normal_sentence'][0]))

In [13]:
# check: print the first 21 filtered pairs (cleaned) with their original
# dataset index and similarity score
for i in range(min(21, len(filtered_ds))):
    row = filtered_ds[i]
    normal_sentence = clean_sentence(row['normal_sentence'])
    simple_sentence = clean_sentence(row['simple_sentence'])
    print("-" * 80)
    print(f"normal_sentence: {normal_sentence} \nsimple_sentence: {simple_sentence} \noriginal_index: {filtered_indices[i]}" )
    print("score: ", index_to_score[filtered_indices[i]])

--------------------------------------------------------------------------------
normal_sentence: A tram stop , tram station , streetcar stop , or light rail station is a place designated for a tram , streetcar , or light rail vehicle to stop so passengers can board or alight it . 
simple_sentence: A tram stop is a place where a tram stops so passengers can get on or get off . 
original_index: 347272
score:  0.9704071283340454
--------------------------------------------------------------------------------
normal_sentence: On March 13 , 2018 , President Donald Trump announced he would nominate Haspel to be the Director of the Central Intelligence Agency , replacing Mike Pompeo — whom he tapped to become the new Secretary of State . 
simple_sentence: On March 13 , 2018 , Haspel was nominated by President Trump to become the CIA Director , replacing Mike Pompeo . 
original_index: 424382
score:  0.9700686931610107
---------------------------------------------------------------------------

In [14]:
import random
# `collections.Set` was a deprecated alias that Python 3.10 removed; the ABC
# lives in `collections.abc`, which works on every Python 3 version.
from collections.abc import Set

# Build the task instances that are later written to JSON.
# Direction: the simple sentence is the input and the normal (more elaborate)
# sentence is the single reference output.
unique_sentences = set()

Instances = []
for i in range(len(filtered_ds)):
    # fetch the row once instead of re-indexing the dataset for every field
    row = filtered_ds[i]
    raw_normal_sentence = row['normal_sentence']

    # prevent duplicates (keyed on the raw, uncleaned normal sentence)
    if raw_normal_sentence in unique_sentences:
        continue

    unique_sentences.add(raw_normal_sentence)
    normal_sentence = clean_sentence(raw_normal_sentence)
    simple_sentence = clean_sentence(row['simple_sentence'])
    Instances.append({
        "input": simple_sentence,
        "output": [normal_sentence]
    })

print("first instance: ", Instances[0])

  from collections import Set


first instance:  {'input': 'A tram stop is a place where a tram stops so passengers can get on or get off .', 'output': ['A tram stop , tram station , streetcar stop , or light rail station is a place designated for a tram , streetcar , or light rail vehicle to stop so passengers can board or alight it .']}


In [15]:
# Number of instances kept after skipping duplicate normal sentences.
len(Instances)

1763

In [16]:
# Shuffle in place with a dedicated, seeded generator so that a fresh
# "Restart & Run All" always exports the instances in the same order, without
# touching the state of the global `random` module.
# NOTE: re-running only this cell in a live kernel permutes the list again.
random.Random(42).shuffle(Instances)
print("first instance: ", Instances[0])

first instance:  {'input': 'Many of the other books they say are written by Ezra ( First Esdras , 3-6 Ezra ) are written after that .', 'output': ['The canonical Book of Ezra and Book of Nehemiah are the oldest sources for the activity of Ezra , whereas many of the other books ascribed to Ezra ( First Esdras , 3 – 6 Ezra ) are later literary works dependent on the canonical books of Ezra and Nehemiah .']}


In [17]:
# Sanity check: the shuffle is in place, so the count must be unchanged.
len(Instances)

1763

In [24]:
# Task description that is exported as JSON: metadata, the instruction shown to
# annotators/models, hand-written positive and negative examples, and the
# instances built earlier in the notebook.
task_json = {
    "Contributors": ["Gary Haizhi Lai"],
    "Source": ["wiki_auto"],
    "Categories": ["Style Transfer"],
    "Definition": "In this task, we ask you to make the sentence sound more sophisticated without changing its general meaning. You can do so by using more advanced words, utilizing more complex sentence structures, and making the statement more precise by adding auxiliary information etc.",
    # Examples where the output is a valid, more sophisticated rewrite of the input.
    "Positive Examples": [
        {
            "input": "Rutherford discovered the radioactive half-life, the chemical element radon, and the three parts of radiation which he named Alpha, Beta, and Gamma .\n",
            "output": "In his early work , Rutherford discovered the concept of radioactive half-life, the radioactive element radon, and classified three types of radiations: alpha, beta and gamma radiation .\n",
            "explanation": "The output sentence used more complex words such as \"discovered\" and \"classified\"."
        },
        {
            "input": "The Inheritance Cycle is a series of fantasy books written by Christopher Paolini.",
            "output": "The Inheritance Cycle is a tetralogy of young adult high fantasy novels written by American author Christopher Paolini.",
            "explanation": "The output sentence is more specific and precise than the input sentence -- it contains auxiliary information such as that the Inheritance Cycle is a \"tetralogy of young adult high fantasy novels\"."
        },
        {
        "input": "The Greco-Roman or Graeco-Roman world , refers to geographical regions and countries who had the language , culture , government or religion of the ancient Greeks and Romans.",
        "output": "The Greco-Roman world , Greco-Roman culture , or the term Greco-Roman (spelled Graeco-Roman in the United Kingdom and the Commonwealth), when used as an adjective , as understood by modern scholars and writers , refers to those geographical regions and countries that culturally ( and so historically ) were directly , long-term , and intimately influenced by the language , culture , government and religion of the ancient Greeks and Romans.",
        "explanation": "The output sentence has more sophisticated vocabulary and sentence structures. It adds qualifiers and is much more specific."
        }
    ],
    # Examples of invalid outputs: a simplification, and a rewrite that changes the meaning.
    "Negative Examples": [{"input": "Boryla, an American football quarterback, did not participate in the 1952 playoffs.",
                            "output": "Boryla was not in the 1952 playoffs.", 
                            "explanation": "The output sentence less sophisticated than the input sentence -- it is less specific and uses simpler words."},
                          {"input": "The wild population in China decreased to around 2,000 in 2005.",
                           "output": "By 2005, the wild population decreased to about 2,000.",
                           "explanation": "The output sentence changed the meaning of the input sentence."
                           }
                          ],
    # List of {"input": simple sentence, "output": [normal sentence]} dicts.
    "Instances": Instances
}

# export
# ensure_ascii=False writes non-ASCII characters (e.g. dashes in the sentences)
# as-is, so the file encoding is pinned to UTF-8 rather than left to the
# platform default, which can raise UnicodeEncodeError or produce a file that
# other tools cannot decode.
with open('task934_wiki_auto_style_transfer.json', 'w', encoding='utf-8') as fp:
    final_json = json.dumps(task_json, indent=4, ensure_ascii=False)
    # print() appends the trailing newline at the end of the file
    print(final_json, file=fp)