Filter Dataset According to Dissimilarity

Author: Fatma Ben Ayed

Copyright (C) 2021 Fatma Ben Ayed and DynaGroup i.T. GmbH

In [1]:
# Mount Google Drive under /content/drive; the dataset and project source
# paths used further down all live below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install sacrebleu > /dev/null
!pip install rouge > /dev/null
!pip install datasets > /dev/null
!pip install rouge_score > /dev/null
!pip install sentencepiece > /dev/null
!pip install bert_score > /dev/null
!pip install wget > /dev/null

In [25]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split

In [3]:
import sys
# Make the project's own modules (stored on the mounted Drive) importable.
sys.path.append("/content/drive/MyDrive/Paraphrasing API/src")

from our_metrics import Metrics

# NOTE(review): `the_metrics` is not referenced by any later cell visible in
# this notebook (only by a commented-out call) — confirm it is still needed.
the_metrics = Metrics()

Downloading:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.93k [00:00<?, ?B/s]

Your python interpreter: /usr/bin/python3


In [4]:
# the_metrics.install_bleurt_model()

In [5]:
# Load the raw sentence/paraphrase pairs from the mounted Drive and display them.
df = pd.read_csv("/content/drive/MyDrive/Paraphrasing API/datasets/eneko/reduced_raw_text.csv")
df

Unnamed: 0.1,Unnamed: 0,sentence,paraphrase
0,0,"For the first time in league history , Gary Ga...",For the first time in the history of the leagu...
1,1,Currently Rozova is contracted with Wilhelmina...,Rozova is currently signed with Wilhelmina Mod...
2,2,"The work was concluded in 2003 in his fifth , ...","The work was completed in his fifth in 2003 , ..."
3,3,How do small scale industries differ from larg...,What are some unique characteristics of large ...
4,4,"In 2014 , the festival was once more held at t...","In 2014 , the festival was held again in the S..."
...,...,...,...
45806,45806,Both Iran and Saudi Arabia rejected the use of...,Both Iran and Saudi Arabia rejected the use of...
45807,45807,What are the safety precautions on handling sh...,What are the safety precautions on handling sh...
45808,45808,The Total Total Evaporation is The Radiators '...,Total Evaporation is the fourth album by The R...
45809,45809,He is mentioned twice in Aleister Crowley 's n...,He is mentioned twice in Aleister Crowley 's n...


In [7]:
# Force both text columns to plain strings so the tokenizer can call str
# methods on every row.
# NOTE(review): any missing values would become the literal string "nan" here.
df["sentence"] = df["sentence"].astype(str)
df["paraphrase"] = df["paraphrase"].astype(str)

In [8]:
# Pick one example pair (row 1) to sanity-check the tokenizer and the BLEU
# score in the following cells.
# Columns are addressed by name rather than by position: positional indices
# silently shift depending on whether the CSV carries a saved-index column.
input1 = df["sentence"].iloc[1]
output1 = df["paraphrase"].iloc[1]

In [14]:
def get_translation_table():
  """Build a `str.translate` table that blanks out unwanted punctuation.

  Every ASCII punctuation character except the apostrophe, the backtick and
  the hyphen is mapped to a single space; those three are left untouched so
  that contractions and hyphenated words stay in one piece.

  Returns:
    dict mapping the code point of each removed character to the code point
    of a space, suitable for passing to `str.translate`.
  """
  # Symbols that carry meaning inside words and must survive tokenization
  keep = "'`-"

  # ASCII punctuation as defined by Python
  ascii_punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

  # Everything in the punctuation set that is not explicitly kept
  to_blank = "".join(ch for ch in ascii_punctuation if ch not in keep)

  return str.maketrans(to_blank, " " * len(to_blank))

# Built once at definition time and reused for every tokenization call
translation_table = get_translation_table()

def tokenize_text_for_diversity(text):
  """Turn one string into a list of lowercase, whitespace-separated tokens.

  Characters covered by the module-level `translation_table` are replaced
  with spaces before the text is lowercased and split.

  Args:
    text: the string to tokenize.

  Returns:
    list of token strings (empty if `text` holds no tokens).
  """
  without_punctuation = text.translate(translation_table)
  lowered = without_punctuation.lower()
  # split() with no argument already ignores leading/trailing whitespace
  return lowered.split()

def tokenize_for_diversity(input, output):
  """Tokenize an input/output pair, given either as strings or as batches.

  Args:
    input: a single string, or an iterable of strings.
    output: same type as `input`.

  Returns:
    A 2-tuple. For string arguments, each element is a list of tokens; for
    batches, each element is a list of token lists (one per text).

  Raises:
    AssertionError: if `input` and `output` are not of the same type.
  """
  assert type(input) == type(output)

  if type(input) != str:
    # Batch case: tokenize each text on its own, input batch first
    tokenized_inputs = list(map(tokenize_text_for_diversity, input))
    tokenized_outputs = list(map(tokenize_text_for_diversity, output))
    return tokenized_inputs, tokenized_outputs

  # Single-pair case
  return tokenize_text_for_diversity(input), tokenize_text_for_diversity(output)

In [15]:
# Tokenize the example pair selected above.
input_tok, output_tok = tokenize_for_diversity(input1, output1)

In [16]:
# Inspect the tokens produced for the example paraphrase.
output_tok

['rozova',
 'is',
 'currently',
 'signed',
 'with',
 'wilhelmina',
 'models',
 'in',
 'hong',
 'kong',
 'and',
 'style',
 'international',
 'management',
 'in',
 'new',
 'york']

In [17]:
# Unigram-only BLEU (weights = [1]) of the example paraphrase, scored against
# the original sentence as the single reference.
BLEUscore = nltk.translate.bleu_score.sentence_bleu(
    references=[input_tok],
    hypothesis=output_tok,
    weights=[1],
)
print(BLEUscore)

0.9411764705882353


In [20]:
# A pair is kept only if the unigram BLEU score of the paraphrase against the
# original sentence is strictly below this threshold.
MAX_UNIGRAM_BLEU = 0.8

diverse_dataset = []
# Walk the two text columns by name instead of doing two positional `.iloc`
# scalar lookups per row: this is much faster and does not depend on the
# position of the columns in the frame.
for sentence, paraphrase in zip(df["sentence"], df["paraphrase"]):
    input_tok, output_tok = tokenize_for_diversity(sentence, paraphrase)
    BLEUscore = nltk.translate.bleu_score.sentence_bleu([input_tok], output_tok, weights = [1])
    if BLEUscore < MAX_UNIGRAM_BLEU:
        diverse_dataset.append([sentence, paraphrase])

In [21]:
# Collect the pairs that passed the BLEU filter into a two-column frame and display it.
df_diverse = pd.DataFrame(diverse_dataset, columns=["sentence", "paraphrase"])
df_diverse

Unnamed: 0,sentence,paraphrase
0,How do small scale industries differ from larg...,What are some unique characteristics of large ...
1,What is your New Year’s Resolution(s) for 2017?,What's your New Year's resolution for 2017?
2,Helicopters hovered throughout the day over th...,Helicopters hovered over al-Khalidiya into the...
3,Is it possible to gain height after 20?,How can I increase in height after 20 years?
4,What are all the govt jobs for mechanical engi...,What are the government jobs for a mechanical ...
...,...,...
15307,Is Hillary Clinton a war hawk?,Why does Hillary Clinton have so hawkish forei...
15308,"The document stated: ""This report does not att...","The report released Monday simply says , This ..."
15309,The U.S. Supreme Court has previously ruled th...,The Supreme Court long ago held that students ...
15310,What's your favorite movies and why?,What are some of your favorite movies?


In [22]:
def remove_end_spaces(string):
    """Return `string` with all trailing whitespace removed.

    Leading and inner whitespace is left untouched.

    Args:
        string: the text to trim.

    Returns:
        The trimmed string.
    """
    # str.rstrip() already returns the final string; re-joining its
    # characters one by one was a redundant copy.
    return string.rstrip()

In [24]:
# Strip trailing whitespace from both texts of every retained pair.
# The columns are iterated by name in a single pass instead of issuing two
# positional `.iloc` scalar lookups per row.
diverse_dataset = [
    [remove_end_spaces(sentence), remove_end_spaces(paraphrase)]
    for sentence, paraphrase in zip(df_diverse["sentence"], df_diverse["paraphrase"])
]

df_diverse2 = pd.DataFrame(diverse_dataset, columns=["sentence", "paraphrase"])
df_diverse2

Unnamed: 0,sentence,paraphrase
0,How do small scale industries differ from larg...,What are some unique characteristics of large ...
1,What is your New Year’s Resolution(s) for 2017?,What's your New Year's resolution for 2017?
2,Helicopters hovered throughout the day over th...,Helicopters hovered over al-Khalidiya into the...
3,Is it possible to gain height after 20?,How can I increase in height after 20 years?
4,What are all the govt jobs for mechanical engi...,What are the government jobs for a mechanical ...
...,...,...
15307,Is Hillary Clinton a war hawk?,Why does Hillary Clinton have so hawkish forei...
15308,"The document stated: ""This report does not att...","The report released Monday simply says , This ..."
15309,The U.S. Supreme Court has previously ruled th...,The Supreme Court long ago held that students ...
15310,What's your favorite movies and why?,What are some of your favorite movies?


In [None]:
# Persist the filtered, trimmed pairs.
# NOTE(review): this writes `dataset.csv` relative to the current working
# directory, while the next cell reads a `dataset.csv` from the Drive folder
# `Eneko_diverseDataset` — confirm how the file gets there, or write it there
# directly. Also, `to_csv` writes the row index as an unnamed first column by
# default; confirm that is intended.
df_diverse2.to_csv('dataset.csv')

In [None]:
# Reload the filtered dataset from Drive.
# NOTE(review): this rebinds `df`, which previously held the raw, unfiltered pairs.
df = pd.read_csv("/content/drive/MyDrive/Paraphrasing API/datasets/Eneko_diverseDataset/dataset.csv")

In [None]:
# 80/20 shuffled train/validation split. The seed is fixed so that re-running
# the notebook regenerates exactly the same two files.
df_train, df_val = train_test_split(df, train_size=0.8, shuffle=True, random_state=42)

In [None]:
# Write the train and validation splits next to the filtered dataset on Drive.
# NOTE(review): `to_csv` writes the row index by default, and `df` was itself
# read back from a CSV — confirm whether the extra index column(s) in these
# files are wanted by downstream consumers.
df_train.to_csv("/content/drive/MyDrive/Paraphrasing API/datasets/Eneko_diverseDataset/diverse_train.csv")
df_val.to_csv("/content/drive/MyDrive/Paraphrasing API/datasets/Eneko_diverseDataset/diverse_val.csv")