# Get combined dataset

Authors: Eneko Ruiz \<eneko.ruiz@ehu.eus\>

Copyright (C) 2021 Eneko Ruiz and DynaGroup i.T. GmbH

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install datasets > /dev/null
!pip install contractions > /dev/null

In [None]:
from datasets import concatenate_datasets, load_dataset
import numpy as np
import pandas as pd
import os
import contractions
from xml.dom import minidom
import xml.etree.ElementTree as ET
from sklearn.model_selection import train_test_split

In [None]:
raw_data_path = "/content/drive/MyDrive/Paraphrasing API/datasets/eneko/raw_dataset"
data_path = "/content/drive/MyDrive/Paraphrasing API/datasets/eneko/data"
combined_data_path = "/content/drive/MyDrive/Paraphrasing API/datasets/eneko"

In [None]:
# Quora (has no train-test-val split, has to be done manually)
quora_dataset = load_dataset("quora", split="train").filter(lambda x: x['is_duplicate'] == True)
quora_lst = []

for i in range(len(quora_dataset)):
    k = quora_dataset[i]['questions']['text'][0]
    v = quora_dataset[i]['questions']['text'][1]
    quora_lst.append([k, v])


pd_quora = pd.DataFrame(quora_lst,columns=["sentence", "paraphrase"])

Using custom data configuration default
Reusing dataset quora (/root/.cache/huggingface/datasets/quora/default/0.0.0/36ba4cd42107f051a158016f1bea6ae3f4685c5df843529108a54e42d86c1e04)
Loading cached processed dataset at /root/.cache/huggingface/datasets/quora/default/0.0.0/36ba4cd42107f051a158016f1bea6ae3f4685c5df843529108a54e42d86c1e04/cache-4f77793f58a82c42.arrow


In [None]:
# PAWS (it has splits)
paws_dataset = load_dataset("paws", "labeled_final").filter(lambda x: x['label'] == 1)
paws_train = paws_dataset["train"]
paws_test = paws_dataset["test"]
paws_eval = paws_dataset["test"]
paws = concatenate_datasets([paws_train, paws_test, paws_eval])


df_paws = paws.to_pandas()
df_paws = df_paws.drop(["id", "label"], axis=1).rename(columns={"sentence1": "sentence", "sentence2": "paraphrase"})

Reusing dataset paws (/root/.cache/huggingface/datasets/paws/labeled_final/1.1.0/09d8fae989bb569009a8f5b879ccf2924d3e5cd55bfe2e89e6dab1c0b50ecd34)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/paws/labeled_final/1.1.0/09d8fae989bb569009a8f5b879ccf2924d3e5cd55bfe2e89e6dab1c0b50ecd34/cache-b80641487074cc5f.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/paws/labeled_final/1.1.0/09d8fae989bb569009a8f5b879ccf2924d3e5cd55bfe2e89e6dab1c0b50ecd34/cache-ca6b4eee67b51efc.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/paws/labeled_final/1.1.0/09d8fae989bb569009a8f5b879ccf2924d3e5cd55bfe2e89e6dab1c0b50ecd34/cache-864e08398d1cbaec.arrow


It seems they are the only available datasets on HuggingFace

In [None]:
# Download MSRP
# ! wget -O "msr_train.txt" "https://github.com/wasiahmad/paraphrase_identification/blob/master/dataset/msr-paraphrase-corpus/msr_paraphrase_train.txt" 
# ! wget -O "msr_test.txt" "https://github.com/wasiahmad/paraphrase_identification/blob/master/dataset/msr-paraphrase-corpus/msr_paraphrase_train.txt" 
# ! mv "msr_train.txt" "/content/drive/MyDrive/Paraphrasing API/datasets/eneko/dataset/"
# ! mv "msr_test.txt" "/content/drive/MyDrive/Paraphrasing API/datasets/eneko/dataset/"

In [None]:
# Download ParaNMT
# ! wget "http://www.cs.cmu.edu/~jwieting/para-nmt-50m-demo.zip"
# ! unzip -d para-nmt-50m-demo.zip && rm para-nmt-50m-demo.zip

In [None]:
msrp_distributes_dir = os.path.join(data_path, "msrp_distribute")
sentences_msrp = os.path.join(msrp_distributes_dir, "phrases.txt")
phrase_groups_msrp = os.path.join(msrp_distributes_dir, "phrase_groups.csv")

df = pd.read_csv(phrase_groups_msrp, header = 0)
data_dict_msrp = dict()

for index, row in df.iterrows():
    paraphrase_group_index = row['paraphrase_group_index']
    phrase_index = row['phrase_index']
    if paraphrase_group_index not in data_dict_msrp.keys():
        data_dict_msrp[paraphrase_group_index] = [phrase_index]
    else:
        data_dict_msrp[paraphrase_group_index].append(phrase_index)

keys_group_phrases_msrp = list(data_dict_msrp.values())
sentences_msrp = np.loadtxt(sentences_msrp, delimiter="\n", dtype=str)

sentences_map_msrp = [list(sentences_msrp[keys_group_phrases_msrp[i]]) for i in range(len(keys_group_phrases_msrp))]
df_msrp = pd.DataFrame(columns=["sentence", "paraphrase"])
i = 0
for sentence_tuple in sentences_map_msrp:
    for paraphrase in sentence_tuple[1:]:
        df_msrp.loc[i] = [sentence_tuple[0], paraphrase]
        i += 1

In [None]:
opinosis_dir = os.path.join(data_path, "opinosis_distribute")
sentences_opinosis = os.path.join(opinosis_dir, "phrases.txt")
phrase_groups_opinosis = os.path.join(opinosis_dir, "phrase_groups.csv")

df = pd.read_csv(phrase_groups_opinosis, header = 0)
data_dict_opinosis = dict()

for index, row in df.iterrows():
    paraphrase_group_index = row['paraphrase_group_index']
    phrase_index = row['phrase_index']    
    if paraphrase_group_index not in data_dict_opinosis.keys():
        data_dict_opinosis[paraphrase_group_index] = [phrase_index]
    else:
        data_dict_opinosis[paraphrase_group_index].append(phrase_index)

keys_group_phrases_opinosis = list(data_dict_opinosis.values())
sentences_opinosis = np.loadtxt(sentences_opinosis, delimiter="\n", dtype=str)

sentences_map_opinosis = [list(sentences_opinosis[keys_group_phrases_opinosis[i]]) for i in range(len(keys_group_phrases_opinosis))]
df_opinosis  = pd.DataFrame(columns=["sentence", "paraphrase"])
i = 0
for sentence_tuple in sentences_map_opinosis:
    for paraphrase in sentence_tuple[1:]:
        df_opinosis.loc[i] = [sentence_tuple[0], paraphrase]
        i += 1

In [None]:
# This required a loooooot of RAM and probably going to crash
# sentences_paranmt = os.path.join(data_path, "para-nmt-5m-processed.txt")
# para_nmt = np.genfromtxt(sentences_paranmt, delimiter="\t", dtype=str, missing_values="1997", invalid_raise = False)

In [None]:
sentences_good = os.path.join(data_path, "data_good")

p4p_dir = os.path.join(sentences_good, "P4P_corpus_v1.xml")
etpc_dir = os.path.join(sentences_good, "text_pairs.xml")

data_p4p = minidom.parse(p4p_dir)
items = data_p4p.getElementsByTagName('relacio')

data_p4p = []

for i, elem in enumerate(items):
    atributes = elem.getElementsByTagName("frase")
    k = 0
    aux_lst = []
    for atr in atributes:
        k += 1
        aux_lst.append(atr.firstChild.nodeValue)
        if k == 2:
            data_p4p.append(aux_lst)
            break
            
df_p4p = pd.DataFrame(data_p4p, columns=["sentence", "paraphrase"])

In [None]:
tree = ET.parse(etpc_dir)
root = tree.getroot()

data_etpc = []

for i in range(len(root)):
    frase1 = root[i][4].text
    frase2 = root[i][5].text
    data_etpc.append([frase1, frase2])

df_etpc = pd.DataFrame(data_etpc, columns=["sentence", "paraphrase"])

In [None]:
size_questions = len(pd_quora)
size_sentences = len(df_opinosis) + len(df_msrp) + len(df_paws) + len(df_etpc) + len(df_p4p)
frac = size_sentences / size_questions

difference = size_questions - size_sentences
proportion = 1 / 4

In [1]:
# FIRST APPROACH --> AUGMENT DATASET WITH NMT DATA
number_instances = int(difference / proportion)
idx = np.random.randint(low=0, high=len(para_nmt), size=number_instances)

para_nmt_shuffle = list(para_nmt[idx])
df_nmt = pd.DataFrame(para_nmt_shuffle, columns=["sentence", "paraphrase"])

df = pd.concat([pd_quora, df_opinosis, df_msrp, df_paws, df_nmt, df_etpc, df_p4p])
df = df.sample(frac=1).reset_index(drop=True)
dir_save = os.path.join(combined_data_path, "augmented_raw_text.csv")
df.to_csv(dir_save)  

In [None]:
# SECOND APPROACH --> REDUCE THE SIZE OF QUORA QUESTIONS
df_quora_reduced = pd_quora.sample(frac=frac * proportion).reset_index(drop=True)

df = pd.concat([df_quora_reduced, df_opinosis, df_msrp, df_paws, df_etpc, df_p4p])
df = df.sample(frac=1).reset_index(drop=True)

dir_save = os.path.join(combined_data_path, "reduced_raw_text.csv")
df.to_csv(dir_save)  

df_train, df_test_whole = train_test_split(df, train_size=0.80, shuffle=True, random_state=42)
df_val, df_test  = train_test_split(df_test_whole, test_size=0.05/0.20, shuffle=True, random_state=42)

df_train.to_csv(f"{combined_data_path}/reduced_raw_text_train.csv")
df_val.to_csv(f"{combined_data_path}/reduced_raw_text_val.csv")
df_test.to_csv(f"{combined_data_path}/reduced_raw_text_test.csv")

In [None]:
len(df)

45811