In [137]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer
import faiss
import torch
import seaborn as sns

In [None]:
class TextDataset(Dataset):
    """Map-style dataset that serves precomputed embeddings by position."""

    def __init__(self, embeddings):
        # The container is stored as given: no copy and no conversion happens here.
        self.embeddings = embeddings

    def __len__(self):
        """Return how many embeddings are stored."""
        return len(self.embeddings)

    def __getitem__(self, idx):
        """Return the embedding stored at position ``idx``."""
        return self.embeddings[idx]

# Loading database

In [None]:
def _load_train_frame(repo_id):
    """Load the ``train`` split of ``repo_id`` and return it as a pandas DataFrame."""
    return load_dataset(repo_id, split="train").to_pandas()


# One translated Alpaca-GPT4 set per language.
database_fr = _load_train_frame("FreedomIntelligence/alpaca-gpt4-french")
database_de = _load_train_frame("FreedomIntelligence/alpaca-gpt4-deutsch")
database_es = _load_train_frame("FreedomIntelligence/alpaca-gpt4-spanish")
database_ja = _load_train_frame("FreedomIntelligence/alpaca-gpt4-japanese")
database_ko = _load_train_frame("FreedomIntelligence/alpaca-gpt4-korean")
database_zh = _load_train_frame("FreedomIntelligence/alpaca-gpt4-chinese")

databases = [database_fr, database_de, database_es, database_ja, database_ko, database_zh]
# ignore_index=True renumbers the stacked rows 0..n-1 in one step.
database_df = pd.concat(databases, ignore_index=True)

In [None]:
# For every conversation, take the entry at position 1 and keep its "value" field.
# NOTE(review): presumably entry 0 is the prompt and entry 1 the model reply — confirm
# against the dataset card.
database_df["chat_response"] = [
    turns[1]["value"] for turns in database_df["conversations"]
]

In [None]:
# The English set comes from a different repository than the translated sets.
english_train_split = load_dataset("vicgalle/alpaca-gpt4", split="train")
database_en = english_train_split.to_pandas()

In [None]:
# Pool every response into one list: the translated sets first, then the English set.
# The position of a text in this list is also its row in `full_database_df` and the id
# under which its embedding is added to the search index, so the order must not change.
list_of_responses = database_df["chat_response"].tolist() + database_en["output"].tolist()

# Sanity check on the pooled size. This used to be a bare comparison in the middle of
# the cell, whose result was computed and thrown away.
assert len(list_of_responses) == len(database_df) + len(database_en), (
    "pooled response list does not match the combined size of its sources"
)

full_database_df = pd.DataFrame(list_of_responses, columns=['text'])
# reset_index() without drop=True keeps the previous 0..n-1 labels as an "index" column.
full_database_df = full_database_df.reset_index()

# Loading paraphrases

In [None]:
def _load_paws_train(lang_code):
    """Load the ``train`` split of ``maximedb/paws-x-all`` for one language config.

    NOTE(review): ``download_mode="force_redownload"`` is passed on every call, which
    presumably bypasses the local cache each time — confirm this is still wanted.
    """
    return load_dataset(
        "maximedb/paws-x-all",
        lang_code,
        split="train",
        download_mode="force_redownload",
    ).to_pandas()


paws_en = _load_paws_train("en")
paws_de = _load_paws_train("de")
paws_fr = _load_paws_train("fr")
paws_es = _load_paws_train("es")
paws_ja = _load_paws_train("ja")
paws_ko = _load_paws_train("ko")
paws_zh = _load_paws_train("zh")

In [None]:
def preprocess_paws(df):
    """Return a cleaned copy of one PAWS-X frame; the input frame is left untouched.

    Steps, in order:
      1. renumber the rows 0..n-1,
      2. drop the ``id`` column when it is present,
      3. drop rows whose ``sentence1`` is missing or an empty string,
      4. keep only the first row for each distinct ``sentence1``.

    Args:
        df: DataFrame with at least a ``sentence1`` column.

    Returns:
        A new DataFrame. Its index labels are the row positions assigned in step 1,
        so they are not contiguous once rows have been removed.
    """
    # errors="ignore" makes the function safe to re-run on an already cleaned frame,
    # where "id" is gone; the previous in-place drop raised KeyError in that case.
    cleaned = df.reset_index(drop=True).drop(columns="id", errors="ignore")
    sentence = cleaned["sentence1"]
    has_text = sentence.notna() & (sentence != "")
    # Filtering and de-duplicating both return new frames, so nothing is written into
    # a slice of another frame (the source of the former SettingWithCopyWarning).
    return cleaned.loc[has_text].drop_duplicates(subset=["sentence1"])

In [None]:
# Run the shared cleaning step on every language frame and rebind each name to the
# cleaned result. The frames are processed in the order listed.
(paws_en, paws_de, paws_fr, paws_es, paws_ko, paws_ja, paws_zh) = [
    preprocess_paws(frame)
    for frame in (paws_en, paws_de, paws_fr, paws_es, paws_ko, paws_ja, paws_zh)
]


In [None]:
# paws_en = paws_en[paws_en["label"] == 1]
# paws_de = paws_de[paws_de["label"] == 1]
# paws_fr = paws_fr[paws_fr["label"] == 1]
# paws_es = paws_es[paws_es["label"] == 1]
# paws_ja = paws_ja[paws_ja["label"] == 1]
# paws_ko = paws_ko[paws_ko["label"] == 1]
# paws_zh = paws_zh[paws_zh["label"] == 1]

In [None]:
# All per-language frames gathered in one list, English first.
paws_list = [
    paws_en,
    paws_de,
    paws_fr,
    paws_es,
    paws_ja,
    paws_ko,
    paws_zh,
]

# Building the FAISS index

In [None]:
from sentence_transformers import SentenceTransformer

# Use the GPU when one is visible and fall back to the CPU otherwise, so the notebook
# still runs (more slowly) on machines without CUDA instead of failing here.
model_device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', device=model_device)

In [None]:
# Encode every pooled response. No device is passed: the model was placed on its device
# when it was constructed and encode() runs there by default, which also matches the
# encode call used for the query sentences in the retrieval loop.
embeddings_database = model.encode(list_of_responses, batch_size=512, show_progress_bar=True)

In [None]:
# Scale every row to unit L2 length. Rows with norm 0 are divided by 1 instead, which
# leaves them as all-zero vectors and avoids a division by zero.
norms = np.linalg.norm(embeddings_database, axis=1, keepdims=True)
zero_norm_rows = norms == 0
norms[zero_norm_rows] = 1
normalized_embeddings_database = embeddings_database / norms

In [None]:
# The rows were divided by their L2 norm in the previous cell, so the inner product
# used by IndexFlatIP ranks neighbours by cosine similarity.
d = normalized_embeddings_database.shape[-1]  # embedding dimensionality
index = faiss.IndexFlatIP(d)
index.add(normalized_embeddings_database)

# Retrieval

In [None]:
# One record per language code. Each starts with just its frame under "dataset";
# the retrieval step later stores its outputs in the same record.
paws_dict = {
    lang_code: {"dataset": frame}
    for lang_code, frame in (
        ("en", paws_en),
        ("de", paws_de),
        ("fr", paws_fr),
        ("es", paws_es),
        ("ja", paws_ja),
        ("ko", paws_ko),
        ("zh", paws_zh),
    )
}

In [None]:
from tqdm import tqdm

# Neighbours retrieved per query. The results below are flattened to k entries per
# query, and the per-language result frames expect exactly one entry per query, so k
# has to stay 1 unless those frames are changed as well.
k = 1
search_batch_size = 128  # query rows sent to the index per search call

for lang in tqdm(paws_dict):
    nearest_neighbors = []
    similarities = []
    texts_to_test = paws_dict[lang]["dataset"]["sentence1"].tolist()
    test_embeddings = model.encode(texts_to_test, batch_size=512, show_progress_bar=True)

    # Same unit-length scaling as for the database vectors; zero-norm rows stay zero.
    norms = np.linalg.norm(test_embeddings, axis=1, keepdims=True)
    norms[norms == 0] = 1
    normalized_test_embeddings = test_embeddings / norms

    # Slice the numpy array directly. The search only needs numpy input, so wrapping
    # the array in a torch DataLoader and converting each batch back added nothing.
    for start in range(0, len(normalized_test_embeddings), search_batch_size):
        batch = normalized_test_embeddings[start:start + search_batch_size]
        sim, neighbor_ids = index.search(batch, k)
        # Returned ids are positions in the order the vectors were added, which is the
        # row order of full_database_df; ravel() flattens the (batch, k) arrays row by row.
        nearest_neighbors.extend(full_database_df.loc[neighbor_ids.ravel(), "text"].tolist())
        similarities.extend(sim.ravel())

    paws_dict[lang]["nearest_neighbors"] = nearest_neighbors
    paws_dict[lang]["similarities"] = similarities
    

In [None]:
def _results_frame(lang_code):
    """Build the result table for one language from its ``paws_dict`` record.

    Columns: ``text`` (the query sentences), ``nearest_neighbor`` (the retrieved text)
    and ``similarity`` (the score returned by the index search).
    """
    record = paws_dict[lang_code]
    return pd.DataFrame({"text": record["dataset"]["sentence1"],
                         "nearest_neighbor": record["nearest_neighbors"],
                         "similarity": record["similarities"]})


results_en = _results_frame("en")
results_de = _results_frame("de")
results_fr = _results_frame("fr")
results_es = _results_frame("es")
results_ja = _results_frame("ja")
results_ko = _results_frame("ko")
results_zh = _results_frame("zh")

In [None]:
def make_predictions(results, threshold=0.9):
    """Add a binary ``prediction`` column to ``results`` in place.

    A row is predicted 1 when its ``similarity`` is strictly greater than
    ``threshold`` and 0 otherwise (a missing similarity therefore yields 0).

    Args:
        results: DataFrame with a numeric ``similarity`` column; it is modified in place.
        threshold: Decision cut-off. Defaults to 0.9, the value that used to be
            hard-coded.

    Returns:
        None.
    """
    # One vectorised comparison instead of a Python-level loop over the column.
    results["prediction"] = (results["similarity"] > threshold).astype("int64")

In [None]:
# Add the "prediction" column to every per-language result table.
for results_frame in (results_en, results_de, results_fr, results_es,
                      results_ja, results_ko, results_zh):
    make_predictions(results_frame)

In [None]:
def calculate_false_positive_ratio(predictions):
    """Return the percentage (0-100) of entries in ``predictions`` that equal 1.

    Only positive predictions are counted; no ground-truth labels are consulted.
    NOTE(review): reading the value as a false-positive rate assumes every input is a
    true negative — confirm that holds for the data passed in.

    Args:
        predictions: pandas Series or any other sized 1-D sequence of 0/1 values.

    Returns:
        The percentage as a float, or NaN when ``predictions`` is empty.
    """
    # Return NaN explicitly for empty input instead of running into a 0/0 division.
    if len(predictions) == 0:
        return float("nan")
    is_positive = pd.Series(predictions) == 1
    return float(is_positive.sum() * 100 / len(is_positive))

In [None]:
# Percentage of positive predictions per language, computed in the order listed.
fpr_en, fpr_de, fpr_es, fpr_fr, fpr_ja, fpr_ko, fpr_zh = (
    calculate_false_positive_ratio(frame["prediction"])
    for frame in (results_en, results_de, results_es, results_fr,
                  results_ja, results_ko, results_zh)
)