In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

In [9]:
class Discriminator:

    def __init__(self, tokenizer, model):
        self.tokenizer = tokenizer
        self.model = model

    def get_cls_embedding(self, text):
        inputs = self.tokenizer(text, return_tensors="pt", add_special_tokens=True)
        outputs = self.model(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        return cls_embedding

    def extract_features(self, texts):
        features = []
        for text in texts:
            cls_embedding = self.get_cls_embedding(text)
            features.append(cls_embedding.detach().numpy().squeeze())
        features = np.vstack(features)
        return features

    def discriminator_loss(self, real_features, generated_features):
        real_features = torch.tensor(real_features)
        generated_features = torch.tensor(generated_features)
        tmp2 = torch.mean(torch.multiply(real_features.float(), generated_features.float()))
        loss = torch.tensor(-tmp2, requires_grad=True)
        return loss

    def similarity(self, input_text, generated_texts):
        orig_text_list = input_text.split()
        text1 = " ".join(orig_text_list)
        cls_embedding1 = self.get_cls_embedding(text1)

        best_suggestions = {}
        real_features = self.extract_features([text1])

        for generated_text in generated_texts:
            cls_embedding2 = self.get_cls_embedding(generated_text)
            similarity = cosine_similarity(cls_embedding1.detach().numpy().squeeze().reshape(1, -1), cls_embedding2.detach().numpy().squeeze().reshape(1, -1))
            similarity_score = similarity.item()

            generated_features = self.extract_features([generated_text])
            loss = self.discriminator_loss(real_features, generated_features)

            word = generated_text  # Assuming the whole generated text is considered as a suggestion
            if word not in best_suggestions or loss.item() < best_suggestions[word]['loss']:
                best_suggestions[word] = {'suggestion': generated_text, 'loss': loss.item(), 'similarity': similarity_score}
                print(f"Word: {generated_text}, Loss: {loss.item()}, Cosine Similarity: {similarity_score}")

        # Output the best suggestions for each word
        for word, suggestion_info in best_suggestions.items():
            print(f"suggestion for '{word}': {suggestion_info['suggestion']}, Loss: {suggestion_info['loss']}, Cosine Similarity: {suggestion_info['similarity']}")

def main():
    # Define the input text and generated texts
    generated_texts = ["من درباره این سریال حرفی ندارم", "من درباره این فیلم نظری ندارم", "من به این داستان نظری ندارم", "من درباره این سریال اظهارنظری ندارم"]
    input_text = "من درباره این سریال نظری ندارم"

    # Initialize the tokenizer and model
    model_name = "HooshvareLab/bert-fa-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    # Initialize the discriminator
    discriminator = Discriminator(tokenizer, model)

    # Calculate similarity and loss
    discriminator.similarity(input_text, generated_texts)

if __name__ == "__main__":
    main()


  loss = torch.tensor(-tmp2, requires_grad=True)


Word: من درباره این سریال حرفی ندارم, Loss: -0.7757735252380371, Cosine Similarity: 0.9617154598236084


  loss = torch.tensor(-tmp2, requires_grad=True)


Word: من درباره این فیلم نظری ندارم, Loss: -0.7806766629219055, Cosine Similarity: 0.9703912734985352


  loss = torch.tensor(-tmp2, requires_grad=True)


Word: من به این داستان نظری ندارم, Loss: -0.6737119555473328, Cosine Similarity: 0.8468949794769287
Word: من درباره این سریال اظهارنظری ندارم, Loss: -0.7619295120239258, Cosine Similarity: 0.9509677290916443
suggestion for 'من درباره این سریال حرفی ندارم': من درباره این سریال حرفی ندارم, Loss: -0.7757735252380371, Cosine Similarity: 0.9617154598236084
suggestion for 'من درباره این فیلم نظری ندارم': من درباره این فیلم نظری ندارم, Loss: -0.7806766629219055, Cosine Similarity: 0.9703912734985352
suggestion for 'من به این داستان نظری ندارم': من به این داستان نظری ندارم, Loss: -0.6737119555473328, Cosine Similarity: 0.8468949794769287
suggestion for 'من درباره این سریال اظهارنظری ندارم': من درباره این سریال اظهارنظری ندارم, Loss: -0.7619295120239258, Cosine Similarity: 0.9509677290916443


  loss = torch.tensor(-tmp2, requires_grad=True)
