In [1]:
# Colab-only: mount Google Drive at /content/drive so that files stored under
# it (e.g. the POS-tagger model path used later in this notebook) are readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install hazm, which provides the POSTagger imported in the next cell.
# NOTE(review): the version is unpinned, so a re-run may pull a different hazm
# release; consider pinning it and using `%pip` so the install targets the
# kernel's environment.
!pip install hazm



In [3]:
import torch
from transformers import pipeline, AutoTokenizer, AutoModel
import numpy as np
from hazm import POSTagger
import logging
from sklearn.metrics.pairwise import cosine_similarity
from huggingface_hub import notebook_login

In [9]:
class Generator:
    """Produce masked-language-model augmentations of a sentence.

    Every word whose POS tag is not ``'VERB'`` is replaced, one at a time, by
    the tokenizer's mask token; the fill-mask pipeline's completions of each
    masked sentence are then yielded to the caller.
    """

    def __init__(self, model_name, pos_model_path):
        """Load the tokenizer, model, fill-mask pipeline and POS tagger.

        Args:
            model_name: Model identifier or path passed to
                ``AutoTokenizer.from_pretrained``, ``AutoModel.from_pretrained``
                and the ``'fill-mask'`` pipeline.
            pos_model_path: Path passed to ``POSTagger`` as ``model``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Not used by the methods of this class; kept so that code reading
        # ``generator.model`` keeps working.
        self.model = AutoModel.from_pretrained(model_name)
        # Hand the already-loaded tokenizer to the pipeline so it is not loaded
        # a second time and so the mask token inserted by ``augment_text`` is
        # guaranteed to be the one the pipeline looks for.
        self.unmasker = pipeline('fill-mask', model=model_name, tokenizer=self.tokenizer)
        self.tagger = POSTagger(model=pos_model_path)

    def generate_suggestions(self, masked_sentence):
        """Return the fill-mask pipeline's output for ``masked_sentence``.

        Args:
            masked_sentence: Sentence containing the tokenizer's mask token.

        Returns:
            Whatever ``self.unmasker`` returns; ``augment_text`` iterates over
            it and reads the ``'sequence'`` key of each item.
        """
        return self.unmasker(masked_sentence)

    def augment_text(self, input_text):
        """Lazily yield ``(original_word, augmented_sentence)`` pairs.

        The text is split on whitespace and POS-tagged. Words tagged ``'VERB'``
        are skipped; every other word is masked in turn and each suggestion
        returned by ``generate_suggestions`` is yielded together with the word
        that was masked. A separator line and the masked sentence are printed
        for every masked position.

        This is a generator: nothing is tagged, printed or predicted until the
        result is iterated.

        Args:
            input_text: Sentence to augment. Empty or whitespace-only text
                yields nothing and never reaches the tagger.

        Yields:
            Tuples ``(word, generated_text)`` where ``word`` is the original
            token that was masked and ``generated_text`` is the ``'sequence'``
            value of one suggestion.
        """
        orig_text_list = input_text.split()
        if not orig_text_list:
            # Nothing to mask; avoid calling the tagger with an empty list.
            return

        pos_tags = [tag for _, tag in self.tagger.tag(orig_text_list)]
        mask_token = self.tokenizer.mask_token  # loop-invariant

        for i, (word, tag) in enumerate(zip(orig_text_list, pos_tags)):
            if tag == 'VERB':
                continue
            new_text_list = orig_text_list[:i] + [mask_token] + orig_text_list[i+1:]
            new_mask_sent = ' '.join(new_text_list)
            print("-------------------------------------------------------------------------------------------------------------")
            print("Masked sentence->", new_mask_sent)

            for suggestion in self.generate_suggestions(new_mask_sent):
                yield word, suggestion['sequence']


# Inputs: the sentence to augment, the masked-LM checkpoint, and the POS-tagger model file on Drive
input_text = "من درباره این سریال نظری ندارم"
model_name = "HooshvareLab/bert-fa-base-uncased"
pos_model_path = '/content/drive/MyDrive/data_augmentation/pos_tagger.model'

# Build the generator from the checkpoint name and tagger path
generator = Generator(model_name=model_name, pos_model_path=pos_model_path)

# Example usage: print every (masked word, augmented sentence) pair
for word, generated_text in generator.augment_text(input_text):
    print(f"Original word: {word} | Augmented text: {generated_text}")


Some weights of the model checkpoint at HooshvareLab/bert-fa-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


-------------------------------------------------------------------------------------------------------------
Masked sentence-> [MASK] درباره این سریال نظری ندارم
Original word: من | Augmented text: من درباره این سریال نظری ندارم
Original word: من | Augmented text: فعلا درباره این سریال نظری ندارم
Original word: من | Augmented text: هنوز درباره این سریال نظری ندارم
Original word: من | Augmented text: اصلا درباره این سریال نظری ندارم
Original word: من | Augmented text: اما درباره این سریال نظری ندارم
-------------------------------------------------------------------------------------------------------------
Masked sentence-> من [MASK] این سریال نظری ندارم
Original word: درباره | Augmented text: من درباره این سریال نظری ندارم
Original word: درباره | Augmented text: من در این سریال نظری ندارم
Original word: درباره | Augmented text: من از این سریال نظری ندارم
Original word: درباره | Augmented text: من دربارهی این سریال نظری ندارم
Original word: درباره | Augmented text: من درمورد این سریال