First installing required libraries

In [18]:
!pip install transformers
!pip install hazm



importing packages

In [19]:
import pandas as pd
import pickle
import tensorflow as tf
from transformers import BertConfig, BertTokenizer
from transformers import TFBertModel, TFBertForSequenceClassification
from transformers import glue_convert_examples_to_features
from tensorflow.keras import callbacks
import pickle
import numpy as np
from tqdm.notebook import tqdm
import hazm
import re
from string import punctuation
from transformers import TextClassificationPipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import resample

from sklearn.model_selection import train_test_split
from tensorflow.keras import utils as np_utils
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping

loading data


In [20]:
df = pd.read_excel('/content/drive/MyDrive/datasets/all_data.xlsx')
df=df.sample(frac=1)


cleaning the data

In [21]:
df = df.drop('Unnamed: 0',axis=1)
df = df.dropna()


counting labels


In [22]:
df['label'].value_counts()

4    2072
6    1731
0    1638
3     823
2     728
1     581
5     386
Name: label, dtype: int64

let's balance the data around 1000 samples


In [23]:
df_minority1 = df[df.label == 1]
df_minority2 = df[df.label == 2]
df_minority3 = df[df.label == 3]
df_minority5 = df[df.label == 5]
df_minority_upsampled1 = resample(df_minority1,
                                 replace=True,  # sample with replacement
                                 n_samples=1000,  # to match majority class
                                 random_state=123)  # reproducible results
df_minority_upsampled5 = resample(df_minority5,
                                 replace=True,  # sample with replacement
                                 n_samples=1000,  # to match majority class
                                 random_state=123)

df_minority_upsampled2 = resample(df_minority2,
                                 replace=True,  # sample with replacement
                                 n_samples=1000,  # to match majority class
                                 random_state=123)  # reproducible results
df_minority_upsampled3 = resample(df_minority3,
                                 replace=True,  # sample with replacement
                                 n_samples=1000,  # to match majority class
                                 random_state=123)


df_upsampled = pd.concat([df[df.label==0].sample(1000), df_minority_upsampled1,
                          df_minority_upsampled2,
                          df_minority_upsampled3,
                          df[df.label==4].sample(1000),
                          df_minority_upsampled5,
                          df[df.label==6].sample(1000)])

df = df_upsampled
df['label'].value_counts()

0    1000
1    1000
2    1000
3    1000
4    1000
5    1000
6    1000
Name: label, dtype: int64

In [24]:
df = df.sample(frac=1)


now let's process the data and clean it for nlp task

In [25]:
stop_words = pickle.load(open('/content/drive/MyDrive/datasets/text_stop_words.pkl','rb'))
stop_words

['میشم',
 'اینکه',
 'یا',
 '000',
 'همه',
 'حتی',
 'از',
 'سلام',
 'داره',
 'میخوام',
 'اما',
 'باید',
 'نمی',
 'اون',
 'یک',
 'شما',
 'نمیدونم',
 'می',
 'چند',
 'در',
 'پیش',
 'خیلی',
 'تو',
 'دوست',
 'اصلا',
 'چون',
 'هم',
 'd_',
 'ماه',
 'است',
 'وقتی',
 'شده',
 'با',
 'این',
 'کنم',
 'سال',
 'الان',
 'های',
 'ها',
 'ولی',
 'رو',
 'ساله',
 'هست',
 'زندگی',
 'به',
 'هر',
 'ندارم',
 'ما',
 'کرد',
 'هیچ',
 'یه',
 'چیکار',
 'کنید',
 'من',
 'شدم',
 'بهم',
 'کردم',
 'تا',
 'میشه',
 'برام',
 'که',
 'را',
 'میکنه',
 'هستم',
 'دارم',
 'میگه',
 'چه',
 'دیگه',
 'برای',
 'بعد']

In [26]:
lemmatizer = hazm.Lemmatizer()
normalizer = hazm.Normalizer()
def preprocessing(text):

    text = ''.join(c for c in text if not c.isdigit())
    text = ''.join(c for c in text if c not in punctuation)
    text = re.sub('\n',' ', text)
    text = re.sub('xD',' ', text)
    text = re.sub('؟',' ', text)
    text = re.sub('،',' ', text)
    text = normalizer.normalize(text)
    text = lemmatizer.lemmatize(text)
    text = ' '.join(word for word in text.split() if word not in stop_words)

    return text

In [27]:
# apply the function to data
df['new_text'] = df['text'].apply(preprocessing)


spliting data to 2 part

In [28]:
x_train, x_test, Y_train, Y_test = train_test_split(df.new_text,df.label,stratify=df.label)

In [29]:
len(x_train), len(x_test),len(Y_train), len(Y_test)

(5250, 1750, 5250, 1750)

# **Building pretrained model by using parsbert **

In [30]:
# hyperparameters

MAX_LEN = 170
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16

EPOCHS = 3
EEVERY_EPOCH = 1000
LEARNING_RATE = 2e-5
CLIP = 0.0

MODEL_NAME_OR_PATH = 'HooshvareLab/bert-fa-base-uncased'

In [31]:
label2id = {'personal':0,'family':1,'children':2,'sex':3,'couple':4 , 'education':5,'addiction':6}
id2label = {0:'personal',1:'family',2:'children',3:'sex',4:'couple',5:'education',6:'addiction'}

label2id , id2label

({'personal': 0,
  'family': 1,
  'children': 2,
  'sex': 3,
  'couple': 4,
  'education': 5,
  'addiction': 6},
 {0: 'personal',
  1: 'family',
  2: 'children',
  3: 'sex',
  4: 'couple',
  5: 'education',
  6: 'addiction'})

Downloading pars bert tokenizer

In [32]:
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME_OR_PATH)
config = BertConfig.from_pretrained(
    MODEL_NAME_OR_PATH, **{
        'label2id': label2id,
        'id2label': id2label,
    })

print(config.to_json_string())

{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "personal",
    "1": "family",
    "2": "children",
    "3": "sex",
    "4": "couple",
    "5": "education",
    "6": "addiction"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "addiction": 6,
    "children": 2,
    "couple": 4,
    "education": 5,
    "family": 1,
    "personal": 0,
    "sex": 3
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.31.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 100000
}



using pars bert tutorial to change form of data

In [33]:
class InputExample:
    """ A single example for simple sequence classification. """

    def __init__(self, guid, text_a, text_b=None, label=None):
        """ Constructs a InputExample. """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label


def make_examples(tokenizer, x, y=None, maxlen=128, output_mode="classification", is_tf_dataset=True):
    examples = []
    y = y if isinstance(y, list) or isinstance(y, np.ndarray) else [None] * len(x)

    for i, (_x, _y) in tqdm(enumerate(zip(x, y)), position=0, total=len(x)):
        guid = "%s" % i
        label = int(_y)

        if isinstance(_x, str):
            text_a = _x
            text_b = None
        else:
            assert len(_x) == 2
            text_a = _x[0]
            text_b = _x[1]

        examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))

    features = glue_convert_examples_to_features(
        examples,
        tokenizer,
        maxlen,
        output_mode=output_mode,
        label_list=list(np.unique(y)))

    all_input_ids = []
    all_attention_masks = []
    all_token_type_ids = []
    all_labels = []

    for f in tqdm(features, position=0, total=len(examples)):
        if is_tf_dataset:
            all_input_ids.append(tf.constant(f.input_ids))
            all_attention_masks.append(tf.constant(f.attention_mask))
            all_token_type_ids.append(tf.constant(f.token_type_ids))
            all_labels.append(tf.constant(f.label))
        else:
            all_input_ids.append(f.input_ids)
            all_attention_masks.append(f.attention_mask)
            all_token_type_ids.append(f.token_type_ids)
            all_labels.append(f.label)

    if is_tf_dataset:
        dataset = tf.data.Dataset.from_tensor_slices(({
            'input_ids': all_input_ids,
            'attention_mask': all_attention_masks,
            'token_type_ids': all_token_type_ids
        }, all_labels))

        return dataset, features

    xdata = [np.array(all_input_ids), np.array(all_attention_masks), np.array(all_token_type_ids)]
    ydata = all_labels

    return [xdata, ydata], features

In [34]:
# changing form of data
train_dataset_base, train_examples = make_examples(tokenizer, x_train, Y_train.values.tolist(), maxlen=MAX_LEN)
test_dataset_base, test_examples = make_examples(tokenizer, x_test, Y_test.values.tolist(), maxlen=MAX_LEN)
[xtest, ytest], test_examples = make_examples(tokenizer, x_test, Y_test.values.tolist(), maxlen=MAX_LEN, is_tf_dataset=False)

  0%|          | 0/5250 [00:00<?, ?it/s]



  0%|          | 0/5250 [00:00<?, ?it/s]

  0%|          | 0/1750 [00:00<?, ?it/s]

  0%|          | 0/1750 [00:00<?, ?it/s]

  0%|          | 0/1750 [00:00<?, ?it/s]

  0%|          | 0/1750 [00:00<?, ?it/s]

checking the format of data

In [35]:
for value in train_dataset_base.take(2):
    print(f'     input_ids: {value[0]["input_ids"]}')
    print(f'attention_mask: {value[0]["attention_mask"]}')
    print(f'token_type_ids: {value[0]["token_type_ids"]}')
    print(f'        target: {value[1]}')

     input_ids: [    2  9231 60331  4247  3363  3531  3626 15252 14111  2814 25003 27841
  2897  3541 25003  3363     4     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     

In [36]:
def get_training_dataset(dataset, batch_size):
    dataset = dataset.repeat()
    dataset = dataset.shuffle(2048)
    dataset = dataset.batch(batch_size)

    return dataset

def get_validation_dataset(dataset, batch_size):
    dataset = dataset.batch(batch_size)

    return dataset

In [37]:
train_dataset = get_training_dataset(train_dataset_base, TRAIN_BATCH_SIZE)
valid_dataset = get_validation_dataset(test_dataset_base, VALID_BATCH_SIZE)

train_steps = len(train_examples) // TRAIN_BATCH_SIZE
valid_steps = len(test_examples) // VALID_BATCH_SIZE

train_steps, valid_steps

(328, 109)

In [43]:

early_stoping = callbacks.EarlyStopping(
        min_delta = 0.1,
         patience=2,
            restore_best_weights=True,)

defining the model

In [44]:
def build_model(model_name, config, learning_rate=3e-5):
    model = TFBertForSequenceClassification.from_pretrained(model_name, config=config)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

    return model

building the model

In [45]:
model = build_model(MODEL_NAME_OR_PATH, config, learning_rate=LEARNING_RATE)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/bert-fa-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
tf.config.run_functions_eagerly(True)

Fitting the model


In [46]:

%%time

model.fit(
    train_dataset,
    validation_data=valid_dataset,
    steps_per_epoch=train_steps,
    validation_steps=valid_steps,
    epochs=3,
    callbacks=[early_stoping],
    verbose=1)



Epoch 1/3
Epoch 2/3
Epoch 3/3
CPU times: user 18min 57s, sys: 13.8 s, total: 19min 11s
Wall time: 21min 55s


<keras.callbacks.History at 0x7daafacfee90>

In [48]:

pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)

pipe("فرزندان رو چگونه میشود تربیت کرد")



[[{'label': 'personal', 'score': 0.015339689329266548},
  {'label': 'family', 'score': 0.018027350306510925},
  {'label': 'children', 'score': 0.9427851438522339},
  {'label': 'sex', 'score': 0.005560026504099369},
  {'label': 'couple', 'score': 0.009815280325710773},
  {'label': 'education', 'score': 0.00524911331012845},
  {'label': 'addiction', 'score': 0.0032234485261142254}]]

In [None]:
model.save_pretrained('persian_nlp_parsbert')