<a href="https://colab.research.google.com/github/eshaghjahangiri/Finetuning_ParsBERT_LLM/blob/main/finetuning_ParsBERT_SahamyabTwits.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install required packages
!pip install -q transformers
!pip install -q hazm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m892.6/892.6 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m63.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m73.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m238.8/238.8 kB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for flashtext (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pandas-stubs 2.0.3.230814 requires numpy>=1.25.0; python_version >= "3.9", but you have 

In [4]:
import pandas as pd
import numpy as np
import re
import os
import transformers
from hazm import Normalizer, word_tokenize, Stemmer, Lemmatizer, stopwords_list

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.utils import shuffle

import plotly.express as px
import plotly.graph_objects as go

from tqdm.notebook import tqdm
import copy
import collections

In [5]:
# mount google drive to load the dataset
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
folder_path = '/content/drive/My Drive/'

In [7]:
file_path = os.path.join(folder_path, 'sampletwits_forlabelling.csv')

In [26]:
df = pd.read_csv(file_path, encoding='utf-8')
df.columns = ['content', 'label']

In [27]:
df.head()

Unnamed: 0,content,label
0,#خزامیا زامیاد امروز مجمع افزایش سرمایه تصویب ...,1
1,#ذوب ۴۵ #بدبین,0
2,#ذوب افزایش نرخ سود بانکی تصویب نشد ایلنا عضو ...,1
3,#ثنوسا اونایی میفروشن دارن زنی میکنن وگرنه ریس...,1
4,#وبهمن صف خرید مبارک سهامداران صبور #خوشبین,1


In [28]:
# label 1 means the content's sentiment is positive, and 0 means negative
df['label'].value_counts()

label
1    10000
0    10000
Name: count, dtype: int64

In [29]:
# print data information
print('data information')
print(df.info(), '\n')

# print missing values information
print('missing values stats')
print(df.isnull().sum(), '\n')

data information
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   content  20000 non-null  object
 1   label    20000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 312.6+ KB
None 

missing values stats
content    0
label      0
dtype: int64 



In [30]:
# maybe there are some duplicate messages in the column content, so let's remove it!
df = df.drop_duplicates(subset=['content'], keep='first')
df = df.reset_index(drop=True)

In [32]:
len(df)

19436

In [39]:
# preprocess (clean and tokenization) function for Persian language using Hazm library

stopwords = stopwords_list()
normalizer = Normalizer()
lemmatizer = Lemmatizer()

puncs = ['،', '.', ',', ':', ';', '"', "'", '/', '\\', '_', '-']
punc_re = '[' + re.escape(''.join(puncs)) + ']'

def preprocess_persian(text):
    # removing url
    url_re = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    text = re.sub(url_re, '', text)
    # removing some characters
    text = re.sub(r'[_"«»<>]', '', text)
    text = re.sub(r'[\[\]]', '', text)
    # removing latin characters and tab
    text = re.sub("([A-Za-z\t])", "", text)
    # removing repeating ـ which is often used for designing in Persian
    text = re.sub('ـ*', '', text)
    # removing references like [۱]
    text = re.sub("\[[۱۲۳۴۵۶۷۸۹۰1234567890]*\]"," ", text)
    text = re.sub(r'[\u200c\u200d]', '', text)
    # removing hashtags
    text = re.sub(r'#\S+', '', text)
    text = re.sub(punc_re, '', text)

    # clean html
    cleanr = re.compile('<.*?>')
    text = re.sub(cleanr, '', text)

    wierd_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u'\U00010000-\U0010ffff'
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\u3030"
        u"\ufe0f"
        u"\u2069"
        u"\u2066"
        u"\u2068"
        u"\u2067"
        "]+", flags=re.UNICODE)

    text = wierd_pattern.sub(r'', text)

    # Replace sequences of + with 'مثبت'
    text = re.sub(r'\++', 'مثبت', text)

    # Replace sequences of - with 'منفی'
    text = re.sub(r'\-+', 'منفی', text)

    # normalize
    text = normalizer.normalize(text)
    text = text.rstrip()

    # Tokenize the text
    tokens = word_tokenize(text)

    # Removing stopwords
    tokens = [token for token in tokens if token not in stopwords]

    # Join tokens back
    processed_text = ' '.join(tokens)

    return processed_text

In [40]:
df['content'] = df['content'].apply(preprocess_persian)

In [41]:
df.head()

Unnamed: 0,content,label,comment_len_by_words
0,زامیاد امروز مجمع افزایش سرمایه تصویب افزایش ۱...,1,65
1,۴۵,0,1
2,افزایش نرخ سود بانکی تصویب نشد ایلنا عضو شورای...,1,194
3,اونایی میفروشن دارن زنی میکنن وگرنه ریسک فروش ...,1,9
4,صف خرید مبارک سهامداران صبور,1,5


In [42]:
# calculate the length of comments based on their words
df['comment_len_by_words'] = df['content'].apply(lambda t: len(word_tokenize(t)))

In [43]:
min_max_len = df["comment_len_by_words"].min(), df["comment_len_by_words"].max()
print(f'Min: {min_max_len[0]} \tMax: {min_max_len[1]}')

Min: 0 	Max: 235


In [44]:
def data_percentage_length(data, less_than=235, greater_than=0.0, col='comment_len_by_words'):
    data_length = data[col].values

    data_glt = sum([1 for length in data_length if greater_than < length <= less_than])

    data_glt_rate = (data_glt / len(data_length)) * 100

    print(f'Texts with word length of greater than {greater_than} and less than {less_than} includes {data_glt_rate:.2f}% of the whole!')

In [46]:
data_percentage_length(df, 235, 2)

Texts with word length of greater than 2 and less than 235 includes 93.17% of the whole!


In [47]:
minlim, maxlim = 2, 235

In [51]:
# remove comments with the
df['comment_len_by_words'] = df['comment_len_by_words'].apply(lambda len_t: len_t if minlim < len_t <= maxlim else None)
df = df.dropna(subset=['comment_len_by_words'])
df = df.reset_index(drop=True)

In [52]:
df.isnull().sum()

content                 0
label                   0
comment_len_by_words    0
dtype: int64

In [55]:
fig = go.Figure()

groupby_rate = df.groupby('label')['label'].count()

fig.add_trace(go.Bar(
    x=list(sorted(groupby_rate.index)),
    y=groupby_rate.tolist(),
    text=groupby_rate.tolist(),
    textposition='auto'
))

fig.update_layout(
    title_text='Distribution of sentiment within comments',
    xaxis_title_text='Sentiment',
    yaxis_title_text='Frequency',
    bargap=0.2,
    bargroupgap=0.2)

fig.show()

The dataset is fairly balanced as we have around 51\% positive sentiment and 49\% negative sentiments

In [56]:
# mapping label

id2label = {0: 'negative', 1: 'positive'}
label2id = {'negative': 0, 'positive': 1}

df['label'] = df['label'].map(id2label)

In [57]:
df.head()

Unnamed: 0,content,label,comment_len_by_words
0,زامیاد امروز مجمع افزایش سرمایه تصویب افزایش ۱...,positive,65.0
1,افزایش نرخ سود بانکی تصویب نشد ایلنا عضو شورای...,positive,194.0
2,اونایی میفروشن دارن زنی میکنن وگرنه ریسک فروش ...,positive,9.0
3,صف خرید مبارک سهامداران صبور,positive,5.0
4,فروش سود هیچگاه اشتباه یه عده بگن فروشنده پشیم...,negative,14.0


In [58]:
# Shuffle the DataFrame
df = df.sample(frac=1).reset_index(drop=True)

In [60]:
train, test = train_test_split(df, test_size=0.1, random_state=1, stratify=df['label'])
train, valid = train_test_split(train, test_size=0.1, random_state=1, stratify=train['label'])

train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)
test = test.reset_index(drop=True)

x_train, y_train = train['content'].values.tolist(), train['label'].values.tolist()
x_valid, y_valid = valid['content'].values.tolist(), valid['label'].values.tolist()
x_test, y_test = test['content'].values.tolist(), test['label'].values.tolist()

print(train.shape)
print(valid.shape)
print(test.shape)

(14668, 3)
(1630, 3)
(1811, 3)


In [61]:
from transformers import BertConfig, BertTokenizer
from transformers import BertModel

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

import torch
import torch.nn as nn
import torch.nn.functional as F

In [62]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f'device: {device}')

train_on_gpu = torch.cuda.is_available()

if not train_on_gpu:
    print('CUDA is not available.  Training on CPU ...')
else:
    print('CUDA is available!  Training on GPU ...')

device: cuda:0
CUDA is available!  Training on GPU ...


In [63]:
# general config
MAX_LEN = 256
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16

EPOCHS = 3
EEVERY_EPOCH = 1000
LEARNING_RATE = 2e-5
CLIP = 1.0


MODEL_NAME_OR_PATH = 'HooshvareLab/bert-fa-base-uncased'
OUTPUT_PATH = '/content/drive/My Drive/bert-fa-base-uncased-sentiment-socialmedia-finetune-parsbert/pytorch_model_finetune_parsbert.bin'

os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

In [64]:
# setup the tokenizer and configuration

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME_OR_PATH)
config = BertConfig.from_pretrained(
    MODEL_NAME_OR_PATH, **{
        'label2id': label2id,
        'id2label': id2label,
    })

print(config.to_json_string())



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



vocab.txt:   0%|          | 0.00/1.20M [00:00<?, ?B/s]


`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.



config.json:   0%|          | 0.00/440 [00:00<?, ?B/s]

{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "negative",
    "1": "positive"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 0,
    "positive": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.41.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 100000
}



In [65]:
idx = np.random.randint(0, len(train))
sample_content = train.iloc[idx]['content']
sample_label = train.iloc[idx]['label']

print(f'Sample: \n{sample_content}\n{sample_label}')

Sample: 
اون بندگان خدایی فرض مشابه پارسال میکنه ضرر خارج خواهندشد سال گذشته توسط حقوقی عمده بردهشد قیمت ۵۸۰ تومان ۲۰ میلیون ملت قالب مساله تکرار پذیر تحلیل تکنیکال جواب نمیده منتظر موند محدود ه ۲۵۰ الی ۳۰۰ طولانی فرصت خرید
negative


In [66]:
tokens = tokenizer.tokenize(sample_content)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print(f'  Content: {sample_content}')
print(f'   Tokens: {tokenizer.convert_tokens_to_string(tokens)}')
print(f'Token IDs: {token_ids}')

  Content: اون بندگان خدایی فرض مشابه پارسال میکنه ضرر خارج خواهندشد سال گذشته توسط حقوقی عمده بردهشد قیمت ۵۸۰ تومان ۲۰ میلیون ملت قالب مساله تکرار پذیر تحلیل تکنیکال جواب نمیده منتظر موند محدود ه ۲۵۰ الی ۳۰۰ طولانی فرصت خرید
   Tokens: اون بندگان خدایی فرض مشابه پارسال میکنه ضرر خارج خواهندشد سال گذشته توسط حقوقی عمده بردهشد قیمت ۵۸۰ تومان ۲۰ میلیون ملت قالب مساله تکرار پذیر تحلیل تکنیکال جواب نمیده منتظر موند محدود ه ۲۵۰ الی ۳۰۰ طولانی فرصت خرید
Token IDs: [5536, 23372, 16041, 5875, 4942, 14104, 12702, 8957, 3968, 92767, 2844, 3421, 3158, 6006, 5409, 5425, 2881, 3377, 29514, 3832, 3109, 3399, 5066, 5094, 6049, 5926, 4108, 4683, 33478, 6173, 6608, 6878, 31957, 3792, 1378, 8830, 4527, 5870, 4977, 5175, 3535]


In [68]:
encoding = tokenizer.encode_plus(
    sample_content,
    max_length=235,
    truncation=True,
    add_special_tokens=True, # Add '[CLS]' and '[SEP]'
    return_token_type_ids=True,
    return_attention_mask=True,
    padding='max_length',
    return_tensors='pt',  # Return PyTorch tensors
)

print(f'Keys: {encoding.keys()}\n')
for k in encoding.keys():
    print(f'{k}:\n{encoding[k]}')

Keys: dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

input_ids:
tensor([[    2,  5536, 23372, 16041,  5875,  4942, 14104, 12702,  8957,  3968,
         92767,  2844,  3421,  3158,  6006,  5409,  5425,  2881,  3377, 29514,
          3832,  3109,  3399,  5066,  5094,  6049,  5926,  4108,  4683, 33478,
          6173,  6608,  6878, 31957,  3792,  1378,  8830,  4527,  5870,  4977,
          5175,  3535,     4,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [70]:
class NewsDataset(torch.utils.data.Dataset):
    """ Create a PyTorch dataset for the Sahamyab dataset. """

    def __init__(self, tokenizer, contents, targets=None, label_list=None, max_len=235):
        self.contents = contents
        self.targets = targets
        self.has_target = isinstance(targets, list) or isinstance(targets, np.ndarray)

        self.tokenizer = tokenizer
        self.max_len = max_len


        self.label_map = {label: i for i, label in enumerate(label_list)} if isinstance(label_list, list) else {}

    def __len__(self):
        return len(self.contents)

    def __getitem__(self, item):
        content = str(self.contents[item])

        if self.has_target:
            target = self.label_map.get(str(self.targets[item]), str(self.targets[item]))

        encoding = self.tokenizer.encode_plus(
            content,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt')

        inputs = {
            'content': content,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
        }

        if self.has_target:
            inputs['targets'] = torch.tensor(target, dtype=torch.long)

        return inputs


def create_data_loader(x, y, tokenizer, max_len, batch_size, label_list):
    dataset = NewsDataset(
        contents=x,
        targets=y,
        tokenizer=tokenizer,
        max_len=max_len,
        label_list=label_list)

    return torch.utils.data.DataLoader(dataset, batch_size=batch_size)

In [71]:
label_list = ['negative', 'positive']
train_data_loader = create_data_loader(train['content'].to_numpy(), train['label'].to_numpy(), tokenizer, MAX_LEN, TRAIN_BATCH_SIZE, label_list)
valid_data_loader = create_data_loader(valid['content'].to_numpy(), valid['label'].to_numpy(), tokenizer, MAX_LEN, VALID_BATCH_SIZE, label_list)
test_data_loader = create_data_loader(test['content'].to_numpy(), None, tokenizer, MAX_LEN, TEST_BATCH_SIZE, label_list)

In [72]:
sample_data = next(iter(train_data_loader))

print(sample_data.keys())

print(sample_data['content'])
print(sample_data['input_ids'].shape)
print(sample_data['input_ids'][0, :])
print(sample_data['attention_mask'].shape)
print(sample_data['attention_mask'][0, :])
print(sample_data['token_type_ids'].shape)
print(sample_data['token_type_ids'][0, :])
print(sample_data['targets'].shape)
print(sample_data['targets'][0])

dict_keys(['content', 'input_ids', 'attention_mask', 'token_type_ids', 'targets'])
['فهمیده مردم بلاتکلیفی میترسن امشب یه جوری سخنرانی میکنه آچمز میکنه چنان مردم بلا تکلیفی میزاره فردا آش همون آشه کاسه همون کاسه', 'دلخوش مقدار نباشیدفاراک نهتنها آخرت درست دنیای آباد', '۳۳۰ گیرم نیومد بازم بفروشید بخرم مثبت', 'گفتن بودا کی تواین قیمت نفروشه روز کارش تمومه', 'بنظرتون الان بخرم میاد پایینتر ؟', 'اذیت کنه استارت اینم درنظر بگیرید موقع استارت دیگه لنگش نمیکنه سریع هدفشو میزنه کسرا سهمای خوبه بازاره دید کوتاه بخریش اذیت بشی', 'کاری صف خرید فروش ندارم اینایی ک میخرن دقیقا چیو خریدن کارخانهای تعطیله زیان کارگراشو اخراج پول پرداخت هزینه آب برقشو ندارن ب زمین زمان بد بیرا میگن عزیزان خدا عقل برا تشخیص ( گفتم بخرید بفروشید گیر نیفتید )', 'اشغالترین بورس ایرانه صادرات دارن مدیر عامل میکه صادراتمون نمیصرفه سر سره برا کردش ماله صادرات میکنیمتا تهشو بخونینسهم دفترسازی ۲ ریال ضرر ریال مجمع سود ندنیعنی شاه دزدن شرکت', 'سلام دوستان ۶۳۷۵ تماس گرفتم گفتن صورتهای مالی حسابرسی پیشبینی ۹۶ حسابرسی ۶ ماهه حساب

In [73]:
sample_test = next(iter(test_data_loader))
print(sample_test.keys())

dict_keys(['content', 'input_ids', 'attention_mask', 'token_type_ids'])


In [74]:
class SentimentModel(nn.Module):

    def __init__(self, config):
        super(SentimentModel, self).__init__()

        self.bert = BertModel.from_pretrained(MODEL_NAME_OR_PATH)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

In [75]:
import torch, gc

gc.collect()
torch.cuda.empty_cache()
pt_model = None

!nvidia-smi

Tue Jul  9 10:36:09 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      Off | 00000000:00:03.0 Off |                    0 |
| N/A   37C    P8              16W /  72W |      4MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [76]:
pt_model = SentimentModel(config=config)
pt_model = pt_model.to(device)

print('pt_model', type(pt_model))

pytorch_model.bin:   0%|          | 0.00/654M [00:00<?, ?B/s]

pt_model <class '__main__.SentimentModel'>


In [77]:
# sample data output

sample_data_comment = sample_data['content']
sample_data_input_ids = sample_data['input_ids']
sample_data_attention_mask = sample_data['attention_mask']
sample_data_token_type_ids = sample_data['token_type_ids']
sample_data_targets = sample_data['targets']

# available for using in GPU
sample_data_input_ids = sample_data_input_ids.to(device)
sample_data_attention_mask = sample_data_attention_mask.to(device)
sample_data_token_type_ids = sample_data_token_type_ids.to(device)
sample_data_targets = sample_data_targets.to(device)


# outputs = F.softmax(
#     pt_model(sample_data_input_ids, sample_data_attention_mask, sample_data_token_type_ids),
#     dim=1)

outputs = pt_model(sample_data_input_ids, sample_data_attention_mask, sample_data_token_type_ids)
_, preds = torch.max(outputs, dim=1)

print(outputs[:5, :])
print(preds[:5])

tensor([[ 0.1797,  0.4144],
        [-0.1497,  0.1363],
        [ 0.3115,  0.2185],
        [ 0.3476,  0.3139],
        [ 0.0426,  0.3420]], device='cuda:0', grad_fn=<SliceBackward0>)
tensor([1, 1, 0, 0, 1], device='cuda:0')


In [78]:
def simple_accuracy(y_true, y_pred):
    return (y_true == y_pred).mean()

def acc_and_f1(y_true, y_pred, average='weighted'):
    acc = simple_accuracy(y_true, y_pred)
    f1 = f1_score(y_true=y_true, y_pred=y_pred, average=average)
    return {
        "acc": acc,
        "f1": f1,
    }

def y_loss(y_true, y_pred, losses):
    y_true = torch.stack(y_true).cpu().detach().numpy()
    y_pred = torch.stack(y_pred).cpu().detach().numpy()
    y = [y_true, y_pred]
    loss = np.mean(losses)

    return y, loss


def eval_op(model, data_loader, loss_fn):
    model.eval()

    losses = []
    y_pred = []
    y_true = []

    with torch.no_grad():
        for dl in tqdm(data_loader, total=len(data_loader), desc="Evaluation... "):

            input_ids = dl['input_ids']
            attention_mask = dl['attention_mask']
            token_type_ids = dl['token_type_ids']
            targets = dl['targets']

            # move tensors to GPU if CUDA is available
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            targets = targets.to(device)

            # compute predicted outputs by passing inputs to the model
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids)

            # convert output probabilities to predicted class
            _, preds = torch.max(outputs, dim=1)

            # calculate the batch loss
            loss = loss_fn(outputs, targets)

            # accumulate all the losses
            losses.append(loss.item())

            y_pred.extend(preds)
            y_true.extend(targets)

    eval_y, eval_loss = y_loss(y_true, y_pred, losses)
    return eval_y, eval_loss


def train_op(model,
             data_loader,
             loss_fn,
             optimizer,
             scheduler,
             step=0,
             print_every_step=100,
             eval=False,
             eval_cb=None,
             eval_loss_min=np.Inf,
             eval_data_loader=None,
             clip=0.0):

    model.train()

    losses = []
    y_pred = []
    y_true = []

    for dl in tqdm(data_loader, total=len(data_loader), desc="Training... "):
        step += 1

        input_ids = dl['input_ids']
        attention_mask = dl['attention_mask']
        token_type_ids = dl['token_type_ids']
        targets = dl['targets']

        # move tensors to GPU if CUDA is available
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        targets = targets.to(device)

        # clear the gradients of all optimized variables
        optimizer.zero_grad()

        # compute predicted outputs by passing inputs to the model
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids)

        # convert output probabilities to predicted class
        _, preds = torch.max(outputs, dim=1)

        # calculate the batch loss
        loss = loss_fn(outputs, targets)

        # accumulate all the losses
        losses.append(loss.item())

        # compute gradient of the loss with respect to model parameters
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        if clip > 0.0:
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip)

        # perform optimization step
        optimizer.step()

        # perform scheduler step
        scheduler.step()

        y_pred.extend(preds)
        y_true.extend(targets)

        if eval:
            train_y, train_loss = y_loss(y_true, y_pred, losses)
            train_score = acc_and_f1(train_y[0], train_y[1], average='weighted')

            if step % print_every_step == 0:
                eval_y, eval_loss = eval_op(model, eval_data_loader, loss_fn)
                eval_score = acc_and_f1(eval_y[0], eval_y[1], average='weighted')

                if hasattr(eval_cb, '__call__'):
                    eval_loss_min = eval_cb(model, step, train_score, train_loss, eval_score, eval_loss, eval_loss_min)

    train_y, train_loss = y_loss(y_true, y_pred, losses)

    return train_y, train_loss, step, eval_loss_min

In [79]:
optimizer = AdamW(pt_model.parameters(), lr=LEARNING_RATE, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss()

step = 0
eval_loss_min = np.Inf
history = collections.defaultdict(list)


def eval_callback(epoch, epochs, output_path):
    def eval_cb(model, step, train_score, train_loss, eval_score, eval_loss, eval_loss_min):
        statement = ''
        statement += 'Epoch: {}/{}...'.format(epoch, epochs)
        statement += 'Step: {}...'.format(step)

        statement += 'Train Loss: {:.6f}...'.format(train_loss)
        statement += 'Train Acc: {:.3f}...'.format(train_score['acc'])

        statement += 'Valid Loss: {:.6f}...'.format(eval_loss)
        statement += 'Valid Acc: {:.3f}...'.format(eval_score['acc'])

        print(statement)

        if eval_loss <= eval_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
                eval_loss_min,
                eval_loss))

            torch.save(model.state_dict(), output_path)
            eval_loss_min = eval_loss

        return eval_loss_min


    return eval_cb


for epoch in tqdm(range(1, EPOCHS + 1), desc="Epochs... "):
    train_y, train_loss, step, eval_loss_min = train_op(
        model=pt_model,
        data_loader=train_data_loader,
        loss_fn=loss_fn,
        optimizer=optimizer,
        scheduler=scheduler,
        step=step,
        print_every_step=EEVERY_EPOCH,
        eval=True,
        eval_cb=eval_callback(epoch, EPOCHS, OUTPUT_PATH),
        eval_loss_min=eval_loss_min,
        eval_data_loader=valid_data_loader,
        clip=CLIP)

    train_score = acc_and_f1(train_y[0], train_y[1], average='weighted')

    eval_y, eval_loss = eval_op(
        model=pt_model,
        data_loader=valid_data_loader,
        loss_fn=loss_fn)

    eval_score = acc_and_f1(eval_y[0], eval_y[1], average='weighted')

    history['train_acc'].append(train_score['acc'])
    history['train_loss'].append(train_loss)
    history['val_acc'].append(eval_score['acc'])
    history['val_loss'].append(eval_loss)





Epochs... :   0%|          | 0/3 [00:00<?, ?it/s]

Training... :   0%|          | 0/917 [00:00<?, ?it/s]

Evaluation... :   0%|          | 0/102 [00:00<?, ?it/s]

Training... :   0%|          | 0/917 [00:00<?, ?it/s]

Evaluation... :   0%|          | 0/102 [00:00<?, ?it/s]

Epoch: 2/3...Step: 1000...Train Loss: 0.376981...Train Acc: 0.834...Valid Loss: 0.598474...Valid Acc: 0.754...
Validation loss decreased (inf --> 0.598474).  Saving model ...


Evaluation... :   0%|          | 0/102 [00:00<?, ?it/s]

Training... :   0%|          | 0/917 [00:00<?, ?it/s]

Evaluation... :   0%|          | 0/102 [00:00<?, ?it/s]

Epoch: 3/3...Step: 2000...Train Loss: 0.267220...Train Acc: 0.902...Valid Loss: 0.700679...Valid Acc: 0.752...


Evaluation... :   0%|          | 0/102 [00:00<?, ?it/s]

In [80]:
def predict(model, comments, tokenizer, max_len=235, batch_size=16):
    data_loader = create_data_loader(comments, None, tokenizer, max_len, batch_size, None)

    predictions = []
    prediction_probs = []


    model.eval()
    with torch.no_grad():
        for dl in tqdm(data_loader, position=0):
            input_ids = dl['input_ids']
            attention_mask = dl['attention_mask']
            token_type_ids = dl['token_type_ids']

            # move tensors to GPU if CUDA is available
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)

            # compute predicted outputs by passing inputs to the model
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids)

            # convert output probabilities to predicted class
            _, preds = torch.max(outputs, dim=1)

            predictions.extend(preds)
            prediction_probs.extend(F.softmax(outputs, dim=1))

    predictions = torch.stack(predictions).cpu().detach().numpy()
    prediction_probs = torch.stack(prediction_probs).cpu().detach().numpy()

    return predictions, prediction_probs

In [81]:
test_contents = test['content'].to_numpy()
preds, probs = predict(pt_model, test_contents, tokenizer, max_len=256)

print(preds.shape, probs.shape)

  0%|          | 0/114 [00:00<?, ?it/s]

(1811,) (1811, 2)


In [82]:
# epoch 3, clip=1.0, learning_rate=2e5, batchsize=16
y_test, y_pred = [label_list.index(label) for label in test['label'].values], preds

print(f'F1: {f1_score(y_test, y_pred, average="weighted")}')
print()
print(classification_report(y_test, y_pred, target_names=label_list))

F1: 0.7747124406409746

              precision    recall  f1-score   support

    negative       0.77      0.77      0.77       896
    positive       0.78      0.78      0.78       915

    accuracy                           0.77      1811
   macro avg       0.77      0.77      0.77      1811
weighted avg       0.77      0.77      0.77      1811



In [84]:
# save the model
import torch

# Save the model
torch.save(pt_model.state_dict(), '/content/drive/My Drive/Datasets_Chapter2_phd/Sahamyab/bert-fa-base-uncased-sentiment-socialmedia-finetune-parsbert/pytorch_model_finetune_parsbert.pth')

# Save the tokenizer
tokenizer.save_pretrained('/content/drive/My Drive/Datasets_Chapter2_phd/Sahamyab/bert-fa-base-uncased-sentiment-socialmedia-finetune-parsbert/')

('/content/drive/My Drive/Datasets_Chapter2_phd/Sahamyab/bert-fa-base-uncased-sentiment-socialmedia-finetune-parsbert/tokenizer_config.json',
 '/content/drive/My Drive/Datasets_Chapter2_phd/Sahamyab/bert-fa-base-uncased-sentiment-socialmedia-finetune-parsbert/special_tokens_map.json',
 '/content/drive/My Drive/Datasets_Chapter2_phd/Sahamyab/bert-fa-base-uncased-sentiment-socialmedia-finetune-parsbert/vocab.txt',
 '/content/drive/My Drive/Datasets_Chapter2_phd/Sahamyab/bert-fa-base-uncased-sentiment-socialmedia-finetune-parsbert/added_tokens.json')

### Loading the model for later

In [86]:
model = SentimentModel(config=config)
model = model.to(device)

In [87]:
# Load the saved model state dictionary
model.load_state_dict(torch.load('/content/drive/My Drive/Datasets_Chapter2_phd/Sahamyab/bert-fa-base-uncased-sentiment-socialmedia-finetune-parsbert/pytorch_model_finetune_parsbert.pth'))
model.eval()  # Set the model to evaluation mode

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('/content/drive/My Drive/Datasets_Chapter2_phd/Sahamyab/bert-fa-base-uncased-sentiment-socialmedia-finetune-parsbert/')

In [88]:
model = model.to(device)

In [91]:
def sentiment_analysis(texts, model, tokenizer):
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Move inputs to the device
    with torch.no_grad():
        logits = model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            token_type_ids=inputs['token_type_ids']
        )
    probabilities = F.softmax(logits, dim=1)
    predictions = torch.argmax(probabilities, dim=1)
    return predictions, probabilities

# Example texts to analyze
texts = ["به گزارش کدال نگر بورس24 ، شرکت پتروشیمی فسا در 12 ماهه منتهی به اسفند 99 برای هر سهم 15 ریال سود محقق کرد که به نسبت دوره قبل کاهش 35 درصدی داشته است..",
         "به گزارش بورس نیوز، شرکت داروسازی دکتر عبیدی از انتشار اوراق بدهی خبر داد.بر این اساس، این شرکت اعلام کرد، به منظور تامین بخشی از سرمایه در گردش مورد نیاز خود جهت خرید مواد اولیه مصرفی اوراق مرابحه منتشر می‌کند.",
         "سهام خودرو گند زد! حتما میریزه پایین"]

# Perform sentiment analysis
predictions, probabilities = sentiment_analysis(texts, model, tokenizer)

# Print the results
sentiment_labels = {0: "negative", 1: "positive"}
for text, prediction, probability in zip(texts, predictions, probabilities):
    print(f"Text: {text}\nLabel: {sentiment_labels[prediction.item()]}, Score: {probability.max().item()}\n")

Text: به گزارش کدال نگر بورس24 ، شرکت پتروشیمی فسا در 12 ماهه منتهی به اسفند 99 برای هر سهم 15 ریال سود محقق کرد که به نسبت دوره قبل کاهش 35 درصدی داشته است..
Label: positive, Score: 0.9951604008674622

Text: به گزارش بورس نیوز، شرکت داروسازی دکتر عبیدی از انتشار اوراق بدهی خبر داد.بر این اساس، این شرکت اعلام کرد، به منظور تامین بخشی از سرمایه در گردش مورد نیاز خود جهت خرید مواد اولیه مصرفی اوراق مرابحه منتشر می‌کند.
Label: positive, Score: 0.9960053563117981

Text: سهام خودرو گند زد! حتما میریزه پایین
Label: negative, Score: 0.9921733140945435

