**Fatemeh Salehi Rizi**

**BERT model for forum post classification** 

In [1]:
#!pip3 install transformers
#!pip3 install torch


import re
import os
import pandas as pd

import numpy as np
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
from torch.utils.data import RandomSampler
from tqdm import tqdm, trange
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer


In [2]:
from google.colab import drive

drive.mount('/content/gdrive')


Mounted at /content/gdrive


In [3]:
df_data=pd.read_parquet('/content/gdrive/MyDrive/data/psoriasis_all_posts.parquet', engine='pyarrow')


df_data.head(5)

Unnamed: 0,post_id,text,user,timestamp,subject_id,forum_id
96315,170079,zunehmend oft höre ich über die problematik d...,Annamaria,2010-01-01 21:48:37,13338,4-community
96316,170082,"Hallo Annamaria, herzlich Willkommen hier im...",Fischi,2010-01-01 21:48:37,13338,4-community
96317,170210,"hallo, Annamaria - ich wünsche dir wirklich ...",Bibi,2010-01-01 21:48:37,13338,4-community
30501,170251,Auch ich wünsche euch ein schönes und glückli...,BlackLady,2010-01-02 10:13:06,3969,1-allgemeines
30498,170019,Ich wünsche Allen Frohes Neues Jahr GB Pics...,Bernd-57,2010-01-02 10:13:06,3969,1-allgemeines


In [4]:


from sklearn.preprocessing import LabelEncoder

lb_make = LabelEncoder()
df_data["label_id"] = lb_make.fit_transform(df_data["forum_id"])
df_data["label_id"].unique()
df_data[["forum_id", "label_id"]].head(5)


Unnamed: 0,forum_id,label_id
96315,4-community,3
96316,4-community,3
96317,4-community,3
30501,1-allgemeines,0
30498,1-allgemeines,0


In [5]:

def clean_post(text):
    text= text.strip().lower()
    text= text.replace("&nbsp;", " ")
    text= re.sub(r'<br(\s\/)?>', ' ', text)
    text= re.sub(r' +', ' ', text)  # merge multiple spaces into one

    text = re.sub(r"http\S+", "", text)
    text = re.sub(r'^RT[\s]+', '', text)
    text = re.sub(r'^RT[\s]+', '', text)
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
    text= re.sub(r'#', '', text)
    text= re.sub(r'@', '', text)
    text = re.sub('-', ' ', text)
    text = re.sub('<br\s?\/>|<br>', "", text)
    post = re.sub(r"(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b",'',text)

    return post


In [6]:
post_list=[]
label_list=[]

for index, row in df_data.iterrows():
    

    post_list.append(clean_post(row['text']))
    label_list.append(row['label_id'])

print(len(post_list), len(label_list))    

55888 55888


In [7]:

X= post_list
y=label_list


X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=42)

In [8]:
PAD_TOKEN_LABEL_ID = CrossEntropyLoss().ignore_index
BATCH_SIZE = 64
LEARNING_RATE_MODEL = 1e-5
LEARNING_RATE_CLASSIFIER = 1e-3
WARMUP_STEPS = 0
GRADIENT_ACCUMULATION_STEPS = 1
MAX_GRAD_NORM = 1.0
SEED = 42
NO_CUDA = False

In [9]:
def rpad(array, n):
    current_len = len(array)
    if current_len > n:
        return array[:n]
    extra = n - current_len
    return array + ([0] * extra)


def convert_to_embedding(tokenizer, sentences_with_labels):
    for sentence, label in sentences_with_labels:
        tokens = tokenizer.tokenize(sentence)
        tokens = tokens[:124]
        bert_sent = rpad(tokenizer.convert_tokens_to_ids(["CLS"] + tokens + ["SEP"]), n=128)
        yield torch.tensor(bert_sent), torch.tensor(label, dtype=torch.int64)



def get_data(tokenizer, sampler=RandomSampler, train=True):
  
    if train:
       sentences_with_labels = zip(X_train, y_train)
       

    if not train:
       sentences_with_labels = zip(X_test, y_test)

    dataset = list(convert_to_embedding(tokenizer, sentences_with_labels))
    sampler_func = sampler(dataset) if sampler is not None else None
    dataloader = DataLoader(dataset, sampler=sampler_func, batch_size=BATCH_SIZE)

    return dataloader


In [10]:
class Transformers:
    
    
    model = None

    def __init__(self, tokenizer):

        self.pad_token_label_id = PAD_TOKEN_LABEL_ID

        torch.cuda.empty_cache()
      
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        torch.cuda.empty_cache()

        self.tokenizer = tokenizer

    def predict(self, sentence):
        if self.model is None or self.tokenizer is None:
            self.load()

        embeddings = list(convert_to_embedding([(sentence, -1)]))
        preds = self._predict_tags_batched(embeddings)
        return preds

    def evaluate(self, dataloader):
        from sklearn.metrics import classification_report
        y_pred = self._predict_tags_batched(dataloader)
      
        score = classification_report(y_test, y_pred)

        fsc= f1_score(y_test, y_pred, average='micro')
        acc=accuracy_score(y_test, y_pred)

       

        print('\n')

        print('********************************************************************')
        print('F1 score: ', fsc)
        print('Accuracy:', acc)

        print('********************************************************************')

        print(score)
       

    def _predict_tags_batched(self, dataloader):
        preds = []
        self.model.eval()
        for batch in tqdm(dataloader, desc="Computing NER tags"):
            batch = tuple(t.to(self.device) for t in batch)

            with torch.no_grad():
                outputs = self.model(batch[0])
                _, is_neg = torch.max(outputs[0], 1)
                preds.extend(is_neg.cpu().detach().numpy())

        return preds

    def train(self, dataloader, model, epochs):
        assert self.model is None  # make sure we are not training after load() command
        model.to(self.device)

        print('our processor ....', self.device)


        self.model = model

        t_total = len(dataloader) // GRADIENT_ACCUMULATION_STEPS * epochs

        # Prepare optimizer and schedule 
        optimizer_grouped_parameters = [
            {"params": model.bert.parameters(), "lr": LEARNING_RATE_MODEL},
            {"params": model.classifier.parameters(), "lr": LEARNING_RATE_CLASSIFIER}
        ]
        optimizer = AdamW(optimizer_grouped_parameters)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=t_total)

        # Train!
        print("***** Running training *****")
        print("Training on %d examples"% len(dataloader))
        print("Num Epochs = %d"% epochs)
        print("Total optimization steps = %d"% t_total)

        global_step = 0
        tr_loss, logging_loss = 0.0, 0.0
        model.zero_grad()
        train_iterator = trange(epochs, desc="Epoch")
        self._set_seed()
        for _ in train_iterator:
            epoch_iterator = tqdm(dataloader, desc="Iteration")
            for step, batch in enumerate(epoch_iterator):
                model.train()
                batch = tuple(t.to(self.device) for t in batch)
                outputs = model(batch[0], labels=batch[1])
                loss = outputs[0]  # model outputs are always tuple in pytorch-transformers 

                if GRADIENT_ACCUMULATION_STEPS > 1:
                    loss = loss / GRADIENT_ACCUMULATION_STEPS

                loss.backward()

                tr_loss += loss.item()
                if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)

                    scheduler.step()  # Update learning rate schedule
                    optimizer.step()
                    model.zero_grad()
                    global_step += 1

        self.model = model

        return global_step, tr_loss / global_step

    def _set_seed(self):
        torch.manual_seed(SEED)
        if self.device == 'gpu':
            torch.cuda.manual_seed_all(SEED)

    def load(self, model_dir='weights/'):
        self.tokenizer = BertTokenizer.from_pretrained(model_dir)
        self.model = BertForSequenceClassification.from_pretrained(model_dir)
        self.model.to(self.device)

In [11]:

def train(epochs=20, output_dir="weights/"):

    num_labels = 4 

    CUDA_LAUNCH_BLOCKING=1 

    config = BertConfig.from_pretrained('bert-base-multilingual-uncased', num_labels=num_labels)
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', do_lower_case=True)
    model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-uncased', config=config)

    dataloader = get_data(tokenizer, train=True)
    predictor = Transformers(tokenizer)
    predictor.train(dataloader, model, epochs)

    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

def evaluate(model_dir="weights/"):
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', do_lower_case=True)

    dataloader = get_data(tokenizer, train=False, sampler=None)
    predictor = Transformers(tokenizer)
    predictor.load(model_dir=model_dir)
    predictor.evaluate(dataloader)




path = '/content/gdrive/My Drive/weights/'

#os.makedirs(path, exist_ok=True)
train(epochs=1, output_dir=path)
evaluate(model_dir=path)

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/672M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

our processor .... cuda:0
***** Running training *****
Training on 586 examples
Num Epochs = 1
Total optimization steps = 586


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 1/586 [00:02<26:15,  2.69s/it][A
Iteration:   0%|          | 2/586 [00:05<24:06,  2.48s/it][A
Iteration:   1%|          | 3/586 [00:07<23:25,  2.41s/it][A
Iteration:   1%|          | 4/586 [00:09<23:04,  2.38s/it][A
Iteration:   1%|          | 5/586 [00:12<22:49,  2.36s/it][A
Iteration:   1%|          | 6/586 [00:14<22:40,  2.35s/it][A
Iteration:   1%|          | 7/586 [00:16<22:37,  2.35s/it][A
Iteration:   1%|▏         | 8/586 [00:19<22:35,  2.35s/it][A
Iteration:   2%|▏         | 9/586 [00:21<22:30,  2.34s/it][A
Iteration:   2%|▏         | 10/586 [00:23<22:29,  2.34s/it][A
Iteration:   2%|▏         | 11/586 [00:26<22:28,  2.35s/it][A
Iteration:   2%|▏         | 12/586 [00:28<22:25,  2.34s/it][A
Iteration:   2%|▏         | 13/586 [00:30<22:25,  2.35s/it][A
Iteration:   2%|▏         | 14/586 [00:33<22:22,  2.35s/it][A
Iteration:   3%|▎         | 15/586 [00:35<22:20,  2.35s/it][A
Iteration:   3%|▎ 



********************************************************************
F1 score:  0.5878876599436131
Accuracy: 0.5878876599436131
********************************************************************
              precision    recall  f1-score   support

           0       0.58      0.65      0.62      7408
           1       0.52      0.54      0.53      3739
           2       0.57      0.57      0.57      1181
           3       0.65      0.55      0.60      6116

    accuracy                           0.59     18444
   macro avg       0.58      0.58      0.58     18444
weighted avg       0.59      0.59      0.59     18444




