In [1]:
import pandas as pd

df = pd.read_csv("mbti_1.csv")

print(df.head())

   type                                              posts
0  INFJ  'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1  ENTP  'I'm finding the lack of me in these posts ver...
2  INTP  'Good one  _____   https://www.youtube.com/wat...
3  INTJ  'Dear INTP,   I enjoyed our conversation the o...
4  ENTJ  'You're fired.|||That's another silly misconce...


In [2]:
import re

def clean_text(text):

    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    text = re.sub(r'\@\w+|\#','', text)

    text = re.sub(r'[^\w\s]', '', text)

    text = re.sub(r'\d+', '', text)

    text = text.lower()
    return text

df['cleaned_text'] = df['posts'].apply(clean_text)

In [3]:
import torch
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer

label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['type'])

tokenizer = BertTokenizer.from_pretrained('uncased')

def encode_text(text):
    return tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=256,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )

# encoded_data = df['posts'].apply(encode_text)
encoded_data = df['cleaned_text'].apply(encode_text)
input_ids = torch.cat([item['input_ids'] for item in encoded_data], dim=0)
attention_masks = torch.cat([item['attention_mask'] for item in encoded_data], dim=0)
labels = torch.tensor(df['label'].values, dtype=torch.long)

In [4]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset

dataset = TensorDataset(input_ids, attention_masks, labels)


batch_size = 16


train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size
)

val_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=batch_size
)

In [5]:
from transformers import BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

model = BertForSequenceClassification.from_pretrained(
    'uncased',
    num_labels=16,
    output_attentions=False,
    output_hidden_states=False
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
total_steps = len(train_dataloader) * 4

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)



In [7]:
import torch
import numpy as np
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [8]:
def train_model(model, train_dataloader, val_dataloader, optimizer, scheduler, epochs=4):
    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch in tqdm(train_dataloader):
            b_input_ids = batch[0].to(device)
            b_attention_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            model.zero_grad()

            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_attention_mask, 
                            labels=b_labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()

            optimizer.step()
            scheduler.step()

        avg_train_loss = total_loss / len(train_dataloader)

        print(f"Epoch {epoch+1}/{epochs}")
        print(f"Training loss: {avg_train_loss}")


        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps = 0

        for batch in val_dataloader:
            b_input_ids = batch[0].to(device)
            b_attention_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            with torch.no_grad():
                outputs = model(b_input_ids, 
                                token_type_ids=None, 
                                attention_mask=b_attention_mask)

            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).flatten()

            preds_classes = [label_encoder.inverse_transform([pred])[0][0] for pred in preds.cpu().numpy()]
            true_classes = [label_encoder.inverse_transform([label])[0][0] for label in b_labels.cpu().numpy()]

            batch_accuracy = sum([1 if p == t else 0 for p, t in zip(preds_classes, true_classes)]) / len(true_classes)
            eval_accuracy += batch_accuracy
            nb_eval_steps += 1

            # eval_accuracy += (preds == b_labels).cpu().numpy().mean()
            # nb_eval_steps += 1

        print(f"Validation Accuracy: {eval_accuracy / nb_eval_steps}")

In [9]:
train_model(model, train_dataloader, val_dataloader, optimizer, scheduler)

  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|████████████████████████████████████████████████████████████████████████████████| 434/434 [17:32<00:00,  2.43s/it]


Epoch 1/4
Training loss: 2.150191197472234
Validation Accuracy: 0.8031618610747051


100%|████████████████████████████████████████████████████████████████████████████████| 434/434 [13:50<00:00,  1.91s/it]


Epoch 2/4
Training loss: 1.7216563886761116
Validation Accuracy: 0.825524246395806


100%|████████████████████████████████████████████████████████████████████████████████| 434/434 [08:54<00:00,  1.23s/it]


Epoch 3/4
Training loss: 1.43989931355973
Validation Accuracy: 0.8226572739187419


100%|████████████████████████████████████████████████████████████████████████████████| 434/434 [08:55<00:00,  1.23s/it]


Epoch 4/4
Training loss: 1.2398153904670943
Validation Accuracy: 0.8289646133682831


In [11]:
model.save_pretrained('mbti_bert_model')
tokenizer.save_pretrained('mbti_bert_tokenizer')

('mbti_bert_tokenizer\\tokenizer_config.json',
 'mbti_bert_tokenizer\\special_tokens_map.json',
 'mbti_bert_tokenizer\\vocab.txt',
 'mbti_bert_tokenizer\\added_tokens.json')