<a href="https://colab.research.google.com/github/felixzhao/title_catgories_classification/blob/main/JTC_V5_sentence_bert_NN_experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [58]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd

In [59]:
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [60]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel

# Load the pre-trained Sentence-BERT model and tokenizer
model_name = 'sentence-transformers/paraphrase-mpnet-base-v2'
# model_name = 'openai-gpt'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [74]:
# device = torch.device('cuda')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# load  data

In [75]:
from google.colab import drive
drive.mount('/content/drive')
root_path = "drive/MyDrive/trademe_data/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [63]:
raw_path = f"{root_path}raw.csv"

In [76]:
raw_df = pd.read_csv(raw_path)
raw_df.head(1)

Unnamed: 0.1,Unnamed: 0,x_title,y_cat_id,cat_1,cat_2,cat_3,x_title_feature
0,0,unqualified asbestos remover,5192,trades & services,labourers,labourers,unqualifi asbesto remov


In [89]:
raw_df = raw_df[[ 'x_title_feature', 'y_cat_id']] #[:1000]
print(len(raw_df))
raw_df.head(10)

100


Unnamed: 0,x_title_feature,y_cat_id
0,unqualifi asbesto remov,5192
1,senior test analyst,5123
2,ict trainer supervisor,6894
3,automot specialist multi facet posit,5197
4,busi analyst,5114
5,experienc excav oper truck driver,6891
6,system account,6884
7,industri electrician,5188
8,chef,5100
9,project architect,5034


In [90]:
from sklearn.preprocessing import LabelEncoder
 
# Creating a instance of label Encoder.
le = LabelEncoder()
 
# Using .fit_transform function to fit label
# encoder and return encoded label
label = LabelEncoder().fit_transform(raw_df['y_cat_id'])
 
raw_df['y'] = label
# # printing label
# print(len(raw_df.y.unique()))
# raw_df

In [91]:
num_classes = len(raw_df.y.unique())
num_classes

64

# Workable NN

In [118]:
import pandas as pd
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel

# Load pre-trained Sentence-BERT model and tokenizer
model_name = 'sentence-transformers/paraphrase-mpnet-base-v2'
# model_name = 'sentence-transformers/stsb-bert-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)


In [119]:
# Load and preprocess your data
data = raw_df
text_data = data['x_title_feature'].values.tolist()
label_data = data['y'].values.tolist()

In [126]:

# Define classification model architecture
class SentenceBERTClassifier(nn.Module):
    def __init__(self, num_classes):
        super(SentenceBERTClassifier, self).__init__()
        self.bert = model
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Sequential(
            nn.Linear(model.config.hidden_size, num_classes),
            nn.Softmax(dim=1)
)


    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

# # Load and preprocess data
# data = raw_df
# text_data = data['feature'].values.tolist()
# label_data = data['label'].values.tolist()

# Convert text data to Sentence-BERT embeddings
input_ids = []
attention_masks = []
for text in text_data:
    encoded_dict = tokenizer.encode_plus(
                        text,
                        add_special_tokens = True,
                        max_length = 64,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# Create a PyTorch dataset
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {'input_ids': self.input_ids[idx],
                'attention_mask': self.attention_masks[idx],
                'labels': self.labels[idx]}

dataset = CustomDataset(input_ids, attention_masks, label_data)




In [127]:
# Split dataset into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Create PyTorch dataloaders
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=False)



In [128]:
# Initialize classifier and optimizer
classifier = SentenceBERTClassifier(num_classes=num_classes).to(device)
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-7)
criterion = nn.CrossEntropyLoss()



In [129]:
val_loss_list = []
val_acc_list = []

In [132]:
# Train the classifier
num_epochs = 100
for epoch in range(num_epochs):
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_masks = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        logits = classifier(input_ids=input_ids, attention_mask=attention_masks)
        # print(logits, labels)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

    # Evaluate the model
    with torch.no_grad():
        val_loss = []
        val_accuracy = []
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_masks = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            logits = classifier(input_ids=input_ids, attention_mask=attention_masks)
            loss = criterion(logits, labels)
            val_loss.append(loss.item())
            predictions = torch.argmax(logits, axis=-1)
            accuracy = torch.mean((predictions == labels).float())
            val_accuracy.append(accuracy.item())
        avg_val_loss = sum(val_loss) / len(val_loss)
        avg_val_accuracy = sum(val_accuracy) / len(val_accuracy)
        print(f'Epoch {epoch+1}: Validation Loss = {avg_val_loss:.2f}, Validation Accuracy = {avg_val_accuracy:.2f}')
        val_loss_list.append(avg_val_loss)
        val_acc_list.append(avg_val_accuracy)

Epoch 1: Validation Loss = 4.16, Validation Accuracy = 0.00
Epoch 2: Validation Loss = 4.16, Validation Accuracy = 0.00
Epoch 3: Validation Loss = 4.16, Validation Accuracy = 0.05
Epoch 4: Validation Loss = 4.16, Validation Accuracy = 0.00
Epoch 5: Validation Loss = 4.16, Validation Accuracy = 0.00
Epoch 6: Validation Loss = 4.16, Validation Accuracy = 0.00


KeyboardInterrupt: ignored