<a href="https://colab.research.google.com/github/felixzhao/title_catgories_classification/blob/main/V7_sentence_bert_NN_experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# install sentence-transformers package

In [23]:
! pip install torch sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# import packages

In [24]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import confusion_matrix,classification_report
from sklearn import metrics

# init model

In [25]:
torch.cuda.is_available()

False

In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "sentence-transformers/bert-base-nli-mean-tokens"
sentence_transformer = SentenceTransformer(model_name).to(device)

# load  data

In [27]:
from google.colab import drive
drive.mount('/content/drive')
root_path = "drive/MyDrive/trademe_data/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
raw_path = f"{root_path}raw.csv"

In [29]:
raw_df = pd.read_csv(raw_path)
raw_df.head(1)

Unnamed: 0.1,Unnamed: 0,x_title,y_cat_id,cat_1,cat_2,cat_3,x_title_feature
0,0,unqualified asbestos remover,5192,trades & services,labourers,labourers,unqualifi asbesto remov


In [30]:
len(raw_df.y_cat_id.unique())

195

In [31]:
raw_df = raw_df[[ 'x_title_feature', 'cat_2']]
print(len(raw_df))
raw_df.head(10)

20000


Unnamed: 0,x_title_feature,cat_2
0,unqualifi asbesto remov,labourers
1,senior test analyst,testing
2,ict trainer supervisor,tutoring & training
3,automot specialist multi facet posit,technicians
4,busi analyst,business & systems analysts
5,experienc excav oper truck driver,machine operators
6,system account,accountants
7,industri electrician,electrical
8,chef,chefs
9,project architect,architects


In [32]:
from sklearn.preprocessing import LabelEncoder
 
# Creating a instance of label Encoder.
le = LabelEncoder()
 
# Using .fit_transform function to fit label
# encoder and return encoded label
label = LabelEncoder().fit_transform(raw_df['cat_2'])
 
raw_df['y'] = label
# # printing label
# print(len(raw_df.y.unique()))
# raw_df

In [33]:
num_classes = len(raw_df.y.unique())
num_classes

161

In [34]:
raw_df.y.unique()

array([ 86, 150, 155, 145,  21,  90,   1,  54,  28,   9,  85,  14,  89,
        26, 126, 105, 102,  34, 101,   5,  75,  19,  60, 111,  27,  92,
        47,  51, 136, 108, 114, 115,  82,  17, 138,  25, 135, 128,  13,
       120,  18,  76, 140, 112,  87,  98,  53, 141, 124,  58, 131,  78,
        94,  33, 156, 158, 143,  81, 129,  64,  30, 122,  20, 118,  23,
       123, 157, 132,  46,  59,  74, 113,  96,  29,  70,  38,  43,  99,
        91,  88,  49, 127,   2, 107,  11,  97, 146, 134, 117,  50,  57,
        95, 144,   6,  83, 148,  61,  22,  73, 137, 133, 139,  68,  41,
       110, 100, 104, 142,  62,  44, 152,  77,  67,  39, 159, 130,   8,
         7, 119,  80,  63, 103, 147, 125, 149,  93, 151, 106,  52,  79,
        71, 160,  37,  65,  69,   3,  72,   4,   0,  40,  10, 154,  15,
       116,  66,  36,  12,  45,  16,  42, 121,  48,  31, 153,  55,  32,
        24, 109,  84,  35,  56])

# sentence bert NN

## classes

## Define the neural network structure for multi-class classification

In [35]:
class MultiClassClassifier(nn.Module):
    def __init__(self, input_size, num_classes):
        super(MultiClassClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, 512)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(512, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

In [46]:
class MultiClassClassifierHandler:
    def __init__(self):
        self.classifier = MultiClassClassifier(768, num_classes).to(device)

    def train(self, X_train, y_train):
        train_embeddings = sentence_transformer.encode(X_train, convert_to_tensor=True).to(device)
        train_labels = torch.tensor(y_train).to(device)
        # Hyperparameters
        epochs = 10
        learning_rate = 0.005
        batch_size = 16

        # Loss function and optimizer
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(self.classifier.parameters(), lr=learning_rate)

        # Training loop
        for epoch in range(epochs):
            for batch_idx in range(0, len(train_embeddings), batch_size):
                batch_embeddings = train_embeddings[batch_idx:batch_idx+batch_size]
                batch_labels = train_labels[batch_idx:batch_idx+batch_size]
                
                optimizer.zero_grad()
                outputs = self.classifier(batch_embeddings)
                loss = criterion(outputs, batch_labels)
                loss.backward()
                optimizer.step()

            print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

    def predict(self, text_list):
        with torch.no_grad():
            sentence_embeddings = sentence_transformer.encode(text_list, convert_to_tensor=True).to(device)
            probabilities = self.classifier(sentence_embeddings)
            max_probs, predicted_classes = torch.max(probabilities, 1)
            return predicted_classes.tolist(), max_probs.tolist()

## split data

In [47]:
X_train, X_test, y_train, y_test = train_test_split(raw_df['x_title_feature'], raw_df['y'], test_size=0.33, random_state=42)

In [48]:
train_sentences = X_train.reset_index(drop=True)
train_labels = y_train.reset_index(drop=True)
test_sentences = X_test.reset_index(drop=True)
test_labels = y_test.reset_index(drop=True)

In [49]:
# train_embeddings = sentence_transformer.encode(train_sentences, convert_to_tensor=True).to(device)
# train_labels = torch.tensor(train_labels).to(device)

# Train model

In [50]:
handler = MultiClassClassifierHandler()
handler.train(train_sentences, train_labels)

Epoch 1/10, Loss: 5.092020511627197
Epoch 2/10, Loss: 5.092020511627197
Epoch 3/10, Loss: 5.092020511627197
Epoch 4/10, Loss: 5.092020511627197
Epoch 5/10, Loss: 5.092020511627197
Epoch 6/10, Loss: 5.092020511627197
Epoch 7/10, Loss: 5.092020511627197
Epoch 8/10, Loss: 5.092020511627197
Epoch 9/10, Loss: 5.092020511627197
Epoch 10/10, Loss: 5.092020511627197


# Eval model

In [51]:
X_test = test_sentences.values.tolist()
y_test = test_labels.values.tolist()

In [52]:
y_pred, y_prob =  handler.predict(X_test)

In [53]:
metrics.accuracy_score(y_test, y_pred)

0.037575757575757575

In [54]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        12
           1       0.00      0.00      0.00       105
           2       0.00      0.00      0.00        30
           3       0.00      0.00      0.00        27
           4       0.00      0.00      0.00         3
           5       0.00      0.00      0.00       274
           6       0.00      0.00      0.00         4
           7       0.00      0.00      0.00        31
           8       0.00      0.00      0.00         7
           9       0.00      0.00      0.00        36
          10       0.00      0.00      0.00         2
          11       0.00      0.00      0.00        36
          12       0.00      0.00      0.00         3
          13       0.00      0.00      0.00        82
          14       0.00      0.00      0.00        65
          15       0.00      0.00      0.00         8
          16       0.00      0.00      0.00         4
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
