# Imports and settings

In [1]:
import numpy as np
import pandas as pd

%load_ext autoreload
%autoreload 2

In [2]:
# Global parameters
MODEL_NAME = "distilbert-base-uncased"


# Load and process data

In [12]:
from utils.pre_processing import combine_scraped_data
from config import data_paths

for phase in data_paths:
    metadata_name = data_paths[phase]['metadata']
    data_name = data_paths[phase]['data']
    combine_scraped_data(f'data/{data_name}', f'data/{metadata_name}', is_train=False)

In [16]:
from utils.pre_processing import clean_scraped_data

train = clean_scraped_data('data/train_processed.xlsx')
test = clean_scraped_data('data/test_processed.xlsx')

Unnamed: 0,labels,title
0,marketing assistant,marketing
1,marketing assistant,marketing advisor
2,marketing assistant,admin marketing
3,marketing assistant,sales executive manager
4,marketing assistant,talent executive
5,marketing assistant,e-commerce senior executive (mandarin speaking)
6,marketing assistant,digital marketing & social media
7,marketing assistant,intern - digital marketing
8,marketing assistant,sales marketing development division
9,marketing assistant,marketing executive


In [5]:
from utils.pre_processing import labels_indexes_mapping
# Create labels to indexes and indexes to labels dicts
label_to_idx , idx_to_label = labels_indexes_mapping(test)
# change the label to number
test.loc[:, 'labels'] = test.labels.apply(lambda x: label_to_idx.get(x))


In [6]:
from utils.dataloader import create_tokenizer_dataloader
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    return tokenizer(examples["title"], padding='max_length', max_length = 64, truncation=True)


train_dataloader = create_tokenizer_dataloader(test, tokenize_function)

  0%|          | 0/15463 [00:00<?, ?ex/s]

In [7]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(idx_to_label))
model

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classi

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [8]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [9]:
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [10]:
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [11]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/2901 [00:00<?, ?it/s]

In [139]:
output = tokenizer.decode(tokenized_data[3]['input_ids'].squeeze(), skip_special_tokens=True)
output

'sales executive manager'

In [13]:
# metric = load_metric("accuracy")
model.eval()
for batch in train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    print(logits.shape)
    o = tokenizer.batch_decode(batch['input_ids'], skip_special_tokens=True)
    la = batch['labels'].tolist()
    predictions = torch.argmax(logits, dim=-1)
    pred = predictions.tolist()
    for t, p, l in zip(o, pred, la):
        print(f'title: {t},\n label: {idx_to_label.get(p)}, \n real: {idx_to_label.get(l)}')
        print('\n')
    print('--------------')
#     metric.add_batch(predictions=predictions, references=batch["labels"])
#
# metric.compute()

torch.Size([16, 1369])
title: quranic teacher,
 label: early years teacher, 
 real: tutor


title: account staff / account assistant,
 label: accounting assistant, 
 real: bookkeeper


title: resort manager,
 label: restaurant manager, 
 real: accommodation manager


title: social media assistant,
 label: online marketer, 
 real: online marketer


title: project manager development,
 label: project manager, 
 real: project manager


title: transport planner ( thai speaker ),
 label: car and van delivery driver, 
 real: transport planner


title: fiber service & installer, kl # mal,
 label: engineering assistant, 
 real: ICT network technician


title: senior operational controller,
 label: management assistant, 
 real: financial controller


title: support & implementation manager,
 label: project manager, 
 real: ICT help desk manager


title: senior sales account manager ( wireline & logging service ),
 label: sales assistant, 
 real: sales account manager


title: lmw & shipping off

KeyboardInterrupt: 

idx_to_label