In [1]:
import numpy as np
import pandas as pd
%load_ext autoreload
%autoreload 2

# Load data

In [2]:
from utils.pre_processing import combine_scraped_data
from config import DATA_PATHS
from utils.pre_processing import clean_scraped_data
from utils.pre_processing import labels_indexes_mapping

for phase in DATA_PATHS:
    metadata_name = DATA_PATHS[phase]['metadata']
    data_name = DATA_PATHS[phase]['data']
    combine_scraped_data(f'data/{data_name}', f'data/{metadata_name}', is_train=False)


train = clean_scraped_data('data/train_processed.xlsx')
test = clean_scraped_data('data/test_processed.xlsx')

# Create labels to indexes and indexes to labels dicts
label_to_idx, idx_to_label = labels_indexes_mapping(test)
# change the label to number
train.loc[:, 'labels'] = train.labels.apply(lambda x: label_to_idx.get(x))

In [3]:
train.head()

Unnamed: 0,labels,title
0,0.0,internship digital marketer
1,0.0,marketing executive
2,0.0,marketing staff
3,0.0,sales representations
4,0.0,marketing & business development executive


In [4]:
from datasets import TitlesDataset
from torch.utils.data import DataLoader

train_ds = TitlesDataset(train.title.tolist(), train.labels.tolist())

In [5]:
import torch
# PARAMETERS

# train parameters
batch_size = 8
num_epochs = 1
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f'DEVICE: {device}')

# embedding parameters
max_length = 128

#optimizer parameters
lr = 5e-5

DEVICE: cuda


In [6]:
train_dataloader = DataLoader(train_ds, shuffle=True, batch_size=batch_size)

In [7]:
from models import BERT
from transformers import AdamW

model = BERT(model_name='bert-base-uncased',num_labels=len(label_to_idx), max_length=max_length,device=device)

optimizer = AdamW(model.parameters(), lr=lr)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [10]:
from trainer import Trainer
trainer = Trainer(model=model, optimizer=optimizer, train_dataloader=train_dataloader,
                  num_epochs=num_epochs, device=device)

In [14]:
trainer.train()

  0%|          | 0/1933 [00:00<?, ?it/s]

In [15]:
labels, preds = trainer.evaluate(train_dataloader)

  0%|          | 0/1933 [00:00<?, ?it/s]

In [24]:
l = list(np.nan_to_num(np.array(labels), 0))

In [26]:
from sklearn.metrics import classification_report
print(classification_report(l, preds, zero_division=1))

              precision    recall  f1-score   support

         0.0       0.03      0.02      0.02       354
         1.0       0.24      0.90      0.38       575
         2.0       0.00      0.00      0.00       106
         3.0       0.21      0.69      0.33       247
         4.0       1.00      0.00      0.00        37
         5.0       1.00      0.00      0.00        60
         6.0       1.00      0.00      0.00        32
         7.0       1.00      0.00      0.00       203
         8.0       1.00      0.00      0.00       121
         9.0       0.00      0.00      0.00       131
        10.0       1.00      0.00      0.00        19
        11.0       1.00      0.00      0.00        17
        12.0       1.00      0.00      0.00        37
        13.0       1.00      0.00      0.00       110
        14.0       1.00      0.00      0.00        39
        15.0       1.00      0.00      0.00        37
        16.0       1.00      0.00      0.00        17
        17.0       1.00    

In [21]:
labels

[31.0,
 355.0,
 853.0,
 122.0,
 22.0,
 546.0,
 358.0,
 1042.0,
 129.0,
 1047.0,
 995.0,
 76.0,
 49.0,
 19.0,
 1190.0,
 73.0,
 298.0,
 132.0,
 759.0,
 76.0,
 98.0,
 95.0,
 1.0,
 nan,
 73.0,
 78.0,
 9.0,
 43.0,
 28.0,
 31.0,
 310.0,
 12.0,
 1093.0,
 30.0,
 171.0,
 161.0,
 527.0,
 458.0,
 53.0,
 329.0,
 30.0,
 46.0,
 884.0,
 73.0,
 88.0,
 621.0,
 7.0,
 738.0,
 376.0,
 799.0,
 1.0,
 415.0,
 408.0,
 298.0,
 104.0,
 49.0,
 23.0,
 220.0,
 266.0,
 363.0,
 34.0,
 30.0,
 13.0,
 193.0,
 1022.0,
 142.0,
 190.0,
 136.0,
 30.0,
 145.0,
 124.0,
 57.0,
 210.0,
 9.0,
 527.0,
 14.0,
 119.0,
 663.0,
 601.0,
 139.0,
 107.0,
 1.0,
 600.0,
 56.0,
 1084.0,
 251.0,
 1.0,
 213.0,
 4.0,
 103.0,
 104.0,
 121.0,
 12.0,
 nan,
 363.0,
 90.0,
 8.0,
 128.0,
 2.0,
 266.0,
 107.0,
 35.0,
 690.0,
 104.0,
 108.0,
 172.0,
 525.0,
 76.0,
 33.0,
 718.0,
 142.0,
 381.0,
 709.0,
 397.0,
 407.0,
 376.0,
 1.0,
 318.0,
 142.0,
 157.0,
 nan,
 90.0,
 103.0,
 468.0,
 107.0,
 7.0,
 0.0,
 78.0,
 32.0,
 3.0,
 37.0,
 359.0,
 104.0,
 1.