In [27]:
import json
import os
import warnings

import torch
from transformers import AdamW

from trainer import Trainer

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load data

In [28]:
# python load data
from utils.pre_processing import combine_scraped_data
from config import DATA_PATHS
from utils.pre_processing import labels_indexes_mapping
from sklearn.model_selection import train_test_split

# Run combine_scraped_data once per phase listed in DATA_PATHS, pairing the
# phase's data file with its metadata file (both live under data/).
# NOTE(review): is_train=False is passed for every phase — confirm this is
# intended for the training phase as well.
for phase, paths in DATA_PATHS.items():
    combine_scraped_data(
        f"data/{paths['data']}",
        f"data/{paths['metadata']}",
        is_train=False,
    )

In [29]:
from utils.pre_processing import clean_scraped_data_to_code
# Load the processed train and test spreadsheets through the same cleaning helper.
train_base, test = (
    clean_scraped_data_to_code(path)
    for path in ('data/train_processed.xlsx', 'data/test_processed.xlsx')
)

In [30]:
def clean_label_code(code: str) -> str:
    """Shorten a raw label code to its leading group.

    Takes the text before the first '.', then drops the last two characters
    of that text, e.g. '2411.03' -> '24'.
    """
    head, _, _ = code.partition('.')
    return head[:-2]

# Shorten the label codes of both frames in place.
# NOTE: not idempotent — running this cell a second time trims the
# already-shortened codes again.
for _frame in (train_base, test):
    _frame.loc[:, 'labels'] = _frame['labels'].apply(clean_label_code)

# Data exploration

In [32]:
# Class balance: number of training rows per label, most frequent first.
train_base['labels'].value_counts()

24    2117
25    1648
21    1430
33    1426
52     932
31     839
12     555
23     494
83     480
93     480
14     398
72     372
51     303
43     291
13     286
42     275
35     274
94     271
41     246
26     243
81     220
91     210
75     191
44     185
34     171
22     155
74     146
32     141
82     109
53     107
71      98
92      85
54      75
11      66
73      51
96      45
61      31
95       5
62       4
01       3
02       3
03       2
Name: labels, dtype: int64

# Create train, val, test

In [9]:
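# NOTE: disabled experiment that would collapse rare labels into a single 'other' class.
# `low_freq_labels` is not defined in any visible cell, so it has to be built
# (e.g. from the label counts above) before these lines can be re-enabled.
# # apply the label
# train_base.labels = train_base.labels.apply(lambda l: 'other' if l in low_freq_labels else l)
# test.labels = test.labels.apply(lambda l: 'other' if l in low_freq_labels else l)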

In [33]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the training frame for validation; the fixed seed keeps the
# split repeatable across runs.
train, val = train_test_split(
    train_base,
    test_size=0.25,
    random_state=11,
    shuffle=True,
)
(train.shape, val.shape)

((11597, 2), (3866, 2))

In [34]:
# Build the label lookups from the full training frame; the call's result is
# unpacked into `label_to_idx` and `idx_to_label`.
# NOTE(review): the next cell repeats this import and call verbatim, so this
# cell is redundant.
from utils.pre_processing import labels_indexes_mapping
label_to_idx, idx_to_label = labels_indexes_mapping(train_base)

In [35]:
# Convert the string labels of every split to numbers so they can be used for training.
from utils.pre_processing import labels_indexes_mapping

# One mapping, built from the full training frame, is applied to all three splits.
label_to_idx, idx_to_label = labels_indexes_mapping(train_base)


def _encode_labels(frame, split_name):
    """Replace ``frame.labels`` in place with the values looked up in ``label_to_idx``.

    Labels that are missing from the mapping are still stored as null, exactly
    as before, but a warning now reports how many were affected instead of
    letting them pass silently (this would presumably also fire if the cell is
    re-run on labels that were already encoded).

    Args:
        frame: DataFrame with a ``labels`` column; modified in place.
        split_name: Name of the split, used only in the warning text.
    """
    encoded = frame.labels.apply(lambda x: label_to_idx.get(x))
    n_unmapped = int(encoded.isna().sum())
    if n_unmapped:
        warnings.warn(
            f"{n_unmapped} of {len(encoded)} '{split_name}' labels are not in "
            "label_to_idx and were encoded as null",
            stacklevel=2,
        )
    frame.loc[:, 'labels'] = encoded


_encode_labels(train, 'train')
_encode_labels(val, 'val')
_encode_labels(test, 'test')
train.shape, val.shape, test.shape

((11597, 2), (3866, 2), (15463, 2))

In [36]:
# Wrap every split in a TitlesDataset built from its title and label columns.
from datasets import TitlesDataset

train_ds, val_ds, test_ds = (
    TitlesDataset(split['title'].tolist(), split['labels'].tolist())
    for split in (train, val, test)
)

## Evaluation

In [37]:
from torch.utils.data import DataLoader
# Mini-batch size shared by all three loaders.
batch_size = 16

# Only the training loader shuffles. Validation and test batches are served in
# dataset order so that the labels/predictions returned by an evaluation pass
# come back in a reproducible order.
train_dataloader = DataLoader(train_ds, shuffle=True, batch_size=batch_size)
val_dataloader = DataLoader(val_ds, shuffle=False, batch_size=batch_size)
test_dataloader = DataLoader(test_ds, shuffle=False, batch_size=batch_size)

# Hyperparameter optimization

In [48]:
import optuna
from sklearn.metrics import f1_score
from models import BERT, tBERT
from utils.pre_processing import create_titles_corpus

# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Number of Optuna trials run by each study below.
N_TRAILS = 1

In [50]:
def bert_objective(trail) -> float:
    """Optuna objective for the plain BERT classifier.

    Samples a maximum sequence length and a learning rate, trains a freshly
    created ``BERT`` model for one epoch on ``train_dataloader`` and scores it
    on ``val_dataloader``.

    Args:
        trail: The Optuna trial used to sample the hyperparameters.

    Returns:
        Micro-averaged F1 of the predictions on the validation loader.
    """
    # Search space.
    embeddings_length = trail.suggest_categorical('embeddings_length', [32, 64])
    # Log-scale search over the learning rate. `suggest_float(..., log=True)`
    # replaces the deprecated `suggest_loguniform` and keeps the same range
    # and parameter name.
    lr = trail.suggest_float('lr', 1e-6, 1e-3, log=True)

    # One output unit per known label; the sampled length is passed as max_length.
    model = BERT(
        model_name='bert-base-uncased',
        num_labels=len(label_to_idx),
        max_length=embeddings_length,
        device=DEVICE,
    )
    optimizer = AdamW(model.parameters(), lr=lr)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        train_dataloader=train_dataloader,
        num_epochs=1,
        device=DEVICE,
    )
    trainer.train()

    labels, preds = trainer.evaluate(val_dataloader)
    return f1_score(labels, preds, average='micro')

# Make sure the output directory exists before the (slow) search starts, so a
# finished study cannot end in a FileNotFoundError when the results are saved.
os.makedirs('results', exist_ok=True)

# Search for the BERT hyperparameters that maximise the validation F1.
study = optuna.create_study(direction='maximize')
study.optimize(bert_objective, n_trials=N_TRAILS)

# Persist the best trial's parameters together with its score.
trail_ = study.best_trial
with open('results/BERT_best_params.json', 'w') as f:
    d = dict(trail_.params)
    d['f1'] = trail_.values[0]
    json.dump(d, f, indent=4)
print(f'BEST TRAIL:\n f1:  {trail_.values}\nparams: {trail_.params}')

[32m[I 2022-08-06 17:05:23,072][0m A new study created in memory with name: no-name-2a7c45a1-5fb0-4d79-8c74-6be52fa80f92[0m
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertFo

  0%|          | 0/725 [00:00<?, ?it/s]

  0%|          | 0/242 [00:00<?, ?it/s]

[32m[I 2022-08-06 17:06:57,909][0m Trial 0 finished with value: 0.4477496120020693 and parameters: {'embeddings_length': 64, 'lr': 0.0008525880501640893}. Best is trial 0 with value: 0.4477496120020693.[0m


TypeError: 'list' object is not a mapping

In [49]:
def tbert_objective(trail) -> float:
    """Optuna objective for the tBERT classifier.

    Samples the sequence length, learning rate and the two settings the
    notebook labels as LDA parameters (``n_topics``, ``alpha``), builds a
    titles corpus from ``train_dataloader``, trains a freshly created
    ``tBERT`` model for one epoch and scores it on ``val_dataloader``.

    Args:
        trail: The Optuna trial used to sample the hyperparameters.

    Returns:
        Micro-averaged F1 of the predictions on the validation loader.
    """
    # Embedding
    embeddings_length = trail.suggest_categorical('embeddings_length', [32, 64])
    # General model: log-scale search over the learning rate.
    # `suggest_float(..., log=True)` replaces the deprecated
    # `suggest_loguniform` and keeps the same range and parameter name.
    lr = trail.suggest_float('lr', 1e-6, 1e-3, log=True)
    # LDA
    n_topics = trail.suggest_int('n_topics', 50, 100)
    alpha = trail.suggest_float('alpha', 1/50, 10)

    corpus = create_titles_corpus(train_dataloader)
    # One output unit per known label; the sampled length is passed as max_length.
    model = tBERT(
        corpus,
        model_name='bert-base-uncased',
        num_labels=len(label_to_idx),
        max_length=embeddings_length,
        n_topics=n_topics,
        alpha=alpha,
        device=DEVICE,
    )
    optimizer = AdamW(model.parameters(), lr=lr)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        train_dataloader=train_dataloader,
        num_epochs=1,
        device=DEVICE,
    )
    trainer.train()

    labels, preds = trainer.evaluate(val_dataloader)
    return f1_score(labels, preds, average='micro')

# Make sure the output directory exists before the (slow) search starts, so a
# finished study cannot end in a FileNotFoundError when the results are saved.
os.makedirs('results', exist_ok=True)

# Search for the tBERT hyperparameters that maximise the validation F1.
study = optuna.create_study(direction='maximize')
study.optimize(tbert_objective, n_trials=N_TRAILS)

# Persist the best trial's parameters together with its score.
trail_ = study.best_trial
with open('results/tBERT_best_params.json', 'w') as f:
    d = dict(trail_.params)
    d['f1'] = trail_.values[0]
    json.dump(d, f, indent=4)
print(f'BEST TRAIL:\n f1:  {trail_.values}\nparams: {trail_.params}')

[32m[I 2022-08-06 16:58:21,023][0m A new study created in memory with name: no-name-92186d83-eabf-4380-aa72-d595a9966cb1[0m
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/725 [00:00<?, ?it/s]

  0%|          | 0/242 [00:00<?, ?it/s]

[32m[I 2022-08-06 17:02:17,354][0m Trial 0 finished with value: 0.2909984480082773 and parameters: {'embeddings_length': 64, 'lr': 5.639469575148213e-05, 'n_topics': 69, 'alpha': 8.65564170855882}. Best is trial 0 with value: 0.2909984480082773.[0m


BEST TRAIL:
 f1:  [0.2909984480082773]
params: {'embeddings_length': 64, 'lr': 5.639469575148213e-05, 'n_topics': 69, 'alpha': 8.65564170855882}
