In [2]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import transformers
%load_ext autoreload
%autoreload 2

transformers.logging.set_verbosity_error()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
from utils.seed import set_seed
set_seed(17)

# Load the data

In [4]:
from utils.data_processing import prepre_data_to_model
from utils.data_processing import clean_text
train_base = prepre_data_to_model('data/semeval8/full_meta_en_train.csv', 'data/semeval8/en_train.csv', cutoff=3, col_text_to_use='title')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return_df['labels'][df['Overall'] >= cutoff] = 1


In [5]:
from nltk.corpus import stopwords
from cleantext import clean

def clean_text(s):
    s = s.lower()
    s = clean(text=s)
    s = " ".join([word for word in s.split(' ') if word not in stopwords.words('english')])
    return s

In [6]:
for col in ['text1', 'text2']:
    train_base.loc[:,col] = train_base[col].apply(clean_text)

In [7]:
from sklearn.model_selection import train_test_split
train, val = train_test_split(train_base, test_size=0.2, shuffle=True, random_state=11)
train.shape, val.shape

((1052, 3), (263, 3))

In [8]:
# convert the data to SentencesDataset

from datasets import SentencesDataset

train_ds = SentencesDataset(train.text1.tolist(), train.text1.tolist(), train.labels.tolist())
val_ds = SentencesDataset(val.text1.tolist(), val.text1.tolist(), val.labels.tolist())


# Train

In [9]:
# import torch
# from models import tBERT
# from transformers import AdamW
#
# # PARAMETERS
#
# # train parameters
# batch_size = 16
# num_epochs = 1
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
#
# # lda parameters
# n_topics = 70
#
# # embedding parameters
# max_length = 256
#
# #optimizer parameters
# lr = 5e-5

In [10]:
train_dataloader = DataLoader(train_ds, shuffle=True, batch_size=16)
val_dataloader = DataLoader(val_ds, shuffle=True, batch_size=16)

In [11]:
# from utils.data_processing import create_sentences_corpus
#
# corpus = create_sentences_corpus(train_dataloader)
# model = tBERT(corpus, n_topics=n_topics, max_length=max_length, device=device)
#
# optimizer = AdamW(model.parameters(), lr=lr)

In [12]:
# from trainer import Trainer
# trainer = Trainer(model=model, optimizer=optimizer, train_dataloader=train_dataloader, num_epochs=num_epochs, device=device)

In [13]:
# trainer.train()

In [14]:
# f1_train = trainer.evaluate(train_dataloader)
# print(f'{f1_train=}')

In [29]:
from utils.data_processing import create_sentences_corpus
import torch
from models import tBERT
from transformers import AdamW
from trainer import Trainer
import optuna
import json
import os

DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
NUM_EPOCHS = 6
N_TRAILS = 40

def tbert_objective(trail):
    # PARAMS SETTING
    # Embedding
    embeddings_length = trail.suggest_categorical('embeddings_length', [256, 512])
    # General Model
    lr = trail.suggest_loguniform('lr', 1e-6, 1e-3)
    # LDA
    n_topics = trail.suggest_int('n_topics', 50, 500)
    alpha = trail.suggest_float('alpha', 1/50, 10)


    corpus = create_sentences_corpus(train_dataloader)
    model = tBERT(corpus, model_name='bert-base-uncased',num_labels=2,
             max_length=embeddings_length, n_topics=n_topics, alpha=alpha, device=DEVICE)

    optimizer = AdamW(model.parameters(), lr=lr)

    trainer = Trainer(model=model, optimizer=optimizer, train_dataloader=train_dataloader, num_epochs=NUM_EPOCHS, device=DEVICE)

    trainer.train()
    f1 = trainer.evaluate(val_dataloader)
    return f1

if not os.path.isfile('results/tBERT_best_params.json'):

    study = optuna.create_study(direction='maximize')
    study.optimize(tbert_objective, n_trials=N_TRAILS)

    trail_ = study.best_trial
    with open('results/tBERT_best_params.json', 'w') as f:
        d = dict(trail_.params)
        d['f1'] = trail_.values[0]
        json.dump(d, f, indent=4)
    print(f'BEST TRAIL:\n f1:  {trail_.values}\nparams: {trail_.params}')

[32m[I 2022-08-13 22:33:01,224][0m A new study created in memory with name: no-name-ff7fac77-5bca-4b48-85be-1066cd38a79d[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-13 22:53:04,521][0m Trial 0 finished with value: 0.6941176470588235 and parameters: {'embeddings_length': 256, 'lr': 0.0003004260828196086, 'n_topics': 409, 'alpha': 9.713753150857668}. Best is trial 0 with value: 0.6941176470588235.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-13 23:11:11,881][0m Trial 1 finished with value: 0.7241379310344827 and parameters: {'embeddings_length': 512, 'lr': 0.00014058806623887585, 'n_topics': 136, 'alpha': 1.4963001814449983}. Best is trial 1 with value: 0.7241379310344827.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-13 23:29:20,294][0m Trial 2 finished with value: 0.7217630853994491 and parameters: {'embeddings_length': 512, 'lr': 8.377509659876646e-05, 'n_topics': 78, 'alpha': 8.513895211174978}. Best is trial 1 with value: 0.7241379310344827.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-13 23:47:33,192][0m Trial 3 finished with value: 0.7217630853994491 and parameters: {'embeddings_length': 256, 'lr': 3.440615632568805e-05, 'n_topics': 90, 'alpha': 7.225009742666275}. Best is trial 1 with value: 0.7241379310344827.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 00:05:48,374][0m Trial 4 finished with value: 0.7272727272727273 and parameters: {'embeddings_length': 512, 'lr': 0.00010731976640401343, 'n_topics': 62, 'alpha': 7.561834261404589}. Best is trial 4 with value: 0.7272727272727273.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 00:24:51,773][0m Trial 5 finished with value: 0.72 and parameters: {'embeddings_length': 256, 'lr': 0.000647617858598015, 'n_topics': 454, 'alpha': 3.0721567702868566}. Best is trial 4 with value: 0.7272727272727273.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 00:43:34,232][0m Trial 6 finished with value: 0.7282913165266107 and parameters: {'embeddings_length': 256, 'lr': 0.00010524774237092511, 'n_topics': 181, 'alpha': 0.8352545807331763}. Best is trial 6 with value: 0.7282913165266107.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 01:02:47,470][0m Trial 7 finished with value: 0.7598944591029024 and parameters: {'embeddings_length': 256, 'lr': 3.3850264258274384e-05, 'n_topics': 341, 'alpha': 5.643702104432463}. Best is trial 7 with value: 0.7598944591029024.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 01:22:17,675][0m Trial 8 finished with value: 0.7293447293447294 and parameters: {'embeddings_length': 256, 'lr': 0.0002502924734945777, 'n_topics': 402, 'alpha': 2.7236675191941564}. Best is trial 7 with value: 0.7598944591029024.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 01:41:50,308][0m Trial 9 finished with value: 0.72 and parameters: {'embeddings_length': 512, 'lr': 4.907345347328649e-05, 'n_topics': 367, 'alpha': 7.231033150368462}. Best is trial 7 with value: 0.7598944591029024.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 02:01:12,790][0m Trial 10 finished with value: 0.7535545023696683 and parameters: {'embeddings_length': 256, 'lr': 4.2982259862118385e-06, 'n_topics': 252, 'alpha': 4.942100334458563}. Best is trial 7 with value: 0.7598944591029024.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 02:20:39,386][0m Trial 11 finished with value: 0.7535545023696683 and parameters: {'embeddings_length': 256, 'lr': 4.430517901105331e-06, 'n_topics': 275, 'alpha': 4.966109805611651}. Best is trial 7 with value: 0.7598944591029024.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 02:40:17,717][0m Trial 12 finished with value: 0.7535545023696683 and parameters: {'embeddings_length': 256, 'lr': 7.533937351970317e-06, 'n_topics': 275, 'alpha': 4.795168247606659}. Best is trial 7 with value: 0.7598944591029024.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 03:00:02,377][0m Trial 13 finished with value: 0.7535545023696683 and parameters: {'embeddings_length': 256, 'lr': 1.1522994494886272e-06, 'n_topics': 327, 'alpha': 5.78849806293999}. Best is trial 7 with value: 0.7598944591029024.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 03:20:07,971][0m Trial 14 finished with value: 0.7535545023696683 and parameters: {'embeddings_length': 256, 'lr': 1.0106534859870865e-05, 'n_topics': 230, 'alpha': 3.8690785921430457}. Best is trial 7 with value: 0.7598944591029024.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 03:39:53,177][0m Trial 15 finished with value: 0.7553444180522565 and parameters: {'embeddings_length': 256, 'lr': 1.7415901169778125e-05, 'n_topics': 324, 'alpha': 5.836912696169797}. Best is trial 7 with value: 0.7598944591029024.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 03:59:27,555][0m Trial 16 finished with value: 0.7571428571428571 and parameters: {'embeddings_length': 256, 'lr': 1.7438865010717146e-05, 'n_topics': 320, 'alpha': 6.603920974933399}. Best is trial 7 with value: 0.7598944591029024.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 04:19:24,936][0m Trial 17 finished with value: 0.7535545023696683 and parameters: {'embeddings_length': 256, 'lr': 1.5767774498672193e-05, 'n_topics': 495, 'alpha': 6.327604160723418}. Best is trial 7 with value: 0.7598944591029024.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 04:39:19,971][0m Trial 18 finished with value: 0.7535545023696683 and parameters: {'embeddings_length': 512, 'lr': 2.132443228695416e-06, 'n_topics': 323, 'alpha': 9.17651655827257}. Best is trial 7 with value: 0.7598944591029024.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 04:59:01,233][0m Trial 19 finished with value: 0.7658536585365855 and parameters: {'embeddings_length': 256, 'lr': 2.319098338283166e-05, 'n_topics': 375, 'alpha': 8.205941049502012}. Best is trial 19 with value: 0.7658536585365855.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 05:18:51,829][0m Trial 20 finished with value: 0.7466666666666667 and parameters: {'embeddings_length': 256, 'lr': 3.0312248305507033e-05, 'n_topics': 386, 'alpha': 8.500540073167956}. Best is trial 19 with value: 0.7658536585365855.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 05:38:36,844][0m Trial 21 finished with value: 0.7607655502392345 and parameters: {'embeddings_length': 256, 'lr': 2.1329400796184363e-05, 'n_topics': 348, 'alpha': 6.362344710761342}. Best is trial 19 with value: 0.7658536585365855.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 05:58:24,454][0m Trial 22 finished with value: 0.7272727272727273 and parameters: {'embeddings_length': 256, 'lr': 4.587881427129855e-05, 'n_topics': 434, 'alpha': 7.960260188096836}. Best is trial 19 with value: 0.7658536585365855.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 06:18:09,841][0m Trial 23 finished with value: 0.7570332480818415 and parameters: {'embeddings_length': 256, 'lr': 2.636726551025687e-05, 'n_topics': 346, 'alpha': 6.498028157808785}. Best is trial 19 with value: 0.7658536585365855.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 06:37:41,063][0m Trial 24 finished with value: 0.7535545023696683 and parameters: {'embeddings_length': 256, 'lr': 7.603531590513733e-06, 'n_topics': 223, 'alpha': 5.644169381060152}. Best is trial 19 with value: 0.7658536585365855.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 06:57:31,506][0m Trial 25 finished with value: 0.7130919220055711 and parameters: {'embeddings_length': 256, 'lr': 6.108433065536728e-05, 'n_topics': 462, 'alpha': 4.013405801160857}. Best is trial 19 with value: 0.7658536585365855.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 07:17:10,657][0m Trial 26 finished with value: 0.7535545023696683 and parameters: {'embeddings_length': 256, 'lr': 1.2279730255533144e-05, 'n_topics': 365, 'alpha': 9.95276348598766}. Best is trial 19 with value: 0.7658536585365855.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 07:36:46,914][0m Trial 27 finished with value: 0.7566265060240964 and parameters: {'embeddings_length': 512, 'lr': 2.193762317085549e-05, 'n_topics': 305, 'alpha': 8.309536705908585}. Best is trial 19 with value: 0.7658536585365855.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 07:56:32,096][0m Trial 28 finished with value: 0.7535545023696683 and parameters: {'embeddings_length': 256, 'lr': 4.31352958370428e-06, 'n_topics': 425, 'alpha': 9.023970061320359}. Best is trial 19 with value: 0.7658536585365855.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 08:16:08,737][0m Trial 29 finished with value: 0.6975308641975309 and parameters: {'embeddings_length': 256, 'lr': 0.000281077438013284, 'n_topics': 388, 'alpha': 7.224365281209604}. Best is trial 19 with value: 0.7658536585365855.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 08:29:02,199][0m Trial 30 finished with value: 0.6994219653179192 and parameters: {'embeddings_length': 256, 'lr': 0.00017767267407820328, 'n_topics': 291, 'alpha': 4.155442526908293}. Best is trial 19 with value: 0.7658536585365855.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 08:38:17,401][0m Trial 31 finished with value: 0.7405405405405405 and parameters: {'embeddings_length': 256, 'lr': 3.5623608256136296e-05, 'n_topics': 356, 'alpha': 6.481842246677114}. Best is trial 19 with value: 0.7658536585365855.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 08:47:31,964][0m Trial 32 finished with value: 0.7031700288184439 and parameters: {'embeddings_length': 256, 'lr': 6.316214755336677e-05, 'n_topics': 303, 'alpha': 6.591891990086856}. Best is trial 19 with value: 0.7658536585365855.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 08:56:36,000][0m Trial 33 finished with value: 0.7571428571428571 and parameters: {'embeddings_length': 256, 'lr': 1.8017003966753118e-05, 'n_topics': 348, 'alpha': 5.529975266978168}. Best is trial 19 with value: 0.7658536585365855.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 09:04:55,930][0m Trial 34 finished with value: 0.7535545023696683 and parameters: {'embeddings_length': 512, 'lr': 1.0552701059492873e-05, 'n_topics': 410, 'alpha': 7.883327062917928}. Best is trial 19 with value: 0.7658536585365855.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 09:13:16,526][0m Trial 35 finished with value: 0.7535545023696683 and parameters: {'embeddings_length': 256, 'lr': 6.678915562059049e-06, 'n_topics': 377, 'alpha': 5.5414730413171265}. Best is trial 19 with value: 0.7658536585365855.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 09:21:45,817][0m Trial 36 finished with value: 0.7329545454545454 and parameters: {'embeddings_length': 512, 'lr': 7.835729181351143e-05, 'n_topics': 340, 'alpha': 3.176237413449998}. Best is trial 19 with value: 0.7658536585365855.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 09:29:58,985][0m Trial 37 finished with value: 0.7472527472527474 and parameters: {'embeddings_length': 256, 'lr': 3.810318619882214e-05, 'n_topics': 173, 'alpha': 7.060298350195058}. Best is trial 19 with value: 0.7658536585365855.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 09:38:37,318][0m Trial 38 finished with value: 0.69164265129683 and parameters: {'embeddings_length': 256, 'lr': 0.0005526121751027013, 'n_topics': 248, 'alpha': 8.957435898579504}. Best is trial 19 with value: 0.7658536585365855.[0m


  0%|          | 0/396 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

[32m[I 2022-08-14 09:47:31,610][0m Trial 39 finished with value: 0.7596153846153847 and parameters: {'embeddings_length': 256, 'lr': 2.2693735743071594e-05, 'n_topics': 459, 'alpha': 2.1773580349180275}. Best is trial 19 with value: 0.7658536585365855.[0m


BEST TRAIL:
 f1:  [0.7658536585365855]
params: {'embeddings_length': 256, 'lr': 2.319098338283166e-05, 'n_topics': 375, 'alpha': 8.205941049502012}


In [22]:

    # train tBERT
corpus = create_sentences_corpus(train_dataloader)
model = tBERT(corpus, model_name='bert-base-uncased', num_labels=2,
             max_length=256, n_topics=100, alpha=5, device=DEVICE)
optimizer = AdamW(model.parameters(), lr=0.00005)
trainer = Trainer(model=model, optimizer=optimizer, train_dataloader=train_dataloader, num_epochs=1, device=DEVICE)

trainer.train()

labels, preds = trainer.evaluate(val_dataloader)


  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

TypeError: cannot unpack non-iterable numpy.float64 object

In [23]:
labels, preds = trainer.evaluate(val_dataloader)


  0%|          | 0/17 [00:00<?, ?it/s]

TypeError: cannot unpack non-iterable numpy.float64 object