In [84]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import transformers
%load_ext autoreload
%autoreload 2

# Restrict the transformers library's logging to error-level messages.
transformers.logging.set_verbosity_error()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [85]:
from utils.seed import set_seed
# Fix the random seed through the project helper.
# NOTE(review): which libraries set_seed actually seeds is defined in
# utils.seed and is not visible from this notebook — confirm it covers torch/numpy.
set_seed(17)

# Load the data

In [86]:
from utils.data_processing import prepre_data_to_model
from utils.data_processing import clean_text
# Settings forwarded to prepre_data_to_model for both splits.
CUTOFF = 3
COL_TEXT_TO_USE = 'title'

# Training split (later divided into train / validation).
train_base = prepre_data_to_model(
    'data/semeval8/full_meta_en_train.csv',
    'data/semeval8/en_train.csv',
    cutoff=CUTOFF,
    col_text_to_use=COL_TEXT_TO_USE,
)

# Evaluation split, built with the same settings.
test = prepre_data_to_model(
    'data/semeval8/evaluate_metadata.csv',
    'data/semeval8/evaluate_dataset.csv',
    cutoff=CUTOFF,
    col_text_to_use=COL_TEXT_TO_USE,
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return_df['labels'][df['Overall'] >= cutoff] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return_df['labels'][df['Overall'] >= cutoff] = 1


In [87]:
from nltk.corpus import stopwords
from cleantext import clean

from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def clean_text(s):
    """Lower-case ``s``, pass it through ``cleantext.clean`` and drop English stop words.

    Parameters
    ----------
    s : object
        Value to clean. Anything that is not a non-empty ``str`` (``None``,
        ``NaN``, numbers, ``''`` ...) is treated as missing.

    Returns
    -------
    str
        The surviving tokens joined with spaces, or ``''`` for missing input.
    """
    if not (s and isinstance(s, str)):
        return ''

    cleaned = clean(text=s.lower())
    # The stop-word list is fetched once and cached as a set; previously it was
    # re-read for every single word and searched linearly.
    stop_words = _english_stopwords()
    # Split on a literal space (not arbitrary whitespace) to keep the original tokenisation.
    return " ".join(word for word in cleaned.split(' ') if word not in stop_words)

In [88]:
# Run clean_text over both text columns of the test frame.
# NOTE(review): the equivalent step for train_base was commented out, so only
# the test split is cleaned here — confirm that this asymmetry is intended.
for col in ('text1', 'text2'):
    test.loc[:, col] = test[col].apply(clean_text)

In [89]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the training data for validation, with a fixed split seed.
train, val = train_test_split(
    train_base,
    test_size=0.2,
    shuffle=True,
    random_state=11,
)
# Displayed as the cell output: shapes of the three splits.
train.shape, val.shape, test.shape

((1052, 3), (263, 3), (220, 3))

In [90]:
# convert the data to SentencesDataset

from datasets import SentencesDataset

# Each example is built from the text1 and text2 columns plus the label.
# The second argument has to be text2: passing text1 twice (as before) paired
# every text with itself, so the model never saw the second text of a pair.
# NOTE(review): assumes SentencesDataset takes (first texts, second texts, labels)
# in that order — confirm against the project's datasets module.
train_ds = SentencesDataset(train.text1.tolist(), train.text2.tolist(), train.labels.tolist())
val_ds = SentencesDataset(val.text1.tolist(), val.text2.tolist(), val.labels.tolist())
test_ds = SentencesDataset(test.text1.tolist(), test.text2.tolist(), test.labels.tolist())

# Train

In [91]:
# import torch
# from models import tBERT
# from transformers import AdamW
#
# # PARAMETERS
#
# # train parameters
# batch_size = 16
# num_epochs = 1
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
#
# # lda parameters
# n_topics = 70
#
# # embedding parameters
# max_length = 256
#
# #optimizer parameters
# lr = 5e-5

In [92]:
# Mini-batch loaders over the training and validation datasets (16 examples
# per batch, reshuffled on every pass).
train_dataloader = DataLoader(train_ds, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_ds, batch_size=16, shuffle=True)

In [93]:
# from utils.data_processing import create_sentences_corpus
#
# corpus = create_sentences_corpus(train_dataloader)
# model = tBERT(corpus, n_topics=n_topics, max_length=max_length, device=device)
#
# optimizer = AdamW(model.parameters(), lr=lr)

In [94]:
# from trainer import Trainer
# trainer = Trainer(model=model, optimizer=optimizer, train_dataloader=train_dataloader, num_epochs=num_epochs, device=device)

In [95]:
# trainer.train()

In [96]:
# f1_train = trainer.evaluate(train_dataloader)
# print(f'{f1_train=}')

In [97]:
from utils.data_processing import create_sentences_corpus
import torch
from models import tBERT
from transformers import AdamW
from trainer import Trainer
import optuna
import json
import os

# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Epochs each trial trains for, and the number of Optuna trials to run.
NUM_EPOCHS = 6
N_TRAILS = 40

def tbert_objective(trail):
    """Optuna objective: train tBERT with sampled hyper-parameters and score it.

    Parameters
    ----------
    trail : optuna.trial.Trial
        Trial used to sample ``embeddings_length``, ``lr``, ``n_topics`` and
        ``alpha``.

    Returns
    -------
    The value of ``trainer.evaluate(val_dataloader)``, which the calling study
    maximises and stores under the key ``f1``.
    """
    # --- Hyper-parameter sampling ---
    # Embedding
    embeddings_length = trail.suggest_categorical('embeddings_length', [256, 512])
    # General model: learning rate sampled on a log scale.
    # suggest_float(..., log=True) is the non-deprecated spelling of suggest_loguniform.
    lr = trail.suggest_float('lr', 1e-6, 1e-3, log=True)
    # LDA
    n_topics = trail.suggest_int('n_topics', 50, 500)
    alpha = trail.suggest_float('alpha', 1/50, 10)

    # --- Model construction ---
    corpus = create_sentences_corpus(train_dataloader)
    model = tBERT(corpus, model_name='bert-base-uncased', num_labels=2,
                  max_length=embeddings_length, n_topics=n_topics, alpha=alpha, device=DEVICE)

    optimizer = AdamW(model.parameters(), lr=lr)

    # NUM_EPOCHS is the module-level constant, looked up when the trial runs.
    trainer = Trainer(model=model, optimizer=optimizer, train_dataloader=train_dataloader,
                      num_epochs=NUM_EPOCHS, device=DEVICE)

    # --- Train, then score on the validation loader ---
    trainer.train()
    f1 = trainer.evaluate(val_dataloader)
    return f1

best_params_path = 'results/tBERT_best_params.json'

# The search is skipped entirely when a previous run already saved its result.
if not os.path.isfile(best_params_path):
    # Create the output directory before the search starts: a missing folder
    # would otherwise only surface as a FileNotFoundError after every trial
    # has finished, losing the whole run.
    os.makedirs(os.path.dirname(best_params_path), exist_ok=True)

    study = optuna.create_study(direction='maximize')
    study.optimize(tbert_objective, n_trials=N_TRAILS)

    trail_ = study.best_trial
    # Build the payload before opening the file so a failure here cannot leave
    # an empty JSON file behind (which would make later runs skip the search).
    d = dict(trail_.params)
    d['f1'] = trail_.values[0]
    with open(best_params_path, 'w') as f:
        json.dump(d, f, indent=4)
    print(f'BEST TRAIL:\n f1:  {trail_.values}\nparams: {trail_.params}')

In [98]:
# Rebinds the module-level NUM_EPOCHS (set to 6 in the search cell) for the
# final training run. tbert_objective looks this name up when it is called, so
# re-running the search after this cell would also train for 5 epochs.
NUM_EPOCHS = 5

In [99]:
# Mini-batch loader over the test dataset (16 examples per batch, shuffled).
test_dataloader = DataLoader(test_ds, batch_size=16, shuffle=True)

In [None]:
# Load the hyper-parameters saved by the Optuna search.
with open('results/tBERT_best_params.json', 'r') as f:
    params = json.load(f)

# Build tBERT from those hyper-parameters and train it on the training loader.
corpus = create_sentences_corpus(train_dataloader)
model = tBERT(
    corpus,
    model_name='bert-base-uncased',
    num_labels=2,
    max_length=params['embeddings_length'],
    n_topics=params['n_topics'],
    alpha=params['alpha'],
    device=DEVICE,
)
optimizer = AdamW(model.parameters(), lr=params['lr'])
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    num_epochs=NUM_EPOCHS,
    device=DEVICE,
)

trainer.train()

# Score on the test loader; the bare name displays the result as cell output.
f1 = trainer.evaluate(test_dataloader)
f1

  0%|          | 0/330 [00:00<?, ?it/s]

In [None]:

    # train tBERT
# Quick single-epoch run with hand-picked hyper-parameters.
corpus = create_sentences_corpus(train_dataloader)
model = tBERT(
    corpus,
    model_name='bert-base-uncased',
    num_labels=2,
    max_length=256,
    n_topics=100,
    alpha=5,
    device=DEVICE,
)
optimizer = AdamW(model.parameters(), lr=5e-5)
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    num_epochs=1,
    device=DEVICE,
)

trainer.train()

# NOTE(review): evaluate() is unpacked as (labels, preds) here, while the
# search objective and the best-params cell use its result as a single f1
# value — confirm which return shape Trainer.evaluate currently has.
labels, preds = trainer.evaluate(val_dataloader)


In [None]:
# Re-evaluates the current trainer on the validation loader (same call as the
# last line of the previous cell). val_dataloader is created with shuffle=True,
# so the order of labels/preds can differ between calls.
labels, preds = trainer.evaluate(val_dataloader)
