In [1]:
import json
import random
import warnings
from datetime import datetime

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.optimizers import Adagrad
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, f1_score
from sklearn.utils import class_weight
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification

ModuleNotFoundError: No module named 'keras'

In [2]:
# Locations of the three dataset splits (JSON files).
train_path = 'Data/train.json'
dev_path = 'Data/validation.json'
test_path = 'Data/test.json'
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing on the first `.to(device)` call.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Decode explicitly as UTF-8 rather than relying on the platform's default
# encoding, which can mis-decode non-ASCII text on some systems.
with open(train_path, 'r', encoding='utf-8') as f:
    train = json.load(f)
with open(dev_path, 'r', encoding='utf-8') as f:
    dev = json.load(f)
with open(test_path, 'r', encoding='utf-8') as f:
    test = json.load(f)

In [3]:
import tensorflow as tf
from tensorflow import kera

ModuleNotFoundError: No module named 'tensorflow'

In [3]:
# Identifier of the pretrained checkpoint; used for both the tokenizer here
# and the classifier built in model_init().
model_name = "dbmdz/bert-base-turkish-128k-uncased"
# Alternative checkpoints, kept commented out; swap one in for the line above to try it:
# model_name = "bert-base-multilingual-cased"
# model_name = "xlm-roberta-base"

# Tokenizer that matches the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def model_init():
    """Create a fresh 3-label sequence classifier and place it on ``device``.

    The model is loaded from the checkpoint named by the module-level
    ``model_name``. The function takes no arguments so it can be handed to the
    trainer as its ``model_init`` factory.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=3,
    )
    return classifier.to(device)

In [4]:
from tqdm import tqdm

In [4]:
# Label name -> integer class id used as the training target.
mapping = {
    'negative': 0,
    'neutral': 1,
    'positive': 2,
}

def to_id(text):
    """Encode ``text`` with the module-level tokenizer and return the ids as a tensor."""
    token_ids = tokenizer.encode(text)
    return torch.tensor(token_ids)

def feature_extraction(data, max_seq_length=50):
    """Tokenize a dataset split and convert its labels to class ids.

    Args:
        data: Iterable of records, each providing a "sentence" entry (the text)
            and a "value" entry (a label name that is a key of ``mapping``).
        max_seq_length: Length every sequence is padded or truncated to.

    Returns:
        A ``(encodings, labels)`` pair: the tokenizer output for all sentences
        and the list of integer class ids in the same order.

    Raises:
        KeyError: If a record lacks "sentence"/"value" or its label is not in
            ``mapping``.
    """
    texts = [record["sentence"] for record in data]
    label_ids = [mapping[record["value"]] for record in data]

    encodings = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=max_seq_length,
    )
    return encodings, label_ids

In [5]:
class BOUNDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    ``encodings`` is a mapping from field name to a per-example sequence of
    values; ``labels`` is a sequence with one label per example. Each item is a
    dict of tensors holding every encoding field plus a ``'labels'`` entry.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label list.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # Added last so the label always follows the encoding fields.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [6]:
# Integer class ids; these are the same values `mapping` assigns to the label names.
categories = [0, 1, 2]

# Tokenize each split and wrap it in a BOUNDataset.
x_train, y_train = feature_extraction(train)
train_set = BOUNDataset(x_train, y_train)
# NOTE: `x` and `y` are scratch names rebound for the dev and then the test
# split; after this cell they hold the test split's encodings and labels.
x, y = feature_extraction(dev)
dev_set = BOUNDataset(x, y)
x, y = feature_extraction(test)
test_set = BOUNDataset(x, y)

In [7]:
from sklearn.utils.class_weight import compute_class_weight
# Per-class loss weights computed from the training labels in "balanced" mode,
# used to weight the cross-entropy loss during fine-tuning.
# `classes` is passed as a NumPy array because recent scikit-learn releases
# validate the argument type and reject a plain Python list.
weights = torch.tensor(
    compute_class_weight(
        class_weight="balanced",
        classes=np.array([0, 1, 2]),
        y=y_train,
    ),
    dtype=torch.float,  # same float32 dtype torch.Tensor(...) produced
).to(device)

In [8]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for an evaluation run.

    Args:
        pred: Prediction object exposing ``label_ids`` (gold class ids) and
            ``predictions`` (per-class scores; the argmax over the last axis
            is taken as the predicted class).

    Returns:
        Dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The fourth element of the result (support) is not needed here.
    macro_scores = precision_recall_fscore_support(y_true, y_pred, average='macro')
    precision, recall, f1 = macro_scores[:3]
    accuracy = accuracy_score(y_true, y_pred)

    return dict(accuracy=accuracy, f1=f1, precision=precision, recall=recall)


from transformers import Trainer, TrainingArguments

class MultilabelTrainer(Trainer):
    """Trainer whose loss is class-weighted cross-entropy.

    Despite the class name, the loss is ordinary single-label cross-entropy
    over ``num_labels`` classes, weighted by the module-level ``weights``
    tensor.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: The model to run the forward pass with.
            inputs: Batch dict; its "labels" entry is removed and the rest is
                passed to the model as keyword arguments.
            return_outputs: If true, return ``(loss, outputs)`` instead of
                just the loss.
            **kwargs: Accepted and ignored, so that Trainer versions which pass
                extra keyword arguments (e.g. ``num_items_in_batch``) do not
                fail with a TypeError.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Keep the class weights on the same device as the logits; this is a
        # no-op when they already match.
        loss_fct = torch.nn.CrossEntropyLoss(weight=weights.to(logits.device))
        # Flatten both sides so logits are (N, num_labels) and labels are (N,).
        loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                        labels.view(-1))
        return (loss, outputs) if return_outputs else loss


# Fine-tuning configuration: 10 epochs, evaluation after every epoch, and the
# checkpoint with the best "recall" (as reported by compute_metrics) reloaded
# when training finishes.
training_args = TrainingArguments(
    output_dir='transformer_results',
    num_train_epochs=10,
    per_device_train_batch_size=24,
    gradient_accumulation_steps=1,
    per_device_eval_batch_size=36,
    warmup_steps=300,
    weight_decay=0.1,
    learning_rate=1e-5,
    logging_dir='transformer_logs',
    logging_steps=5,
    load_best_model_at_end=True,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    # Checkpoint on the same schedule as evaluation. load_best_model_at_end
    # needs a saved checkpoint for every evaluated state, and newer
    # transformers releases raise an error when the two strategies differ.
    save_strategy="epoch",
    metric_for_best_model="recall",
    do_eval=True,
    save_total_limit=1)

# model_init (rather than a model instance) lets the trainer build the model itself.
trainer = MultilabelTrainer(
    args=training_args,
    train_dataset=train_set,
    eval_dataset=dev_set,
    compute_metrics=compute_metrics,
    model_init=model_init
)

Some weights of the model checkpoint at dbmdz/bert-base-turkish-128k-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassificatio

In [None]:
# Fine-tune the model with the configuration held by `trainer`.
trainer.train() 
# To use an already fine-tuned model instead of training, run the line below in place of trainer.train():
# trainer.model = trainer.model.from_pretrained("berturk").to(device) 

In [10]:
# Report loss and the compute_metrics scores on the validation split.
trainer.evaluate(dev_set)

{'eval_loss': 0.8129351735115051,
 'eval_accuracy': 0.7449139280125195,
 'eval_f1': 0.7148205862964074,
 'eval_precision': 0.705887006955344,
 'eval_recall': 0.7303927466978029,
 'eval_runtime': 1.3912,
 'eval_samples_per_second': 459.327,
 'init_mem_cpu_alloc_delta': 1069550,
 'init_mem_gpu_alloc_delta': 738968064,
 'init_mem_cpu_peaked_delta': 221957,
 'init_mem_gpu_peaked_delta': 0,
 'eval_mem_cpu_alloc_delta': 262459,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_cpu_peaked_delta': 18230,
 'eval_mem_gpu_peaked_delta': 62795264}

In [11]:
# Report loss and the compute_metrics scores on the held-out test split.
trainer.evaluate(test_set)

{'eval_loss': 0.737638533115387,
 'eval_accuracy': 0.7229899497487438,
 'eval_f1': 0.700871659759735,
 'eval_precision': 0.6919122642064153,
 'eval_recall': 0.7283327473798605,
 'eval_runtime': 3.3969,
 'eval_samples_per_second': 468.667,
 'eval_mem_cpu_alloc_delta': 83872,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_cpu_peaked_delta': 51420,
 'eval_mem_gpu_peaked_delta': 62819328}