In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import torch
from sklearn.metrics import accuracy_score, f1_score
from transformers import Trainer, TrainingArguments
import datetime


#### Define alguns parâmetros para ambos os classificadores

In [2]:
# Hugging Face checkpoint id used as the starting point for both classifiers
# (it is passed to the tokenizer and model loaders in the CSTR and RE8 sections).
# NOTE(review): the tokenized sample saved further down is entirely lower-case;
# confirm that a *cased* checkpoint is the intended choice for this data.
model_base = "google-bert/bert-base-cased"
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


#### Balanceamento das classes no dataset de treino

In [7]:
def balance_df(X, y, random_state=None):
    """Balance a labelled dataset by randomly oversampling the minority classes.

    Every class other than the most frequent one is topped up, by sampling its
    own rows with replacement, until it has as many rows as the most frequent
    class. The original rows are kept first and the sampled rows are appended.

    Parameters
    ----------
    X : pandas.Series or pandas.DataFrame
        Features. Must share its index labels with ``y``; the labels are
        expected to be unique, because the sampled rows are looked up in ``X``
        by label.
    y : pandas.Series
        Class label of each row of ``X``. Its name is irrelevant.
    random_state : int, optional
        Seed forwarded to ``Series.sample`` so the oversampling is
        reproducible. The default (``None``) keeps the draw unseeded.

    Returns
    -------
    tuple
        ``(X_balanced, y_balanced)``: new objects aligned row by row. The
        inputs are not modified.
    """
    class_counts = y.value_counts()  # sorted by frequency, most frequent first
    if class_counts.empty:
        # Nothing to balance (empty input or only missing labels).
        return (X.copy(), y.copy())

    target_count = int(class_counts.iloc[0])
    X_parts = [X]
    y_parts = [y]
    for label, count in class_counts.iloc[1:].items():
        missing = target_count - int(count)
        if missing == 0:
            continue  # tied with the majority class: already balanced
        extra = y[y == label].sample(missing, replace=True, random_state=random_state)
        # Look the rows up in the *argument* X (not in a notebook global) so the
        # function works for any dataset it is given.
        X_parts.append(X.loc[extra.index])
        y_parts.append(extra)

    # Single concat at the end instead of growing the objects inside the loop.
    return (pd.concat(X_parts), pd.concat(y_parts))


# CSTR

In [3]:
# Load the CSTR corpus and split it into train / validation / test sets.
cstr = pd.read_csv('../dados/atv3/CSTR.csv')
# First split: hold out 30% of the rows (X_aux / y_aux); the remaining 70% is the training set.
# NOTE(review): no `stratify=` is passed, so class proportions may differ between the splits.
X_train, X_aux, y_train, y_aux = train_test_split(cstr['text'], cstr['class'], test_size=0.30, random_state=42)
# Second split: 66% of the hold-out becomes the test set, the rest the validation set.
X_val, X_test, y_val, y_test = train_test_split(X_aux, y_aux, test_size=0.66, random_state=42)

In [4]:
# Fit the label encoder on the full 'class' column (all splits) so each class name
# is mapped to one integer id.
cstr_lb = LabelEncoder().fit(cstr['class'])
# Number of distinct classes; passed as `num_labels` when the model is loaded.
num_labels = len(cstr_lb.classes_)

In [5]:
# Sanity check: shape of the full frame followed by the train / validation / test label splits.
print(*(part.shape for part in (cstr, y_train, y_val, y_test)), sep="\n")

(299, 3)
(209,)
(30,)
(60,)


In [7]:
# Oversample the minority classes of the training split only
# (the validation and test splits are left untouched).
X_train_bal, y_train_bal = balance_df(X_train, y_train)
# Last expression of the cell: per-class counts after balancing.
y_train_bal.value_counts()

class
Robotics                   85
ArtificiallIntelligence    85
Theory                     85
Systems                    85
Name: count, dtype: int64

In [8]:
# Load the tokenizer that belongs to the base checkpoint; it is used by
# `tokenize_function` and handed to the Trainer in the CSTR section.
tokenizer_cstr = AutoTokenizer.from_pretrained(model_base)

In [51]:
# Preview the word-piece tokens (special tokens included) of the first balanced training text.
# NOTE(review): the saved output of this cell is identical to the RE8 preview and the
# execution count is out of sequence, so it was probably produced while `X_train_bal`
# held RE8 data; re-run after a kernel restart to refresh it.
tokenizer_cstr.tokenize(X_train_bal.iloc[0], add_special_tokens=True)

['[CLS]',
 'un',
 '##iba',
 '##nco',
 '##rp',
 'in',
 '##c',
 'u',
 '##b',
 '##c',
 '##p',
 'regular',
 'divide',
 '##nd',
 'set',
 'q',
 '##tly',
 'di',
 '##v',
 'c',
 '##ts',
 'vs',
 'c',
 '##ts',
 'previously',
 'pay',
 'a',
 '##p',
 '##ril',
 'record',
 'march',
 're',
 '##uter',
 '[SEP]']

In [10]:
def tokenize_function(examples):
    """Encode `examples` with the CSTR tokenizer.

    The tokenizer is called with ``padding="max_length"`` and
    ``truncation=True``; whatever it returns is passed back unchanged.
    """
    encoding_options = {"padding": "max_length", "truncation": True}
    return tokenizer_cstr(examples, **encoding_options)

In [11]:
# Build the train / validation / test datasets as plain lists of feature dicts:
# the tokenizer output of each text plus its integer-encoded class under "label".
train_ds = [
    dict(tokenize_function(text)) | {"label": int(label_id)}
    for text, label_id in zip(X_train_bal, cstr_lb.transform(y_train_bal))
]
val_ds = [
    dict(tokenize_function(text)) | {"label": int(label_id)}
    for text, label_id in zip(X_val, cstr_lb.transform(y_val))
]
test_ds = [
    dict(tokenize_function(text)) | {"label": int(label_id)}
    for text, label_id in zip(X_test, cstr_lb.transform(y_test))
]

In [12]:
# Load the base checkpoint with a sequence-classification head sized to `num_labels`
# and move it to the selected device. The head weights are newly initialised
# (see the warning printed below), so the model must be fine-tuned before use.
model_cstr = AutoModelForSequenceClassification.from_pretrained(model_base, num_labels=num_labels, torch_dtype="auto").to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
# Run configuration for the CSTR fine-tuning.
batch_size = 16
# Number of full batches in the training set, i.e. log roughly once per epoch.
logging_step = len(train_ds) // batch_size
# Today's date (ISO format) prefixes the output directories.
model_ckpt = datetime.date.today().isoformat()
model_name = f"{model_ckpt}-finetune-cstr"
model_name_final = f"{model_ckpt}-bert-finetune-cstr-final"
def compute_metrics(pred):
    """Score a prediction object with accuracy and macro / micro F1.

    `pred` must expose `label_ids` (true class ids) and `predictions`
    (per-class scores); the predicted class is the argmax over the last axis.
    Returns a dict with the keys "accuracy", "f1_macro" and "f1_micro".
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_hat),
        "f1_macro": f1_score(y_true, y_hat, average="macro"),
        "f1_micro": f1_score(y_true, y_hat, average="micro"),
    }


In [28]:
# Training configuration for the CSTR classifier: 2 epochs, evaluation and a
# checkpoint at the end of every epoch, and the best checkpoint reloaded when
# training finishes.
# `eval_strategy` replaces the deprecated `evaluation_strategy` alias, which newer
# transformers releases no longer accept; this notebook already relies on a release
# that provides `processing_class`, so `eval_strategy` is available.
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=2,
                                  learning_rate=1e-4,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  eval_strategy="epoch",
                                  save_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_step,
                                  push_to_hub=False,
                                  log_level="error",
                                  load_best_model_at_end=True)



In [29]:
# Wire the CSTR model, its datasets, the metric function and the tokenizer into a Trainer.
# The validation set is the evaluation dataset; the test set is kept for the final check.
trainer = Trainer(model=model_cstr,
                  args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=train_ds,
                  eval_dataset=val_ds,
                  processing_class=tokenizer_cstr
                  )

In [30]:
# Fine-tune on the balanced CSTR training set; per `training_args`, evaluation on the
# validation set and a checkpoint happen at the end of every epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Micro
1,0.4719,0.680709,0.833333,0.814614,0.833333
2,0.1581,0.595413,0.833333,0.786457,0.833333


TrainOutput(global_step=86, training_loss=0.3106432681859926, metrics={'train_runtime': 111.598, 'train_samples_per_second': 6.093, 'train_steps_per_second': 0.771, 'total_flos': 178918730465280.0, 'train_loss': 0.3106432681859926, 'epoch': 2.0})

In [36]:
# Persist the fine-tuned CSTR model under the date-prefixed "final" directory name.
trainer.save_model(model_name_final)

In [38]:
# Final check on the held-out CSTR test set: the object returned by `trainer.predict`
# is scored with the same metric function that was used during training.
compute_metrics(trainer.predict(test_ds))

{'accuracy': 0.8333333333333334,
 'f1_macro': np.float64(0.8289159341897678),
 'f1_micro': np.float64(0.8333333333333334)}

# RE8

In [3]:
# Load the RE8 corpus and split it into train / validation / test sets.
# NOTE: this reuses (and overwrites) the split variable names of the CSTR section.
re8 = pd.read_csv('../dados/atv3/re8.csv')
# First split: hold out 30% of the rows (X_aux / y_aux); the remaining 70% is the training set.
# NOTE(review): no `stratify=` is passed, so class proportions may differ between the splits.
X_train, X_aux, y_train, y_aux = train_test_split(re8['text'], re8['class'], test_size=0.30, random_state=42)
# Second split: 66% of the hold-out becomes the test set, the rest the validation set.
X_val, X_test, y_val, y_test = train_test_split(X_aux, y_aux, test_size=0.66, random_state=42)

In [4]:
# Fit the label encoder on the full 'class' column (all splits) so each class name
# is mapped to one integer id.
re8_lb = LabelEncoder().fit(re8['class'])
# Number of distinct classes; passed as `num_labels` when the model is loaded.
num_labels = len(re8_lb.classes_)

In [5]:
# Sanity check: shape of the full frame followed by the train / validation / test label splits.
print(*(part.shape for part in (re8, y_train, y_val, y_test)), sep="\n")

(7674, 3)
(5371,)
(783,)
(1520,)


In [8]:
# Oversample the minority classes of the RE8 training split only
# (the validation and test splits are left untouched).
X_train_bal, y_train_bal = balance_df(X_train, y_train)
# Last expression of the cell: per-class counts after balancing.
y_train_bal.value_counts()

class
earn        2771
money       2771
interest    2771
ship        2771
acq         2771
trade       2771
crude       2771
grain       2771
Name: count, dtype: int64

In [9]:
# Load the tokenizer that belongs to the base checkpoint; it is used by
# `tokenize_function` and handed to the Trainer in the RE8 section.
tokenizer_re8 = AutoTokenizer.from_pretrained(model_base)

In [10]:
# Preview the word-piece tokens (special tokens included) of the first balanced training text.
tokenizer_re8.tokenize(X_train_bal.iloc[0], add_special_tokens=True)

['[CLS]',
 'un',
 '##iba',
 '##nco',
 '##rp',
 'in',
 '##c',
 'u',
 '##b',
 '##c',
 '##p',
 'regular',
 'divide',
 '##nd',
 'set',
 'q',
 '##tly',
 'di',
 '##v',
 'c',
 '##ts',
 'vs',
 'c',
 '##ts',
 'previously',
 'pay',
 'a',
 '##p',
 '##ril',
 'record',
 'march',
 're',
 '##uter',
 '[SEP]']

In [11]:
def tokenize_function(examples):
    """Encode `examples` with the RE8 tokenizer.

    The tokenizer is called with ``padding="max_length"`` and
    ``truncation=True``; whatever it returns is passed back unchanged.

    Note: this definition replaces the `tokenize_function` of the CSTR
    section, which was bound to the CSTR tokenizer.
    """
    encoding_options = {"padding": "max_length", "truncation": True}
    return tokenizer_re8(examples, **encoding_options)

In [12]:
# Build the train / validation / test datasets as plain lists of feature dicts:
# the tokenizer output of each text plus its integer-encoded class under "label".
train_ds = [
    dict(tokenize_function(text)) | {"label": int(label_id)}
    for text, label_id in zip(X_train_bal, re8_lb.transform(y_train_bal))
]
val_ds = [
    dict(tokenize_function(text)) | {"label": int(label_id)}
    for text, label_id in zip(X_val, re8_lb.transform(y_val))
]
test_ds = [
    dict(tokenize_function(text)) | {"label": int(label_id)}
    for text, label_id in zip(X_test, re8_lb.transform(y_test))
]

In [13]:
# Load the base checkpoint with a sequence-classification head sized to `num_labels`
# and move it to the selected device. The head weights are newly initialised
# (see the warning printed below), so the model must be fine-tuned before use.
model_re8 = AutoModelForSequenceClassification.from_pretrained(model_base, num_labels=num_labels, torch_dtype="auto").to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Run configuration for the RE8 fine-tuning.
batch_size = 16
# Number of full batches in the training set, i.e. log roughly once per epoch.
logging_step = len(train_ds) // batch_size
# Today's date (ISO format) prefixes the output directories.
model_ckpt = datetime.date.today().isoformat()
model_name = f"{model_ckpt}-finetune-re8"
model_name_final = f"{model_ckpt}-bert-finetune-re8-final"
def compute_metrics(pred):
    """Score a prediction object with accuracy and macro / micro F1.

    `pred` must expose `label_ids` (true class ids) and `predictions`
    (per-class scores); the predicted class is the argmax over the last axis.
    Returns a dict with the keys "accuracy", "f1_macro" and "f1_micro".

    Note: same logic as the `compute_metrics` defined in the CSTR section,
    which this definition replaces.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_hat),
        "f1_macro": f1_score(y_true, y_hat, average="macro"),
        "f1_micro": f1_score(y_true, y_hat, average="micro"),
    }


In [15]:
# Training configuration for the RE8 classifier: 2 epochs, evaluation and a
# checkpoint at the end of every epoch, and the best checkpoint reloaded when
# training finishes.
# `eval_strategy` replaces the deprecated `evaluation_strategy` alias, which newer
# transformers releases no longer accept; this notebook already relies on a release
# that provides `processing_class`, so `eval_strategy` is available.
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=2,
                                  learning_rate=1e-4,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  eval_strategy="epoch",
                                  save_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_step,
                                  push_to_hub=False,
                                  log_level="error",
                                  load_best_model_at_end=True)



In [18]:
# Wire the RE8 model, its datasets, the metric function and the tokenizer into a Trainer.
# The validation set is the evaluation dataset; the test set is kept for the final check.
trainer = Trainer(model=model_re8,
                  args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=train_ds,
                  eval_dataset=val_ds,
                  processing_class=tokenizer_re8
                  )

In [19]:
# Fine-tune on the balanced RE8 training set; per `training_args`, evaluation on the
# validation set and a checkpoint happen at the end of every epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Micro
1,0.2279,0.201838,0.96424,0.906374,0.96424
2,0.0526,0.096913,0.984674,0.961595,0.984674


TrainOutput(global_step=5542, training_loss=0.1402223959419969, metrics={'train_runtime': 5354.9255, 'train_samples_per_second': 8.279, 'train_steps_per_second': 1.035, 'total_flos': 1.1665920178126848e+16, 'train_loss': 0.1402223959419969, 'epoch': 2.0})

In [20]:
# Persist the fine-tuned RE8 model under the date-prefixed "final" directory name.
trainer.save_model(model_name_final)

In [21]:
# Final check on the held-out RE8 test set: the object returned by `trainer.predict`
# is scored with the same metric function that was used during training.
compute_metrics(trainer.predict(test_ds))

{'accuracy': 0.9756578947368421,
 'f1_macro': np.float64(0.9496402828135407),
 'f1_micro': np.float64(0.9756578947368421)}