In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import torch
from sklearn.metrics import accuracy_score, f1_score
from transformers import Trainer, TrainingArguments
import datetime


#### Define alguns parâmetros para ambos os classificadores

In [2]:
model_base = "google-bert/bert-base-cased"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


#### Balanceamento da classe no dataset de treino

In [3]:
def balance_df(X, y):
    X_train_bal = X.copy()
    y_train_bal = y.copy()
    count_class = pd.DataFrame(y.value_counts()).reset_index()
    max_class_count = count_class.iloc[0]['count']
    count_class = count_class.iloc[1:]
    for _, row in count_class.iterrows():
        sample = y[y == row['class']].sample(max_class_count - row['count'], replace=True)
        X_train_bal = pd.concat([X_train_bal, X_train[sample.index]])
        y_train_bal = pd.concat([y_train_bal, sample])

    return (X_train_bal, y_train_bal)


# CSTR

In [4]:
cstr = pd.read_csv('../dados/atv3/CSTR.csv')
X_train, X_aux, y_train, y_aux = train_test_split(cstr['text'], cstr['class'], test_size=0.30, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_aux, y_aux, test_size=0.66, random_state=42)

In [5]:
cstr_lb = LabelEncoder().fit(cstr['class'])
num_labels = len(cstr_lb.classes_)

In [6]:
print(cstr.shape)
print(y_train.shape)
print(y_val.shape)
print(y_test.shape)

(299, 3)
(209,)
(30,)
(60,)


In [7]:
X_train_bal, y_train_bal = balance_df(X_train, y_train)
y_train_bal.value_counts()

class
Robotics                   85
ArtificiallIntelligence    85
Theory                     85
Systems                    85
Name: count, dtype: int64

In [8]:
tokenizer_cstr = AutoTokenizer.from_pretrained(model_base)

In [9]:
tokenizer_cstr.tokenize(X_train_bal.iloc[0], add_special_tokens=True)

['[CLS]',
 'One',
 'of',
 'the',
 'biggest',
 'challenges',
 'in',
 'systems',
 'ne',
 '##uro',
 '##science',
 'is',
 'a',
 'satisfactory',
 'model',
 'of',
 'neural',
 'signaling',
 '.',
 'From',
 'rate',
 'coding',
 'to',
 'temporal',
 'coding',
 ',',
 'models',
 'of',
 'neural',
 'signaling',
 'have',
 'been',
 'challenged',
 'by',
 'the',
 'fact',
 'that',
 'neurons',
 'fire',
 'highly',
 'irregular',
 '##ly',
 '.',
 'A',
 'typical',
 'interpretation',
 'of',
 'the',
 'var',
 '##iability',
 'is',
 '`',
 '`',
 'noise',
 'other',
 'than',
 'signal',
 "'",
 "'",
 ',',
 'which',
 'not',
 'only',
 'has',
 'difficulty',
 'accounting',
 'for',
 'the',
 'speed',
 ',',
 'accuracy',
 ',',
 'efficiency',
 'and',
 'complexity',
 'of',
 'biological',
 'systems',
 ',',
 'but',
 'is',
 'also',
 'con',
 '##tra',
 '##dict',
 '##ed',
 'by',
 'recent',
 'studies',
 'that',
 'show',
 'both',
 'spike',
 'generation',
 'and',
 'transmission',
 'are',
 'highly',
 'reliable',
 '.',
 'Challenge',
 '##d',
 

In [10]:
def tokenize_function(examples):
    return tokenizer_cstr(examples, padding="max_length", truncation=True)

In [11]:
train_ds = [dict(tokenize_function(X_train_bal.iloc[i])) | {"label": int(c)} for i, c in enumerate(cstr_lb.transform(y_train_bal))]
val_ds = [dict(tokenize_function(X_val.iloc[i])) | {"label": int(c)} for i, c in enumerate(cstr_lb.transform(y_val))]
test_ds = [dict(tokenize_function(X_test.iloc[i])) | {"label": int(c)} for i, c in enumerate(cstr_lb.transform(y_test))]

In [12]:
model_cstr = AutoModelForSequenceClassification.from_pretrained(model_base, num_labels=num_labels, torch_dtype="auto").to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
batch_size = 8
logging_step = len(train_ds)//batch_size
model_ckpt = str(datetime.datetime.today().date().isoformat())
model_name = f"{model_ckpt}-finetune-cstr"
model_name_final = f"{model_ckpt}-bert-finetune-cstr-final"
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1_macro = f1_score(labels, preds, average="macro")
    f1_micro = f1_score(labels, preds, average="micro")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1_macro": f1_macro, "f1_micro": f1_micro}


In [14]:
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=2,
                                  learning_rate=1e-4,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  eval_strategy="epoch",
                                  save_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_step,
                                  push_to_hub=False,
                                  log_level="error",
                                  load_best_model_at_end=True)

In [15]:
trainer = Trainer(model=model_cstr,
                  args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=train_ds,
                  eval_dataset=val_ds,
                  processing_class=tokenizer_cstr
                  )

In [16]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Micro
1,0.7429,0.801921,0.8,0.771429,0.8
2,0.2051,0.827584,0.833333,0.814614,0.833333


TrainOutput(global_step=86, training_loss=0.46993461736412934, metrics={'train_runtime': 98.9511, 'train_samples_per_second': 6.872, 'train_steps_per_second': 0.869, 'total_flos': 178918730465280.0, 'train_loss': 0.46993461736412934, 'epoch': 2.0})

In [17]:
trainer.save_model(model_name_final)

In [18]:
compute_metrics(trainer.predict(test_ds))

{'accuracy': 0.8,
 'f1_macro': np.float64(0.7928741235593271),
 'f1_micro': np.float64(0.8)}

# RE8

In [19]:
re8 = pd.read_csv('../dados/atv3/re8.csv')
X_train, X_aux, y_train, y_aux = train_test_split(re8['text'], re8['class'], test_size=0.30, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_aux, y_aux, test_size=0.66, random_state=42)

In [20]:
re8_lb = LabelEncoder().fit(re8['class'])
num_labels = len(re8_lb.classes_)

In [21]:
print(re8.shape)
print(y_train.shape)
print(y_val.shape)
print(y_test.shape)

(7674, 3)
(5371,)
(783,)
(1520,)


In [22]:
X_train_bal, y_train_bal = balance_df(X_train, y_train)
y_train_bal.value_counts()

class
earn        2771
money       2771
interest    2771
ship        2771
acq         2771
trade       2771
crude       2771
grain       2771
Name: count, dtype: int64

In [23]:
tokenizer_re8 = AutoTokenizer.from_pretrained(model_base)

In [24]:
tokenizer_re8.tokenize(X_train_bal.iloc[0], add_special_tokens=True)

['[CLS]',
 'un',
 '##iba',
 '##nco',
 '##rp',
 'in',
 '##c',
 'u',
 '##b',
 '##c',
 '##p',
 'regular',
 'divide',
 '##nd',
 'set',
 'q',
 '##tly',
 'di',
 '##v',
 'c',
 '##ts',
 'vs',
 'c',
 '##ts',
 'previously',
 'pay',
 'a',
 '##p',
 '##ril',
 'record',
 'march',
 're',
 '##uter',
 '[SEP]']

In [25]:
def tokenize_function(examples):
    return tokenizer_re8(examples, padding="max_length", truncation=True)

In [26]:
train_ds = [dict(tokenize_function(X_train_bal.iloc[i])) | {"label": int(c)} for i, c in enumerate(re8_lb.transform(y_train_bal))]
val_ds = [dict(tokenize_function(X_val.iloc[i])) | {"label": int(c)} for i, c in enumerate(re8_lb.transform(y_val))]
test_ds = [dict(tokenize_function(X_test.iloc[i])) | {"label": int(c)} for i, c in enumerate(re8_lb.transform(y_test))]

In [27]:
model_re8 = AutoModelForSequenceClassification.from_pretrained(model_base, num_labels=num_labels, torch_dtype="auto").to(device)

In [28]:
batch_size = 8
logging_step = len(train_ds)//batch_size
model_ckpt = str(datetime.datetime.today().date().isoformat())
model_name = f"{model_ckpt}-finetune-re8"
model_name_final = f"{model_ckpt}-bert-finetune-re8-final"
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1_macro = f1_score(labels, preds, average="macro")
    f1_micro = f1_score(labels, preds, average="micro")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1_macro": f1_macro, "f1_micro": f1_micro}


In [29]:
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=2,
                                  learning_rate=1e-4,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  eval_strategy="epoch",
                                  save_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_step,
                                  push_to_hub=False,
                                  log_level="error",
                                  load_best_model_at_end=True)

In [30]:
trainer = Trainer(model=model_re8,
                  args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=train_ds,
                  eval_dataset=val_ds,
                  processing_class=tokenizer_re8
                  )

In [31]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Micro
1,2.0076,2.04358,0.028097,0.006832,0.028097
2,2.0993,2.101748,0.010217,0.002528,0.010217


TrainOutput(global_step=5542, training_loss=2.0534499687048897, metrics={'train_runtime': 5547.4652, 'train_samples_per_second': 7.992, 'train_steps_per_second': 0.999, 'total_flos': 1.1665920178126848e+16, 'train_loss': 2.0534499687048897, 'epoch': 2.0})

In [32]:
trainer.save_model(model_name_final)

In [33]:
compute_metrics(trainer.predict(test_ds))

{'accuracy': 0.034210526315789476,
 'f1_macro': np.float64(0.00826972010178117),
 'f1_micro': np.float64(0.034210526315789476)}