# Clasificación multiclase con Transformers

Utilizando BETO

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn import metrics, model_selection
from transformers import AutoTokenizer, BertForSequenceClassification,AutoModelForSequenceClassification, BertConfig, TrainingArguments, Trainer, EarlyStoppingCallback,  DataCollatorWithPadding, get_linear_schedule_with_warmup
from datasets import load_metric,Dataset, DatasetDict, ClassLabel
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing


In [2]:
df = pd.read_excel("data/mercado-objetivo/mercados_procesados.xlsx")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12069 entries, 0 to 12068
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   codigo                12069 non-null  object
 1   mercado_objetivo      12069 non-null  object
 2   resumen_del_proyecto  12067 non-null  object
dtypes: object(3)
memory usage: 283.0+ KB


In [3]:
df = df.rename({'resumen_del_proyecto':'text', 'mercado_objetivo':'label'}, axis=1)
df = df[['text', 'label']]
df = df.dropna()
df.head()

Unnamed: 0,text,label
0,el ecommerce va en alza en chile y latam graci...,Telecomunicaciones y tecnologías de la informa...
1,las características organolépticas es un facto...,Vitivinícola
2,la máquina de perforación blind hole posee un ...,Minería y metalurgia extractiva
3,actualmente un grupo muy reducido 15000 de los...,Telecomunicaciones y tecnologías de la informa...
4,para lavar e higienizar prendas médicas hospit...,"Química, caucho y plásticos (excepto industria..."


In [4]:
df = df[6000:12000]
df = df.reset_index(drop=True)

In [5]:
le = preprocessing.LabelEncoder()
df.label = le.fit_transform(df.label.values)

In [None]:
df.head()

In [None]:
len(np.unique(df.label.values))

In [None]:
plt.figure(figsize=(20,8), dpi=60)
plt.hist(le.inverse_transform(df.label.values), bins=300)
plt.xticks(rotation = 90)
plt.show()

In [6]:
class args:
    model = "dccuchile/bert-base-spanish-wwm-cased"
    epochs = 5 #2
    batch_size = 64 #32
    learning_rate = 2e-2  #2e-5 
    train_batch_size = 64 
    valid_batch_size = 64
    max_len = 192 # 128
    accumulation_steps = 1
    test_size = 0.3 #03
    num_labels = len(np.unique(df.label.values))
    type = {'binary':'binary', 'multiclass':'weighted', 'micro':'micro', 'macro':'macro','multilabel':'samples'}
    device =  torch.device('cpu')
    dropout = 0.2

In [7]:
tokenizer = AutoTokenizer.from_pretrained(args.model, use_cache=False, architectures='BertForSequenceClassification')

In [None]:
tokenizer

In [8]:
df_train, df_valid = model_selection.train_test_split(df, test_size=args.test_size, stratify=df.label.values)

In [9]:
data_train = Dataset.from_pandas(df_train)
data_valid = Dataset.from_pandas(df_valid)

In [10]:
Bertdf = DatasetDict()
Bertdf['train'] = data_train
Bertdf['test'] = data_valid

In [None]:
Bertdf['train'][0]

In [11]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=args.max_len, padding='max_length')

In [12]:
tokenized_df= Bertdf.map(preprocess_function, batched=True)

100%|██████████| 5/5 [00:01<00:00,  2.51ba/s]
100%|██████████| 2/2 [00:00<00:00,  2.25ba/s]


In [None]:
tokenized_df

In [None]:
tokenized_df['train'][0]

In [None]:
tokenizer.convert_ids_to_tokens(tokenized_df['train']['input_ids'][3]) 

In [None]:
#tokenized_df['train'].features['label'] =  ClassLabel(num_classes=args.num_labels, names=le.classes_, names_file=None, id=None)

In [None]:
tokenized_df['train']

In [None]:
token_lens = []

for txt in df.text:
  tokens = tokenizer.encode(txt, max_length=512, truncation=True)
  token_lens.append(len(tokens))

In [None]:
sns.histplot(token_lens, kde=True)
plt.xlim([0, 512])
plt.xlabel('Token count')

In [13]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, max_length=args.max_len, padding='max_length')

In [14]:
config = BertConfig.from_pretrained(args.model, num_labels= args.num_labels, classifier_dropout = args.dropout, 
                                    problem_type="single_label_classification", use_cache=False)
model = BertForSequenceClassification(config=config).to(args.device)  # remember to move to MPS!

In [15]:

#batches_per_epoch = len(tokenized_df["train"]) // args.batch_size
total_train_steps = int(len(tokenized_df["train"]) / args.batch_size * args.epochs)
total_train_steps

328

In [18]:
optimizer = torch.optim.AdamW(model.parameters(), lr=args.learning_rate, weight_decay=1e-2)
num_training_steps = args.epochs * len(tokenized_df["train"]) 
lr_scheduler = get_linear_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
optimizers=optimizer, lr_scheduler

In [None]:
metric = load_metric('glue', 'mrpc')

In [17]:
def compute_metrics(eval_pred):
   load_accuracy = load_metric("accuracy")
   load_f1 = load_metric("f1")
  
   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
   f1 = load_f1.compute(predictions=predictions, references=labels, average = args.type['micro'])["f1"]
   return {"accuracy": accuracy, "f1": f1}

In [None]:
class BETOTrainer(Trainer):
        
        def compute_loss(self, model, inputs, return_outputs=False):
                labels = inputs.get("labels")
                # forward pass
                outputs = model(**inputs)
                logits = outputs.get("logits")
                # compute custom loss (suppose one has 3 labels with different weights)
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
                return (loss, outputs) if return_outputs else loss

In [21]:
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir = 'True',
    evaluation_strategy="steps",
    eval_steps=50,
    logging_steps = 50,
    do_train=True,
    learning_rate=args.learning_rate,
    per_device_train_batch_size=args.train_batch_size,
    per_device_eval_batch_size=args.valid_batch_size,
    num_train_epochs=args.epochs,
    weight_decay=0.01,
    metric_for_best_model = 'f1',
    load_best_model_at_end = True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_df["train"],
    eval_dataset=tokenized_df["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    optimizers= optimizers,
    #callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()
trainer.save_model(f'results/')

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4200
  Num Epochs = 5
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 330
  1%|          | 3/330 [04:50<8:36:58, 94.86s/it] 

KeyboardInterrupt: 

In [None]:
outputs = trainer.predict(tokenized_df['test'])

In [None]:
predictions = outputs.predictions
predictions = np.argmax(predictions, axis=1)
predictions

In [None]:
np.array(df_train['label'].values)

In [None]:
predictions = outputs.predictions
predictions[1]


In [None]:
outputs.metrics

In [None]:
def show_confusion_matrix(confusion_matrix):
  hmap = sns.heatmap(confusion_matrix, annot=True, cmap="Blues")
  hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0, ha='right')
  hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=30, ha='right')
  plt.ylabel('True sentiment')
  plt.xlabel('Predicted sentiment')

class_names = le.classes_
cm = metrics.confusion_matrix(outputs.label_ids, predictions)
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
show_confusion_matrix(df_cm)

In [None]:
predictions