In [None]:
!pip install transformers

In [None]:
import pandas as pd


Lectura de datos 

In [None]:
# Load the dataset from the Excel workbook into a DataFrame.
# Later cells read its 'idx', 'label' and 'sentence' columns.
FinalData = pd.read_excel('DatosFinalesModeloLimpios.xlsx')

In [None]:
# Render the loaded frame for a visual check.
display(FinalData)

Organizar las etiquetas (labels) y codificarlas como enteros para usarlas en el modelo.

In [None]:
# Category names; the position of each name in this list is its integer class id.
labels = [
    "PEDAGOGIA",
    "EVALUACIÓN",
    "GENERAL",
    "EXPERIENCIA",
    "COMPORTAMIENTO",
    "CONOCIMIENTO",
]

In [None]:
# Number of target classes, derived from the label list.
NUM_LABELS= len(labels)

In [None]:
# Two-way lookup between integer class ids and label names.
id2label = dict(enumerate(labels))
label2id = {name: class_id for class_id, name in id2label.items()}

In [None]:
# Encode each textual label (surrounding whitespace stripped) as its integer class id.
FinalData["labels"] = [label2id[raw_label.strip()] for raw_label in FinalData["label"]]

In [None]:
# Render the frame again, now including the integer 'labels' column.
display(FinalData)

In [None]:
# Working copy without the 'idx' and textual 'label' columns.
Data = FinalData.drop(columns=['idx', 'label'])

In [None]:
# Render the reduced frame for a visual check.
display(Data)

División en conjuntos de entrenamiento (train), validación (val) y prueba (test)

In [None]:
# Model input (the 'sentence' column) and target (the integer 'labels' column).
X, y = Data['sentence'], Data['labels']

In [None]:
!pip install sklearn 

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
# Split the remaining 80% again: 25% of it becomes validation, i.e. 60/20/20 overall.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=123)

In [None]:
# Drop validation rows whose sentence is missing, and restrict y_val to the
# surviving index so sentences and labels stay aligned one-to-one.
X_val = X_val.dropna()
y_val = y_val.loc[X_val.index]

In [None]:
# Drop test rows whose sentence is missing, and restrict y_test to the
# surviving index so sentences and labels stay aligned one-to-one.
X_test = X_test.dropna()
y_test = y_test.loc[X_test.index]

In [None]:
# Materialize each split's targets as a plain list (indexed by position later on).
train_labels, val_labels, test_labels = (
    list(split_targets) for split_targets in (y_train, y_val, y_test)
)

In [None]:
# Number of test labels.
len(test_labels)

In [None]:
# Shape of the training sentences.
X_train.shape

In [None]:
# Shape of the test sentences.
X_test.shape

In [None]:
# Shape of the validation sentences.
X_val.shape

In [None]:
from collections import Counter
# Class-id frequencies in the training labels, most frequent first.
train = Counter(train_labels)
train.most_common()

In [None]:
# Class-id frequencies in the validation labels, most frequent first.
val =Counter(val_labels)
val.most_common()

In [None]:
# Class-id frequencies in the test labels, most frequent first.
test = Counter(test_labels)
test.most_common()

Preprocesamiento de datos para el modelo

In [None]:
!pip install transformers

In [None]:
from transformers import AutoTokenizer

In [None]:
# Padding and truncation are call-time options and are set where the tokenizer
# is invoked; the 512-token limit is configured here through model_max_length
# so that truncation=True has a concrete length to truncate to.
tokenizer = AutoTokenizer.from_pretrained(
    "dccuchile/bert-base-spanish-wwm-cased",
    model_max_length=512,
)

In [None]:
from transformers import BertForSequenceClassification, AutoModelForSequenceClassification

In [None]:
# Load the pretrained encoder with a classification head sized for NUM_LABELS.
# The id<->name mappings are stored in the model config so that the saved
# checkpoint (and any pipeline built from it) reports the real category names
# instead of generic LABEL_n placeholders.
modelo = AutoModelForSequenceClassification.from_pretrained(
    "dccuchile/bert-base-spanish-wwm-cased",
    num_labels=NUM_LABELS,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

In [None]:
# Tokenize each split. max_length is given explicitly so truncation does not
# depend on whatever length limit the loaded tokenizer happens to carry.
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(list(X_val), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=512)

In [None]:
import torch

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels."""

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> per-example values
        # labels: sequence with one label per example
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors, including a 'labels' entry."""
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example

# Wrap each split's encodings and labels in a Dataset.
train_dataset, val_dataset, test_dataset = (
    Dataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)



In [None]:
# NOTE(review): this cell repeats the dataset construction that already runs at
# the end of the cell defining the Dataset class; it builds equivalent objects.
train_dataset = Dataset(train_encodings, train_labels)
val_dataset = Dataset(val_encodings, val_labels)
test_dataset = Dataset(test_encodings, test_labels)

In [None]:
# Number of training examples (Dataset.__len__ reports the label count).
len(train_dataset)

In [None]:
# Number of validation examples (Dataset.__len__ reports the label count).
len(val_dataset)

In [None]:
!pip install datasets

In [None]:
!pip install seqeval

In [None]:
from sklearn.metrics import accuracy_score,  precision_recall_fscore_support, f1_score

In [None]:
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a prediction object.

    Args:
        pred: object exposing `label_ids` (true class ids) and `predictions`
            (scores whose argmax over the last axis is the predicted class).

    Returns:
        dict with the keys "accuracy" and "f1".
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_hat),
        "f1": f1_score(y_true, y_hat, average="weighted"),
    }


In [None]:
import inspect

from transformers import TrainingArguments, Trainer

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases. Pick whichever keyword the installed version accepts so this cell
# runs regardless of which release the unpinned install resolved to.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

args = TrainingArguments(
    output_dir='./ModeloClasificación',          # checkpoints are written here
    num_train_epochs=10,
    #learning_rate=1e-5,
    per_device_train_batch_size= 8,
    per_device_eval_batch_size=20,
    weight_decay=0.01,
    logging_dir= './multi-class-logs',           # read by the TensorBoard cell
    disable_tqdm=False,
    # Log, evaluate and checkpoint on the same 100-step cadence; evaluation and
    # saving must line up for load_best_model_at_end to work.
    logging_steps=100,
    save_steps=100,
    eval_steps=100,
    report_to='tensorboard',
    logging_strategy='steps',
    load_best_model_at_end=True,
    **{_eval_strategy_key: "steps"},
)

In [None]:
# Trainer wired to the model, the training arguments, the train/validation
# datasets and the metric function.
trainer = Trainer(
    model=modelo,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)


In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
# Load the TensorBoard notebook extension and open it on the training log directory.
%load_ext tensorboard
%tensorboard --logdir multi-class-logs/

In [None]:
# Evaluate on every split; `q` holds one metrics dict per split in train, val, test order.
q = [
    trainer.evaluate(eval_dataset=split_dataset)
    for split_dataset in (train_dataset, val_dataset, test_dataset)
]

In [None]:
# One row per split; show only the first five metric columns.
pd.DataFrame(q, index=["train","val","test"]).iloc[:,:5]

In [None]:
# Run inference on the test split and print the shapes of the raw
# predictions and of the label ids.
predictions = trainer.predict(test_dataset)
print(predictions.predictions.shape, predictions.label_ids.shape)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
def plot_confusion_matrix(y_preds, y_true, labels):
    """Plot a row-normalized confusion matrix.

    Args:
        y_preds: predicted class values, one per example.
        y_true: true class values, one per example.
        labels: display names used as tick labels.
    """
    # When every observed value is a valid position in `labels`, pin the class
    # ids so the matrix always has one row/column per display name, even if a
    # class is missing from this split. Otherwise keep the library default
    # (classes inferred from the data), as before.
    class_ids = list(range(len(labels)))
    observed = set(y_true) | set(y_preds)
    pinned_ids = class_ids if observed <= set(class_ids) else None

    cm = confusion_matrix(y_true, y_preds, labels=pinned_ids, normalize="true")
    fig, ax = plt.subplots(figsize=(16, 12))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
    plt.title("Normalized confusion matrix")
    plt.show()

In [None]:
# Predict on the test split; y_preds is derived from this result further down.
preds_output = trainer.predict(test_dataset)


In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Predicted class id per test example: index of the highest score in each row.
y_preds = preds_output.predictions.argmax(axis=1)
print(y_preds)

In [None]:
# Plot the test-split confusion matrix, using the category names as tick labels.
plot_confusion_matrix(y_preds, test_labels, labels)

In [None]:
# Save the trained model to the export directory.
modelo.save_pretrained("EtiquetadoTexto/EtiquetadoTexto")

In [None]:
# Save the tokenizer next to the model so the directory can be loaded on its own.
tokenizer.save_pretrained("EtiquetadoTexto/EtiquetadoTexto")

In [None]:
from transformers import AutoModelForSequenceClassification
 
# Reload the tokenizer and the model from the export directory.
load_tokenizer = AutoTokenizer.from_pretrained("EtiquetadoTexto/EtiquetadoTexto")
load_model = AutoModelForSequenceClassification.from_pretrained("EtiquetadoTexto/EtiquetadoTexto")

In [None]:
from transformers import pipeline

In [None]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "EtiquetadoTexto/EtiquetadoTexto"
# Kept under its own name: rebinding `tokenizer` to a path string would clobber
# the tokenizer object that the encoding cell calls, breaking any re-run of it.
tokenizer_checkpoint = "EtiquetadoTexto/EtiquetadoTexto"
# Inputs longer than 512 tokens are truncated before classification.
text_classifier = pipeline(
    "text-classification",
    model=model_checkpoint,
    tokenizer=tokenizer_checkpoint,
    max_length=512,
    truncation=True,
)