In [None]:
!pip install transformers
!pip install peft
!pip install evaluate



In [None]:
# Mount Google Drive at /gdrive (Colab-only helper).
from google.colab import drive
drive.mount('/gdrive')
# Switch the working directory to the Drive folder so that the relative file
# names used by later cells (e.g. 'tweets.xlsx') resolve against it.
%cd /gdrive/MyDrive/datos

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive/MyDrive/datos


El código carga un modelo preentrenado de clasificación de secuencias, "RobertaForSequenceClassification", del paquete "transformers", junto con su tokenizador "RobertaTokenizer" del mismo paquete. El modelo parte del punto de control "roberta-large", se adapta con LoRA mediante el paquete "peft" y se configura para clasificar en 5 categorías. Finalmente, el modelo se mueve a la GPU para una mayor velocidad de procesamiento:

In [None]:
import pandas as pd
# Read the tweets spreadsheet from the current working directory.
df = pd.read_excel('tweets.xlsx')
# Bare last expression so the notebook renders the loaded frame.
df

Unnamed: 0,name,text
0,elonmusk,Highly recommend “The Explorers” podcast for s...
1,JeffBezos,@BlueMoonBrewCo First round is on me. (2/2) an...
2,BillGates,Congratulations on a remarkable win in #India...
3,MikeBloomberg,American greatness began with immigrants. Choo...
4,MichaelDell,@sakacc @DellTech @VMware Enjoy the very well-...
...,...,...
108,sebastianpinera,Health sector: those belonging to the 40% most...
109,swimmym,In addition to the content of the choir compet...
110,yousuck2020,I bought a bookshelf in the shape of a cypress...
111,salinas,@CitlaHM @ferbelaunzaran @lopezobrador_ Cenado...


In [None]:
from transformers import RobertaForSequenceClassification
from transformers import RobertaTokenizer
import numpy as np
import random
import torch
from datasets import load_dataset
from datasets import Dataset, DatasetDict
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    LoraConfig,
    PeftType,
    PrefixTuningConfig,
    PromptEncoderConfig,
)

# Preliminary configuration (LoRA adapter settings)
model_name_or_path = "roberta-large"
peft_type = PeftType.LORA
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    inference_mode=False,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
)

# Dataframe columns that are carried over into the dataset
column_names = ['name', 'text']

# Turn the selected columns into a {column name: list of values} mapping
data = dict((name, df[name].tolist()) for name in column_names)

# Build the Hugging Face dataset from that mapping
dataset = Dataset.from_dict(data)

# Checkpoints whose name contains "gpt", "opt" or "bloom" are padded on the
# left; every other checkpoint is padded on the right.
padding_side = (
    "left"
    if any(family in model_name_or_path for family in ("gpt", "opt", "bloom"))
    else "right"
)

# Tokenizer
tokenizer = RobertaTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)
# Reuse the end-of-sequence token for padding when no padding token is defined
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of a batch with the module-level tokenizer.

    Truncation is enabled with a maximum length of 512; no padding argument is
    passed here. Returns the tokenizer's output unchanged.
    """
    return tokenizer(examples["text"], truncation=True, max_length=512)
# Tokenize the whole dataset in batches; the "name" column is dropped from the result.
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["name"])

def collate_fn(examples):
    """Pad a list of tokenized examples to the longest one and return PyTorch tensors.

    Uses the module-level ``tokenizer`` to do the padding.
    """
    padded_batch = tokenizer.pad(examples, padding='longest', return_tensors="pt")
    return padded_batch

# Architecture / run settings
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on a runtime without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
num_epochs = 4
batch_size = 1
lr = 5e-5
seed = 42
# Seed every random number generator used by the notebook
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
num_labels = 5
# Load the base model from the same checkpoint name the tokenizer was built from
model = RobertaForSequenceClassification.from_pretrained(model_name_or_path, num_labels=num_labels)
model.task_name = "classification"
# Wrap the base model with the LoRA adapters described by peft_config
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
# Last expression: moves the model to the selected device and displays its structure
model.to(device)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Map:   0%|          | 0/113 [00:00<?, ? examples/s]

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 2,895,882 || all params: 357,206,026 || trainable%: 0.8107035685898535


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 1024, padding_idx=1)
          (position_embeddings): Embedding(514, 1024, padding_idx=1)
          (token_type_embeddings): Embedding(1, 1024)
          (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-23): 24 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(
                    in_features=1024, out_features=1024, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                

In [None]:
import torch
model_path = "modelo_final.pt"
# Deserialize the checkpoint onto the CPU first: this works on runtimes without
# CUDA even if the file was saved from a GPU, and load_state_dict then copies
# the values into the model's existing parameters on whatever device they are.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
model.load_state_dict(torch.load(model_path, map_location="cpu"))

<All keys matched successfully>

In [None]:
def get_labels():
    """Return the class labels, in the order they are paired with the model's output probabilities."""
    return ['ext', 'neu', 'agr', 'con', 'opn']

def predict(model, tokenizer, text):
    """Score a single text and return ``[(label, probability), ...]``.

    Args:
        model: Callable torch module; the first element of its output is taken
            as the logits, one row per input sequence.
        tokenizer: Object whose ``encode`` method returns the input ids as a
            PyTorch tensor.
        text: The text to classify (truncated to 512 tokens).

    Returns:
        A list of ``(label, probability)`` tuples, one per label from
        ``get_labels()``; the probabilities are the softmax of the logits.

    Side effects:
        Puts ``model`` into evaluation mode.
    """
    # Dropout layers left in training mode stay active even under no_grad and
    # would make the scores change from call to call.
    model.eval()

    # Follow the model's own device instead of assuming CUDA is present.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # Encode the text
    input_ids = tokenizer.encode(text, return_tensors="pt", max_length=512, truncation=True).to(device)

    # Forward pass without gradient tracking, then logits -> probabilities
    with torch.no_grad():
        outputs = model(input_ids)
        logits = outputs[0]
        probs = torch.softmax(logits, dim=1)

    # Only one sequence was passed in, so take the first (only) row
    class_probs = probs[0].tolist()

    return list(zip(get_labels(), class_probs))

In [None]:
# Score every tweet; each entry of `results` is the list of (label, probability) pairs for one row.
results = df['text'].apply(
    lambda tweet_text: predict(model, tokenizer, tweet_text)
)

In [None]:
import pandas as pd

# Build one record per scored row: the author's name plus one column per label.
# Series.items() replaces Series.iteritems(), which is deprecated and no longer
# exists in pandas 2.0.
data = [
    {'name': df.loc[idx, 'name'], **dict(label_probs)}
    for idx, label_probs in results.items()
]

# Create the new dataframe from the list of records
df2 = pd.DataFrame(data)


  for i, row in results.iteritems():


In [None]:
# Display the result table: one row per input tweet with its author name and class probabilities.
df2

Unnamed: 0,name,ext,neu,agr,con,opn
0,elonmusk,0.069370,0.039898,0.072829,0.030814,0.787089
1,JeffBezos,0.101953,0.107155,0.184817,0.118599,0.487475
2,BillGates,0.234365,0.101303,0.320444,0.217930,0.125959
3,MikeBloomberg,0.143344,0.111059,0.275309,0.155646,0.314642
4,MichaelDell,0.211101,0.066243,0.208494,0.116558,0.397603
...,...,...,...,...,...,...
108,sebastianpinera,0.098655,0.098517,0.321092,0.340611,0.141125
109,swimmym,0.119296,0.123852,0.217929,0.211007,0.327916
110,yousuck2020,0.118954,0.084810,0.251754,0.172431,0.372052
111,salinas,0.127424,0.142015,0.173452,0.086544,0.470566


In [None]:
# Descarga el archivo
!pip install openpyxl
import openpyxl

import pandas as pd

with pd.ExcelWriter('df2.xlsx', engine='openpyxl') as writer:
    df2.to_excel(writer)

from google.colab import files
files.download('df2.xlsx')




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>