In [1]:
!pip install -q transformers datasets accelerate textattack evaluate



In [2]:
# import importlib, sys;
# sys.modules.pop('transformers', None)
# import transformers, datasets, torch
# print(transformers.__version__)   # debería mostrar 4.39.x (o similar)


In [3]:
from transformers import BertTokenizerFast, BertForSequenceClassification

model_name = "bert-base-uncased"      # 110 M parámetros, inglés minúsculas
tokenizer  = BertTokenizerFast.from_pretrained(model_name)
model      = BertForSequenceClassification.from_pretrained(
                model_name,
                num_labels=2)         # positivo / negativo


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from datasets import load_dataset

ds = load_dataset("glue", "sst2")     # splits: train/validation/test

def tokenize(batch):
    return tokenizer(batch["sentence"],
                     truncation=True,
                     padding="max_length",
                     max_length=128)

ds = ds.map(tokenize, batched=True)
ds = ds.rename_column("label", "labels")   # lo que espera Transformers
ds.set_format(type="torch",
              columns=["input_ids", "attention_mask", "labels"])


Map:   0%|          | 0/872 [00:00<?, ? examples/s]

In [5]:
# import importlib, sys, inspect
# import transformers

# # fuerza recarga de todo el paquete transformers
# sys.modules.pop('transformers', None)
# transformers = importlib.import_module('transformers')
# from transformers import TrainingArguments

# print("version:", transformers.__version__)
# print("'evaluation_strategy' in signature ->",
#       "evaluation_strategy" in inspect.signature(TrainingArguments).parameters)


In [6]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    output_dir="bert_sst2",

    # ⇩ NUEVOS NOMBRES ⇩
    eval_strategy   ="epoch",      # ← antes “evaluation_strategy”
    save_strategy   ="epoch",
    logging_strategy="epoch",

    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=2,
    report_to="none",          # ← doble seguridad

)


In [7]:
import os
os.environ["WANDB_DISABLED"] = "true"

import numpy as np
import evaluate
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return metric.compute(predictions=preds, references=labels)

trainer = Trainer(
    model,
    args,
    train_dataset = ds["train"],
    eval_dataset  = ds["validation"],
    tokenizer     = tokenizer,
    compute_metrics = compute_metrics
)

trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2205,0.243783,0.925459
2,0.119,0.307944,0.925459
3,0.075,0.331852,0.928899


TrainOutput(global_step=12630, training_loss=0.13816237642193066, metrics={'train_runtime': 4499.2306, 'train_samples_per_second': 44.907, 'train_steps_per_second': 2.807, 'total_flos': 1.329019985058048e+16, 'train_loss': 0.13816237642193066, 'epoch': 3.0})

In [8]:
results = trainer.evaluate(ds["validation"])
print("Accuracy clean (val) =", results["eval_accuracy"])

model.save_pretrained("bert_clean")      # pesos y config
tokenizer.save_pretrained("bert_clean")  # tokenizer


Accuracy clean (val) = 0.9288990825688074


('bert_clean/tokenizer_config.json',
 'bert_clean/special_tokens_map.json',
 'bert_clean/vocab.txt',
 'bert_clean/added_tokens.json',
 'bert_clean/tokenizer.json')

In [11]:
# ==========================================================
#  SUBE NOTEBOOK + HISTORY  (pesos van a Drive, NO a Git)
# ==========================================================
import os, shutil, subprocess, getpass, pathlib, pandas as pd, glob

REPO_URL = "https://github.com/guillegrc/TFM.git"
REPO_DIR = "/content/TFM"

NOTEBOOK_PATH = "/content/drive/MyDrive/Colab Notebooks/BERT_SST-2.ipynb"
HIST_CSV      = "/content/history_bert.csv"
DRIVE_WEIGHTS_DST = "/content/drive/MyDrive/modelos_TFM/bert_clean"

# 1) guardar historia
pd.DataFrame(trainer.state.log_history).to_csv(HIST_CSV, index=False)
print("✔ history_bert.csv guardado")

# 2) clonar repo si hace falta
if not pathlib.Path(REPO_DIR, ".git").exists():
    token = getpass.getpass("🔑 PAT (deja vacío si el repo es público): ")
    clone_url = REPO_URL.replace("https://", f"https://{token}@") if token else REPO_URL
    subprocess.run(["git","clone", clone_url, REPO_DIR], check=True)
else:
    print("✔ Repo ya clonado")

os.chdir(REPO_DIR)

# 3) copiar cuaderno y CSV al repo
shutil.copy2(NOTEBOOK_PATH, pathlib.Path(REPO_DIR, pathlib.Path(NOTEBOOK_PATH).name))
shutil.copy2(HIST_CSV,      pathlib.Path(REPO_DIR, "history_bert.csv"))
print("✔ Notebook y history copiados al repo")

# 4) copiar pesos a Drive (siempre se sobreescriben)
if pathlib.Path(DRIVE_WEIGHTS_DST).exists():
    shutil.rmtree(DRIVE_WEIGHTS_DST)
shutil.copytree("/content/bert_clean", DRIVE_WEIGHTS_DST)
print("✔ Pesos copiados a", DRIVE_WEIGHTS_DST)

# 5) config Git
subprocess.run(["git","config","--global","user.email","ggarciama-inf@upsa.es"])
subprocess.run(["git","config","--global","user.name","guillegrc"])

# 6) add / commit solo de archivos ligeros
subprocess.run(["git","add", pathlib.Path(NOTEBOOK_PATH).name, "history_bert.csv"])
try:
    subprocess.run(["git","commit","-m","BERT SST‑2: notebook + history"], check=True)
except subprocess.CalledProcessError:
    print("ℹ️ Nada nuevo que commitear")

# 7) push
token = os.getenv("GH_TOKEN") or getpass.getpass("🔑 PAT para push (vacío si no hace falta): ")
if token:
    subprocess.run(["git","remote","set-url","origin",
                    REPO_URL.replace("https://", f"https://{token}@")])

push_res = subprocess.run(["git","push","origin","main"], capture_output=True, text=True)
if push_res.returncode == 0:
    print("✅  Push completado")
else:
    print(push_res.stderr)          # muestra el motivo real
    raise RuntimeError("❌  push falló")



✔ history_bert.csv guardado
✔ Repo ya clonado


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Colab Notebooks/BERT_SST-2.ipynb'

In [10]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
