In [None]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget so this notebook session can authenticate.
# NOTE(review): nothing visible in this notebook pushes to the Hub; presumably the
# login is needed to read the hf:// dataset path used below — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import re

import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

def remove_html_tags(text):
    """Strip angle-bracket tags and a leading ``{html}`` marker from a string.

    Args:
        text: Value to clean. Only ``str`` inputs are processed.

    Returns:
        The cleaned string with surrounding whitespace trimmed, or ``text``
        itself (unchanged) when it is not a ``str`` (e.g. ``None`` or NaN).
    """
    if not isinstance(text, str):
        return text

    # Drop anything of the form <...>.
    without_tags = re.sub(r'<[^>]+>', '', text)
    # The braces do not form a valid quantifier, so this matches the literal
    # text "{html}", and only when it sits at the very start of the string.
    without_marker = re.sub(r'^{html}', '', without_tags)
    return without_marker.strip()

def remove_urls(text):
    """Delete http/https URLs from a string.

    Args:
        text: Value to clean. Only ``str`` inputs are processed.

    Returns:
        The string with every matched URL removed and surrounding whitespace
        trimmed, or ``text`` itself (unchanged) when it is not a ``str``.
    """
    if not isinstance(text, str):
        return text

    # Scheme followed by one or more URL-ish characters or %XX escapes.
    url_regex = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    return url_regex.sub('', text).strip()

# Load the story point CSV from the Hugging Face Hub.
df = pd.read_csv("hf://datasets/giseldo/deep-se/deep-se.csv")
#df = pd.read_csv("dataset/deep-se.csv")

# Drop rows whose story point is 0 and rows missing the label or either text field.
# The explicit .copy() yields an independent frame, so the column assignments
# below no longer trigger pandas' SettingWithCopyWarning.
df = (
    df[df['storypoint'] != 0]
    .dropna(subset=['storypoint', 'title', 'description'])
    .copy()
)

# Clean both text columns: strip HTML tags first, then URLs.
for text_column in ('title', 'description'):
    df[text_column] = df[text_column].apply(remove_html_tags).apply(remove_urls)

# Model input is the title followed by the description.
df['context'] = df['title'] + " " + df['description']

# Keep only the first 70% of the rows (file order).
# NOTE(review): the remaining 30% is discarded here — presumably held out for a
# later evaluation; confirm the intent.
split_idx = int(len(df) * 0.7)
df = df.iloc[:split_idx]

# Single, seeded 80/20 split. Previously this split was concatenated back together
# and re-split by `Dataset.train_test_split` without a seed, which discarded the
# seeded split and made the train/test partition differ on every run.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# preserve_index=False keeps the pandas index out of the dataset columns.
dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df, preserve_index=False),
    "test": Dataset.from_pandas(test_df, preserve_index=False),
})

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess(example):
    """Tokenize the ``context`` field of a dataset example.

    Args:
        example: Mapping with a ``"context"`` entry holding the text to encode.

    Returns:
        The tokenizer output for that text, truncated and padded to exactly
        128 tokens.
    """
    context_text = example["context"]
    encoded = tokenizer(
        context_text,
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    return encoded

# Tokenize in batches: `preprocess` hands the "context" values straight to the
# tokenizer, which accepts a list of strings, so batching only speeds this up.
tokenized_datasets = dataset.map(preprocess, batched=True)

# Expose the story point as a float "labels" column (regression target) and drop
# the raw columns that are no longer needed.
tokenized_datasets = tokenized_datasets.map(
    lambda x: {"labels": float(x["storypoint"])},
    remove_columns=["storypoint", "context"],
)

# num_labels=1 gives the classification head a single output used as the prediction.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",  # must match eval_strategy because load_best_model_at_end=True
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="mse",
    # MSE is an error: lower is better. Without this flag the Trainer assumes
    # "greater is better" for any metric whose name does not end in "loss" and
    # would reload the checkpoint with the HIGHEST mse at the end of training.
    greater_is_better=False,
)

from sklearn.metrics import mean_squared_error

def compute_metrics(eval_pred):
    """Compute the mean squared error for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)`` as handed over by the Trainer.

    Returns:
        A dict with a single ``"mse"`` entry.
    """
    predictions, labels = eval_pred
    # Flatten to 1-D. Unlike squeeze(), reshape(-1) keeps a one-sample evaluation
    # as a length-1 array instead of a 0-d scalar, which mean_squared_error rejects.
    predictions = predictions.reshape(-1)
    return {"mse": mean_squared_error(labels, predictions)}

# Wire the model, training configuration, tokenized splits and metric together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    # `processing_class` replaces the deprecated `tokenizer` keyword, which
    # emitted a warning on this call.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

# Persist the fine-tuned model so it can be reloaded for inference.
trainer.save_model("./story_point_predictor")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['title'] = df['title'].apply(remove_html_tags)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['description'] = df['description'].apply(remove_html_tags)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['title'] = df['title'].apply(remove_urls)
A value is trying to be set on a copy of a slice

Map:   0%|          | 0/11805 [00:00<?, ? examples/s]

Map:   0%|          | 0/2952 [00:00<?, ? examples/s]

Map:   0%|          | 0/11805 [00:00<?, ? examples/s]

Map:   0%|          | 0/2952 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
[34m[1mwandb[0m: Currently logged in as: [33mgiseldo[0m ([33mgiseldo-instituto-federal-de-alagoas[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Mse
1,140.1779,102.263214,102.263214
2,103.9606,90.528061,90.528053
3,64.4704,88.363815,88.363815
4,57.7102,86.906853,86.906853


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

# Reload the fine-tuned model from disk and put it in evaluation mode.
model = AutoModelForSequenceClassification.from_pretrained("./story_point_predictor")
model.eval()

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

exemplo_texto = "Implementar autenticação de usuários com JWT e refresh token"

# Encode the example with the same truncation/padding settings used for training.
inputs = tokenizer(
    exemplo_texto,
    max_length=128,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)

# Gradients are not needed for a forward-only prediction.
with torch.no_grad():
    outputs = model(**inputs)

# The logits are read as a single scalar: the predicted story points.
predicted_story_points = outputs.logits.item()

print(
    f"\nExemplo de inferência:",
    f"Texto: {exemplo_texto}",
    f"Story Points previstos: {predicted_story_points:.2f}",
    sep="\n",
)
