In [5]:
!pip install datasets



In [6]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
%load_ext cudf.pandas

In [7]:
import pandas as pd

# Login using e.g. `huggingface-cli login` to access this dataset
df = pd.read_csv("hf://datasets/giseldo/deep-se/deep-se.csv")

df.to_csv("deep-se.csv", index=False)

In [9]:
import pandas as pd
import json
from pathlib import Path
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import Dataset
import re

def remove_html_tags(text):
    if isinstance(text, str):
        # Remove HTML tags
        clean_text = re.sub(r'<[^>]+>', '', text)
        # Remove {html} prefix if present
        clean_text = re.sub(r'^{html}', '', clean_text)
        return clean_text.strip()
    return text

def remove_urls(text):
    if isinstance(text, str):
        # Remove URLs
        url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
        clean_text = re.sub(url_pattern, '', text)
        return clean_text.strip()
    return text

def transform_data_for_fine_tuning(input_file: str, output_file: str, sample_size: int = 100):
    """
    Transform the dataset into the format required for fine-tuning.
    The format will be a JSONL file where each line contains a prompt and completion.

    Args:
        input_file (str): Path to the input CSV file
        output_file (str): Path to save the output JSONL file
        sample_size (int): Number of samples to use for fine-tuning (default: 100)
    """
    # Read the dataset
    df = pd.read_csv(input_file)

    df = df.dropna()

    # Take a random sample of the data
    df = df.sample(n=min(sample_size, len(df)), random_state=42)

    # Remove HTML tags from title and description columns
    df['title'] = df['title'].apply(remove_html_tags)
    df['description'] = df['description'].apply(remove_html_tags)
    # Remove URLs from title and description columns
    df['title'] = df['title'].apply(remove_urls)
    df['description'] = df['description'].apply(remove_urls)

    # Create the fine-tuning dataset
    fine_tuning_data = []

    for _, row in df.iterrows():
        # Create the prompt with the user story description
        prompt = f"Estimate the story points for this user story:\n{row['description']}\n\nStory points:"

        # The completion will be just the story points number
        completion = str(row['storypoint'])

        # Create the training example
        example = {
            "text": prompt + completion
        }

        fine_tuning_data.append(example)

    # Save the transformed data
    with open(output_file, 'w', encoding='utf-8') as f:
        for example in fine_tuning_data:
            f.write(json.dumps(example, ensure_ascii=False) + '\n')

def fine_tune_model(training_file: str, model_name: str = "google/gemma-2b"):
    """
    Fine-tune the model using Hugging Face transformers
    """
    # Load the model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Set padding token if not set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load the training data
    with open(training_file, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f]

    # Convert to Hugging Face dataset
    dataset = Dataset.from_list(data)

    # Tokenize the dataset
    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )

    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset.column_names
    )

    # Create data collator
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        save_steps=500,
        save_total_limit=2,
        logging_steps=100,
        learning_rate=2e-5,
        weight_decay=0.01,
        warmup_steps=500,
        gradient_accumulation_steps=4,
        fp16=True if torch.cuda.is_available() else False,
    )

    # Create trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
    )

    # Start training
    print("Starting fine-tuning...")
    trainer.train()

    # Save the fine-tuned model
    model_save_path = Path("model_ajuste_fino/fine_tuned_model")
    model_save_path.mkdir(parents=True, exist_ok=True)
    trainer.save_model(str(model_save_path))
    tokenizer.save_pretrained(str(model_save_path))
    print(f"Fine-tuned model saved to {model_save_path}")

def main():
    # Define file paths
    input_file = Path("deep-se.csv")
    output_file = Path("model_ajuste_fino/fine_tuning_data.jsonl")

    # Create output directory if it doesn't exist
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Transform the data
    print("Transforming data for fine-tuning...")
    transform_data_for_fine_tuning(input_file, output_file)

    # Fine-tune the model
    print("Starting model fine-tuning...")
    fine_tune_model(str(output_file))

if __name__ == "__main__":
    main()


Transforming data for fine-tuning...
Starting model fine-tuning...


config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]



Starting fine-tuning...


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mgiseldo[0m ([33mgiseldo-instituto-federal-de-alagoas[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.95 GiB. GPU 0 has a total capacity of 39.56 GiB of which 1.49 GiB is free. Process 6128 has 38.06 GiB memory in use. Of the allocated memory 37.36 GiB is allocated by PyTorch, and 196.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)