In [None]:
#pip install transformers datasets torch tokenizers


In [None]:
import requests
import os
from github import Github

# GitHub API Token (Generate from GitHub Developer Settings)
GITHUB_TOKEN = "your_github_token"

# Initialize GitHub API Client
g = Github(GITHUB_TOKEN)

# Search for Flutter repositories
query = "language:Dart flutter"
repos = g.search_repositories(query=query, sort="stars", order="desc")

# Directory to save Dart files
os.makedirs("flutter_dataset", exist_ok=True)

for repo in repos[:50]:  # Limit to 50 repositories
    print(f"Cloning: {repo.full_name}")
    try:
        contents = repo.get_contents("")
        for file in contents:
            if file.path.endswith(".dart"):
                dart_code = requests.get(file.download_url).text
                with open(f"flutter_dataset/{file.name}", "w", encoding="utf-8") as f:
                    f.write(dart_code)
    except Exception as e:
        print(f"Skipping {repo.full_name}: {e}")

print("✅ Flutter dataset collected!")


In [7]:
from datasets import load_dataset

ds = load_dataset("wraps/codegen-flutter-v1")

In [None]:
print(ds["train"].column_names)

In [2]:
import torch

In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [4]:
from datasets import DatasetDict, Dataset, load_dataset
from transformers import AutoTokenizer

# Load a tokenizer (CodeT5 is recommended for code-based tasks)
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-small")

def split_code(content):
    """
    Splits Dart code near function headers.
    Returns (input_part, output_part).
    """
    lines = content.split("\n")
    split_idx = None

    # Find a suitable split point near function headers
    for i, line in enumerate(lines):
        if "{" in line and "(" in line and "class " not in line:  # Rough function header detection
            split_idx = i + 1
            break

    if split_idx is None or split_idx >= len(lines) - 1:
        return content, ""  # No split found, use full content

    input_part = "\n".join(lines[:split_idx])
    output_part = "\n".join(lines[split_idx:])

    return input_part, output_part

def process_data(example):
    """
    Process dataset to create input-output pairs for sequence-to-sequence learning.
    """
    input_part, output_part = split_code(example["content"])

    # Tokenize input and output
    input_encodings = tokenizer(input_part, padding="max_length", truncation=True, max_length=512)
    output_encodings = tokenizer(output_part, padding="max_length", truncation=True, max_length=512)

    return {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        "decoder_input_ids": output_encodings["input_ids"][:-1],  # Shifted left
        "labels": output_encodings["input_ids"][1:],  # Shifted right
    }

# Load the dataset
dataset =load_dataset("wraps/codegen-flutter-v1")

# Apply transformation
processed_dataset = dataset["train"].map(process_data, remove_columns=["repo_id", "file_path", "content", "__index_level_0__"])

# Save processed dataset
processed_dataset.save_to_disk("processed_dart_dataset")


tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/480 [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/69.7M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/77.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/149599 [00:00<?, ? examples/s]

Map:   0%|          | 0/149599 [00:00<?, ? examples/s]

Saving the dataset (0/4 shards):   0%|          | 0/149599 [00:00<?, ? examples/s]

In [None]:
!zip -r /kaggle/working/processed_dart_dataset.zip /kaggle/working/processed_dart_dataset

In [5]:

# Ensure it's in the correct format
tokenized_datasets = {
    "train": processed_dataset.train_test_split(test_size=0.1)["train"],
    "test": processed_dataset.train_test_split(test_size=0.1)["test"]
}


In [None]:
from transformers import RobertaTokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
MODEL_NAME = "Salesforce/codet5-Small"

# Load CodeT5 model
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training arguments
training_args = TrainingArguments(
    output_dir="./codeT5-flutter",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    log_level="info",
    fp16=True,  # Enables mixed precision for speedup
    dataloader_num_workers=4
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"]
)

# Start fine-tuning
trainer.train()

# Save fine-tuned model
model.save_pretrained("./fine-tuned-codet5-flutter")
tokenizer.save_pretrained("./fine-tuned-codet5-flutter")

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Salesforce--codet5-Small/snapshots/b1ee9570c289f21b5922b9c768a1ce12957bf968/config.json
Model config T5Config {
  "_name_or_path": "/content/drive/MyDrive/CodeT5/pretrained_models/codet5_small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "bos_token_id": 1,
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 2,
  "feed_forward_proj": "relu",
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "r

Epoch,Training Loss,Validation Loss
1,0.6693,0.586586
2,0.594,0.534654
3,0.4603,0.519799



***** Running Evaluation *****
  Num examples = 14960
  Batch size = 8
Saving model checkpoint to ./codeT5-flutter/checkpoint-16830
Configuration saved in ./codeT5-flutter/checkpoint-16830/config.json
Configuration saved in ./codeT5-flutter/checkpoint-16830/generation_config.json
Model weights saved in ./codeT5-flutter/checkpoint-16830/model.safetensors

***** Running Evaluation *****
  Num examples = 14960
  Batch size = 8
Saving model checkpoint to ./codeT5-flutter/checkpoint-33660
Configuration saved in ./codeT5-flutter/checkpoint-33660/config.json
Configuration saved in ./codeT5-flutter/checkpoint-33660/generation_config.json
Model weights saved in ./codeT5-flutter/checkpoint-33660/model.safetensors
Saving model checkpoint to ./codeT5-flutter/checkpoint-50490
Configuration saved in ./codeT5-flutter/checkpoint-50490/config.json
Configuration saved in ./codeT5-flutter/checkpoint-50490/generation_config.json
Model weights saved in ./codeT5-flutter/checkpoint-50490/model.safetensors
D

('./fine-tuned-codet5-flutter/tokenizer_config.json',
 './fine-tuned-codet5-flutter/special_tokens_map.json',
 './fine-tuned-codet5-flutter/vocab.json',
 './fine-tuned-codet5-flutter/merges.txt',
 './fine-tuned-codet5-flutter/added_tokens.json',
 './fine-tuned-codet5-flutter/tokenizer.json')

In [9]:
!zip -r /kaggle/working/codeT5-flutter.zip /kaggle/working/codeT5-flutter

updating: kaggle/working/codeT5-flutter/ (stored 0%)
updating: kaggle/working/codeT5-flutter/checkpoint-50490/ (stored 0%)
updating: kaggle/working/codeT5-flutter/checkpoint-50490/model.safetensors (deflated 7%)
updating: kaggle/working/codeT5-flutter/checkpoint-50490/training_args.bin (deflated 52%)
updating: kaggle/working/codeT5-flutter/checkpoint-50490/optimizer.pt (deflated 8%)
updating: kaggle/working/codeT5-flutter/checkpoint-50490/scheduler.pt (deflated 56%)
updating: kaggle/working/codeT5-flutter/checkpoint-50490/generation_config.json (deflated 33%)
updating: kaggle/working/codeT5-flutter/checkpoint-50490/config.json (deflated 61%)
updating: kaggle/working/codeT5-flutter/checkpoint-50490/trainer_state.json (deflated 84%)
updating: kaggle/working/codeT5-flutter/checkpoint-50490/rng_state.pth (deflated 25%)
updating: kaggle/working/codeT5-flutter/checkpoint-33660/ (stored 0%)
updating: kaggle/working/codeT5-flutter/checkpoint-33660/model.safetensors (deflated 7%)
updating: kagg

In [13]:
!zip -r /kaggle/working/fine-tuned-codet5-flutter.zip /kaggle/working/fine-tuned-codet5-flutter

updating: kaggle/working/fine-tuned-codet5-flutter/ (stored 0%)
updating: kaggle/working/fine-tuned-codet5-flutter/model.safetensors (deflated 7%)
updating: kaggle/working/fine-tuned-codet5-flutter/vocab.json (deflated 59%)
updating: kaggle/working/fine-tuned-codet5-flutter/tokenizer_config.json (deflated 94%)
updating: kaggle/working/fine-tuned-codet5-flutter/generation_config.json (deflated 33%)
updating: kaggle/working/fine-tuned-codet5-flutter/tokenizer.json (deflated 82%)
updating: kaggle/working/fine-tuned-codet5-flutter/config.json (deflated 61%)
updating: kaggle/working/fine-tuned-codet5-flutter/merges.txt (deflated 54%)
updating: kaggle/working/fine-tuned-codet5-flutter/special_tokens_map.json (deflated 97%)
