In [1]:
import os
import psutil
import kaggle
import tensorflow as tf
from itertools import chain
from datasets import load_dataset
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from transformers import AutoTokenizer, TFAutoModelForCausalLM, DataCollatorWithPadding

# --- Training hyper-parameters -------------------------------------------
NUM_EPOCHS = 3
BATCH_SIZE = 8
BLOCK_SIZE = 512

# --- Model / data locations ----------------------------------------------
MODEL_CHECKPOINT = 'distilgpt2'
KAGGLE_DS_DIR = 'kaggle_dataset'

# --- Host resources ------------------------------------------------------
# Logical CPU count as reported by psutil.
CPU_COUNT = psutil.cpu_count()

# Print a short summary of the runtime environment for the notebook log.
print(
    "Tensorflow verions:",
    tf.__version__,
)
print(
    'Number of CPUs:',
    CPU_COUNT,
)
print(
    'Available GPUs:',
    tf.config.experimental.list_physical_devices('GPU'),
)

  from .autonotebook import tqdm as notebook_tqdm


Tensorflow verions: 2.4.0
Number of CPUs: 8
Available GPUs: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
# Shell out to the Hugging Face CLI to show which account is currently logged in.
!huggingface-cli whoami

cuneyttyler


In [5]:
# Hub identifier of the checkpoint whose tokenizer is requested below.
# NOTE(review): the recorded traceback for this cell names
# 'meta-llama/Llama-2-7b-chat-hf', not this identifier, so the saved output
# looks stale relative to the code - re-run to confirm.
MODEL_NAME = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_auth_token=True,  # authenticate the request with the logged-in account
)

OSError: meta-llama/Llama-2-7b-chat-hf is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`.

In [9]:
# Raw string: this Windows path is full of backslashes, and sequences such as
# "\D", "\J", "\m" and "\l" are invalid escapes in a normal string literal
# (deprecated, and silently wrong as soon as a component starts with n, t, ...).
MODEL_PATH = r"D:\Dev\Jupyter Projects\models\llama-2-7b-chat.Q3_K_S.gguf"
# NOTE(review): the recorded run raised HFValidationError because this value was
# validated as a Hub repo id. Presumably from_pretrained needs a model directory
# (or repo id) rather than a path to a single .gguf file - confirm against the
# installed transformers version.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)

HFValidationError: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' cannot start or end the name, max length is 96: 'D:\Dev\Jupyter Projects\models\llama-2-7b-chat.Q3_K_S.gguf'.

In [None]:
# Fetch and unzip the Kaggle dataset only when its directory is not present yet.
if not os.path.exists(KAGGLE_DS_DIR):
    kaggle.api.dataset_download_files(
        'simiotic/github-code-snippets-development-sample',
        path=KAGGLE_DS_DIR,
        unzip=True,
    )

# Build the raw dataset through the local loading script.
raw_dataset = load_dataset('./sql_loading_script.py')

# When the script provides no validation split, carve one out of "train":
# the first 5% becomes "validation" and the remaining 95% replaces "train".
if "validation" not in raw_dataset:
    raw_dataset.update(
        validation=load_dataset('./sql_loading_script.py', split="train[:5%]"),
        train=load_dataset('./sql_loading_script.py', split="train[5%:]"),
    )

# Load the tokenizer and the TensorFlow causal-LM weights for MODEL_CHECKPOINT.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Use the end-of-sequence token as the padding token so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

# Chunk length used when grouping tokens: honour the configured BLOCK_SIZE but
# never exceed what the tokenizer reports as the model maximum.
max_seq_length = min(BLOCK_SIZE, tokenizer.model_max_length)

model = TFAutoModelForCausalLM.from_pretrained(MODEL_CHECKPOINT)
data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")

# Keep the embedding matrix in sync with the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))


def tokenize_funcion(examples):
    """Tokenize the ``'text'`` field of ``examples`` with the module-level tokenizer.

    Truncation is enabled; the tokenizer's output is returned unchanged.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True)
    return encoded


def group_texts(examples, block_size=None):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size chunks.

    Args:
        examples: mapping of column name -> list of per-example lists
            (e.g. ``input_ids``, ``attention_mask``).
        block_size: chunk length. Defaults to the module-level
            ``max_seq_length`` when omitted.

    Returns:
        A dict with the same columns, each holding a list of chunks of
        ``block_size`` items. When the batch holds at least ``block_size``
        items, the trailing remainder is dropped; a batch shorter than
        ``block_size`` is kept as a single short chunk. If ``input_ids`` is
        present, a ``labels`` column containing a copy of it is added so the
        result can be used for causal language-model training.
    """
    if block_size is None:
        block_size = max_seq_length

    columns = list(examples.keys())
    if not columns:
        # Nothing to group; avoid indexing into an empty key list.
        return {}

    # Flatten every column into one long sequence.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in columns}
    total_length = len(concatenated_examples[columns[0]])

    # Drop the small remainder so every chunk has exactly block_size items.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size

    # Split by chunks of block_size.
    result = {
        k: [t[i: i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }

    # Provide the label column requested downstream via label_cols=["labels"].
    if "input_ids" in result:
        result["labels"] = [list(chunk) for chunk in result["input_ids"]]
    return result


# Two passes over the raw dataset, both batched and spread over CPU_COUNT
# workers: first tokenize (dropping the original "text" column), then regroup
# the tokens into fixed-size chunks.
tokenized_ds = (
    raw_dataset
    .map(
        tokenize_funcion,
        batched=True,
        num_proc=CPU_COUNT,
        remove_columns=["text"],
    )
    .map(
        group_texts,
        batched=True,
        num_proc=CPU_COUNT,
    )
)

def _as_tf_dataset(split, shuffle):
    """Convert one tokenized split into a batched ``tf.data.Dataset``.

    Only the model-input columns that actually exist in ``split`` are
    requested, so a tokenizer that emits no ``token_type_ids`` does not break
    the conversion. Batches use the notebook-wide ``BATCH_SIZE`` and incomplete
    final batches are dropped.
    """
    feature_cols = [
        col
        for col in ("input_ids", "attention_mask", "token_type_ids")
        if col in split.column_names
    ]
    return split.to_tf_dataset(
        columns=feature_cols,
        label_cols=["labels"],
        shuffle=shuffle,
        collate_fn=data_collator,
        batch_size=BATCH_SIZE,
        drop_remainder=True,
    )


# Training data is shuffled; validation data keeps a fixed order, since
# shuffling brings no benefit for evaluation.
tf_ds = _as_tf_dataset(tokenized_ds['train'], shuffle=True)
eval_ds = _as_tf_dataset(tokenized_ds['validation'], shuffle=False)

# tf_ds is already batched, so len(tf_ds) is the number of optimizer steps in
# one epoch; the schedule decays linearly to zero over the whole run.
num_train_steps = len(tf_ds) * NUM_EPOCHS
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5,
    end_learning_rate=0.0,
    decay_steps=num_train_steps,
)
opt = Adam(learning_rate=lr_scheduler)

model.compile(
    optimizer=opt,
    loss=SparseCategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.NONE),
    metrics=['accuracy'],
)

# No steps_per_epoch: Keras iterates the full (already batched) dataset each
# epoch. Dividing len(tf_ds) by BATCH_SIZE again would train on only a fraction
# of the batches per epoch and leave the learning-rate schedule unfinished.
model.fit(tf_ds, validation_data=eval_ds, epochs=NUM_EPOCHS)