In [None]:
# Install the Hugging Face libraries imported by the training cell further down.
!pip install transformers datasets

In [None]:
# datasets is already requested by the previous install cell; transformers is
# requested again here, this time with its "torch" extra.
!pip install datasets
!pip install transformers[torch]

In [None]:
!pip install sentencepiece
!pip install accelerate>=0.20.1

In [None]:
import os
import logging
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments
)
from datasets import load_dataset
from contextlib import suppress


# Setup logging
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO
)
logger = logging.getLogger(__name__)


def env_flag(name, default='False'):
    """Read a boolean setting from the environment.

    Args:
        name: Name of the environment variable.
        default: Value used when the variable is unset.

    Returns:
        True when the (case-insensitive, whitespace-stripped) value is one of
        '1', 'true', 'yes' or 'on'; False for anything else.
    """
    raw_value = str(os.getenv(name, default))
    return raw_value.strip().lower() in ('1', 'true', 'yes', 'on')


# Define parameters
# Every setting can be overridden through the environment variable of the same
# name; the second argument of each os.getenv call is the default.
MODEL_CKPT = os.getenv('MODEL_CKPT', 'microsoft/Orca-2-13b')
DATASET_NAME_OR_PATH = os.getenv('DATASET_NAME_OR_PATH', 'Open-Orca/OpenOrca')
OUTPUT_DIR = os.getenv('OUTPUT_DIR', './cdac_output')
TRAIN_BATCH_SIZE = int(os.getenv('TRAIN_BATCH_SIZE', '4'))
NUM_EPOCHS = float(os.getenv('NUM_EPOCHS', '3'))
LEARNING_RATE = float(os.getenv('LEARNING_RATE', '5e-5'))
WEIGHT_DECAY = float(os.getenv('WEIGHT_DECAY', '0.01'))
MAX_GRAD_NORM = float(os.getenv('MAX_GRAD_NORM', '1.0'))
LOGGING_STRATEGY = os.getenv('LOGGING_STRATEGY', 'steps') # or 'epoch'
LOGGING_STEPS = int(os.getenv('LOGGING_STEPS', '500'))
SAVE_STRATEGY = os.getenv('SAVE_STRATEGY', 'steps') # or 'epoch'
SAVE_STEPS = int(os.getenv('SAVE_STEPS', '1000'))
SAVE_TOTAL_LIMIT = int(os.getenv('SAVE_TOTAL_LIMIT', '2'))
# Both warmup settings are forwarded to TrainingArguments; per the transformers
# documentation a non-zero warmup_steps takes precedence over warmup_ratio.
WARMUP_STEPS = int(os.getenv('WARMUP_STEPS', '0'))
WARMUP_RATIO = float(os.getenv('WARMUP_RATIO', '0.0'))
LR_SCHEDULER_TYPE = os.getenv('LR_SCHEDULER_TYPE', 'linear')
# Parsed case-insensitively so that "true" / "1" are not silently read as False.
REMOVE_UNUSED_COLUMNS = env_flag('REMOVE_UNUSED_COLUMNS', 'True')

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Helper functions: dataset loading, training-argument construction, and training
def load_and_prepare_dataset(dataset_name_or_path):
    """Load a dataset and confirm that it provides a 'train' split.

    Args:
        dataset_name_or_path: Identifier handed unchanged to ``load_dataset``.

    Returns:
        The object returned by ``load_dataset``, once it is known to contain
        a 'train' entry.

    Raises:
        ValueError: If the loaded object has no 'train' entry.
        Exception: Any error raised while loading; it is logged and then
            re-raised unchanged.
    """
    try:
        loaded = load_dataset(dataset_name_or_path)
        has_train_split = 'train' in loaded
        if has_train_split:
            return loaded
        raise ValueError(f"'train' split does not exist in the dataset {dataset_name_or_path}")
    except Exception as load_error:
        # Record the failure in the log, then let the caller decide what to do.
        logger.error(f"Error loading dataset {dataset_name_or_path}: {load_error}")
        raise


def create_training_args(output_dir, **kwargs):
    """Build the ``TrainingArguments`` for a train-only run.

    Args:
        output_dir: Directory forwarded to ``TrainingArguments`` as ``output_dir``.
        **kwargs: Training settings.
            Required keys: train_batch_size, learning_rate, weight_decay,
            max_grad_norm, logging_strategy, logging_steps, save_strategy,
            save_steps, num_epochs, save_total_limit, warmup_steps,
            warmup_ratio, lr_scheduler_type, remove_unused_columns.
            Optional keys: eval_batch_size (defaults to train_batch_size) and
            fp16 (defaults to True).

    Returns:
        The configured ``TrainingArguments`` instance.

    Raises:
        KeyError: If one or more required keys are missing; the message lists
            all of them.
    """
    required_keys = (
        'train_batch_size', 'learning_rate', 'weight_decay', 'max_grad_norm',
        'logging_strategy', 'logging_steps', 'save_strategy', 'save_steps',
        'num_epochs', 'save_total_limit', 'warmup_steps', 'warmup_ratio',
        'lr_scheduler_type', 'remove_unused_columns',
    )
    # Report every missing setting at once instead of failing on the first one.
    missing = [key for key in required_keys if key not in kwargs]
    if missing:
        raise KeyError(f"Missing required training settings: {', '.join(missing)}")

    return TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        do_train=True,
        do_eval=False,
        do_predict=False,
        # The evaluation strategy is deliberately not passed: the keyword was
        # renamed (evaluation_strategy -> eval_strategy) in newer transformers
        # releases, and the library default already disables evaluation.
        per_device_train_batch_size=kwargs['train_batch_size'],
        per_device_eval_batch_size=kwargs.get('eval_batch_size', kwargs['train_batch_size']),
        learning_rate=kwargs['learning_rate'],
        weight_decay=kwargs['weight_decay'],
        max_grad_norm=kwargs['max_grad_norm'],
        logging_strategy=kwargs['logging_strategy'],
        logging_steps=kwargs['logging_steps'],
        save_strategy=kwargs['save_strategy'],
        save_steps=kwargs['save_steps'],
        num_train_epochs=kwargs['num_epochs'],
        save_total_limit=kwargs['save_total_limit'],
        warmup_steps=kwargs['warmup_steps'],
        warmup_ratio=kwargs['warmup_ratio'],
        lr_scheduler_type=kwargs['lr_scheduler_type'],
        remove_unused_columns=kwargs['remove_unused_columns'],
        # Mixed precision stays on by default; pass fp16=False to run on
        # hardware that does not support it.
        fp16=kwargs.get('fp16', True)
    )

def train_model(dataset, model_ckpt, training_args):
    """Fine-tune a causal language model and save it together with its tokenizer.

    Args:
        dataset: Object whose 'train' entry is passed to ``Trainer`` as the
            training dataset.
        model_ckpt: Checkpoint name or path given to
            ``AutoModelForCausalLM.from_pretrained`` and
            ``AutoTokenizer.from_pretrained``.
        training_args: Arguments handed to ``Trainer``; its ``output_dir`` is
            also where the model and tokenizer are saved.

    Raises:
        Any exception from loading, training or saving propagates to the
        caller. Errors are no longer suppressed, because a silently skipped
        run left the caller to fail later while reloading from an output
        directory that was never written.

    NOTE(review): ``dataset['train']`` is given to ``Trainer`` unchanged and no
    tokenization happens here, so the split presumably has to contain model
    inputs already - confirm against the data being used.
    """
    # Load both artefacts first so that a bad checkpoint name fails before any
    # training time has been spent.
    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
    model = AutoModelForCausalLM.from_pretrained(model_ckpt)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset['train']
    )
    trainer.train()

    # Save next to the checkpoints configured in training_args rather than to
    # a module-level global, so the function honours the arguments it is given.
    save_dir = training_args.output_dir
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

if __name__ == '__main__':
    # Pipeline: load the dataset, build the training arguments, fine-tune,
    # reload the saved model from OUTPUT_DIR and start an interactive chat.
    try:
        dataset = load_and_prepare_dataset(DATASET_NAME_OR_PATH)
        training_args = create_training_args(
            OUTPUT_DIR,
            train_batch_size=TRAIN_BATCH_SIZE,
            num_epochs=NUM_EPOCHS,
            logging_steps=LOGGING_STEPS,
            save_steps=SAVE_STEPS,
            save_total_limit=SAVE_TOTAL_LIMIT,
            learning_rate=LEARNING_RATE,
            weight_decay=WEIGHT_DECAY,
            max_grad_norm=MAX_GRAD_NORM,
            logging_strategy=LOGGING_STRATEGY,
            save_strategy=SAVE_STRATEGY,
            warmup_steps=WARMUP_STEPS,
            warmup_ratio=WARMUP_RATIO,
            lr_scheduler_type=LR_SCHEDULER_TYPE,
            remove_unused_columns=REMOVE_UNUSED_COLUMNS
        )
        train_model(dataset, MODEL_CKPT, training_args)

        # Reload what train_model saved, so the chat uses the fine-tuned weights.
        tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)
        model = AutoModelForCausalLM.from_pretrained(OUTPUT_DIR)

        def chat_with_model(prompt):
            """Generate a reply to ``prompt`` and return only the newly generated text."""
            encoded = tokenizer(prompt, return_tensors='pt', add_special_tokens=True)
            # Keep the inputs on the same device as the model weights.
            input_ids = encoded['input_ids'].to(model.device)
            attention_mask = encoded['attention_mask'].to(model.device)
            outputs = model.generate(
                input_ids,
                attention_mask=attention_mask,
                # max_new_tokens bounds the reply itself; max_length would also
                # count the prompt and leave no room after a long prompt.
                max_new_tokens=150,
                num_return_sequences=1,
                pad_token_id=tokenizer.eos_token_id
            )
            # generate() returns prompt + continuation; drop the echoed prompt.
            prompt_length = input_ids.shape[-1]
            return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        logger.info("Chat with the model (type 'quit' to exit).")
        while True:
            try:
                user_input = input('User: ')
            except (EOFError, KeyboardInterrupt):
                # Closed stdin or Ctrl-C ends the chat instead of being
                # reported as a fatal error.
                logger.info("Exiting chat.")
                break
            if user_input.strip().lower() == 'quit':
                logger.info("Exiting chat.")
                break
            if not user_input.strip():
                # Nothing to answer; ask again.
                continue
            try:
                response = chat_with_model(user_input)
                print(f'Model: {response}')
            except Exception as e:
                # A failed generation should not end the whole session.
                logger.error(f"Error during chat response generation: {e}")

    except Exception as e:
        # logger.exception keeps the traceback, which logger.error discarded.
        logger.exception(f"Fatal error: {e}")

In [None]:
# Repeats the transformers[torch] install from the setup cells at the top of the notebook.
!pip install transformers[torch]

In [None]:
# accelerate is also requested by a setup cell at the top of the notebook; no version is specified here.
!pip install accelerate

In [None]:
# Repeats the sentencepiece install from the setup cells at the top of the notebook.
!pip install sentencepiece


In [None]:
# Imported here as well so the cell does not depend on the training cell
# having been run first.
from datasets import load_dataset
from transformers import AutoTokenizer

# Maximum number of tokens per example (longer sequences are truncated).
MAX_SEQ_LENGTH = 512
# Number of leading rows of the train split to tokenize. Every row is padded
# to MAX_SEQ_LENGTH and held in a single tensor, so the whole split can be very
# memory hungry; set to None to process all rows anyway.
NUM_EXAMPLE_ROWS = 1000

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained('microsoft/Orca-2-13b')
if tokenizer.pad_token is None:
    # padding='max_length' needs a pad token; fall back to EOS when none is set.
    tokenizer.pad_token = tokenizer.eos_token

# Example dataset
split = 'train' if NUM_EXAMPLE_ROWS is None else f'train[:{NUM_EXAMPLE_ROWS}]'
dataset = load_dataset('Open-Orca/OpenOrca', split=split)

# Join the three text fields into one training string per row. They cannot be
# passed as three positional arguments: the tokenizer's third positional
# parameter is text_target (label text), not a third input segment.
# NOTE(review): plain concatenation is an assumption; switch to the model's
# prompt template if one is required.
texts = [
    '\n\n'.join(part for part in (system_prompt, question, response) if part)
    for system_prompt, question, response in zip(
        dataset['system_prompt'], dataset['question'], dataset['response']
    )
]

# Tokenize and encode the text features
tokenized = tokenizer(
    texts,
    padding='max_length',  # Pad to the maximum sequence length
    truncation=True,  # Truncate sequences longer than max_length
    max_length=MAX_SEQ_LENGTH,  # Set your desired maximum sequence length
    return_tensors='pt'  # Return PyTorch tensors
)

# Select input and output features
input_ids = tokenized['input_ids']
attention_mask = tokenized['attention_mask']
# Language-modeling labels are the inputs themselves, with padding positions
# set to -100 so that they are ignored by the loss.
labels = input_ids.clone()
labels[attention_mask == 0] = -100

# Your further processing or training code using these preprocessed inputs
