In [None]:
# Install the notebook's Python dependencies, then upgrade transformers and
# deepspeed in separate commands.
# NOTE(review): no versions are pinned, so a later run may resolve different releases.
!pip install datasets peft trl torch colorama tabulate pycryptodome pymupdf python-docx tqdm mpi4py
!pip install transformers --upgrade
!pip install deepspeed --upgrade

In [None]:
# NOTE(review): colorama is already part of the first install cell's package list,
# so this cell looks redundant.
!pip install colorama

In [None]:
# Remove the tensorflow package; -y skips the confirmation prompt.
!pip uninstall -y tensorflow

In [None]:
# Install the tensorflow-cpu distribution in place of the package removed above.
!pip install tensorflow-cpu

In [None]:
# NOTE(review): runtime.unassign() is expected to release the current Colab
# runtime. If so, a "Run all" would stop here and every later cell would need a
# fresh runtime — confirm this cell is only meant to be run manually.
from google.colab import runtime
runtime.unassign()

In [None]:
#!/usr/bin/python3

# Loads Google Drive

import os
import shutil
from google.colab import drive
from colorama import Fore, Style, init
from IPython.display import display, HTML

# Initialize colorama
init()

# Access the API key from Colab Secrets
from google.colab import userdata

# Read the 'GOOGLE_DRIVE_API' secret from Colab Secrets and export it as an
# environment variable. Any failure is reported and leaves api_key as None.
try:
    api_key = userdata.get('GOOGLE_DRIVE_API')
    print(f"{Fore.GREEN}✅ API Key loaded successfully.{Style.RESET_ALL}")

    # Set the API key as an environment variable
    os.environ['GOOGLE_DRIVE_API'] = api_key
except Exception as e:
    print(f"{Fore.RED}❌ Error loading API Key: {e}{Style.RESET_ALL}")
    api_key = None

# Mount Google Drive at /content/drive.
# NOTE(review): both branches call drive.mount() with identical arguments and the
# key is never passed to it, so the key does not take part in the mount. The only
# differences are the messages printed and that the first branch catches mount
# errors while the second lets them propagate.
if api_key:
    try:
        # Mount (or remount) Drive; errors are printed instead of raised
        drive.mount('/content/drive', force_remount=True)
        print(f"{Fore.GREEN}✅ Google Drive mounted successfully using API Key.{Style.RESET_ALL}")
    except Exception as e:
        print(f"{Fore.RED}❌ Error mounting Google Drive: {e}{Style.RESET_ALL}")
else:
    print(f"{Fore.YELLOW}⚠️ No API Key found. Please log in manually.{Style.RESET_ALL}")
    drive.mount('/content/drive', force_remount=True)

# Project folder on the mounted Drive; one end of both sync directions
drive_folder = "/content/drive/MyDrive/Colab_Projects/Phi3_FineTuning"

# Colab working directory; the other end of both sync directions.
# Note that the Drive mount point (/content/drive) lies inside this folder.
colab_folder = "/content/"

# Ensure the Colab folder exists
os.makedirs(colab_folder, exist_ok=True)

# Function to count files in a directory
def count_files(directory):
    """Return how many files live under ``directory``, subdirectories included.

    Args:
        directory: Path of the tree to walk with ``os.walk``.

    Returns:
        The total number of file entries seen; 0 when the walk yields nothing.
    """
    total = 0
    for _root, _subdirs, filenames in os.walk(directory):
        total += len(filenames)
    return total

# Function to sync files from Colab to Google Drive
def sync_to_drive():
    """Copy the Colab working folder into the Google Drive project folder with rsync.

    Reads the module-level ``colab_folder`` and ``drive_folder``. Progress and
    result messages are printed; any failure (including a non-zero rsync exit
    status) is reported through the error message rather than raised.
    """
    import subprocess  # local import so this function does not depend on the cell's import list

    try:
        print(f"{Fore.CYAN}🔄 Syncing files from Colab to Google Drive...{Style.RESET_ALL}")

        # os.path.join(path, "") guarantees exactly one trailing slash, which makes
        # rsync copy the folder's contents rather than the folder itself.
        source = os.path.join(colab_folder, "")
        target = os.path.join(drive_folder, "")

        command = ["rsync", "-av", "--progress"]

        # When the Drive folder lives inside the source folder (Drive is mounted
        # under /content), exclude the mount's top-level directory; otherwise rsync
        # would copy the whole Drive into the Drive project folder.
        relative = os.path.relpath(drive_folder, colab_folder)
        inside_source = (
            relative != os.curdir
            and relative != os.pardir
            and not relative.startswith(os.pardir + os.sep)
        )
        if inside_source:
            # A leading "/" anchors the pattern at the root of the transfer.
            command.append(f"--exclude=/{relative.split(os.sep)[0]}/")

        command += [source, target]

        # check=True turns a non-zero rsync exit status into an exception, so the
        # success message below is only printed when the copy actually worked.
        subprocess.run(command, check=True)

        print(f"{Fore.GREEN}✅ Files synced from Colab to Google Drive.{Style.RESET_ALL}")
    except Exception as e:
        print(f"{Fore.RED}❌ Error syncing to Google Drive: {e}{Style.RESET_ALL}")

# Function to sync files from Google Drive to Colab
def sync_from_drive():
    """Copy the Google Drive project folder into the Colab working folder with rsync.

    Reads the module-level ``drive_folder`` and ``colab_folder``. Prints the
    change in file count and a listing of the Colab folder afterwards. Any
    failure (including a non-zero rsync exit status) is reported through the
    error message rather than raised.
    """
    import subprocess  # local import so this function does not depend on the cell's import list

    try:
        print(f"{Fore.CYAN}🔄 Syncing files from Google Drive to Colab...{Style.RESET_ALL}")

        # Count files before sync
        initial_count = count_files(colab_folder)

        # rsync flags:
        #   -a          archive mode (recursive, preserves permissions, symlinks, etc.)
        #   -v          verbose output
        #   --progress  show progress during transfer
        # The source is passed WITHOUT a trailing slash, so rsync recreates the
        # project folder itself inside the destination instead of spilling its
        # contents directly into it.
        # check=True turns a non-zero rsync exit status into an exception, so the
        # success messages below are only printed when the copy actually worked.
        subprocess.run(
            ["rsync", "-av", "--progress", drive_folder, os.path.join(colab_folder, "")],
            check=True,
        )

        # Count files after sync
        final_count = count_files(colab_folder)
        files_transferred = final_count - initial_count

        print(f"{Fore.GREEN}✅ Sync completed!{Style.RESET_ALL}")
        print(f"{Fore.GREEN}📂 Files transferred: {files_transferred}{Style.RESET_ALL}")

        # List files in the Colab folder to confirm sync
        print(f"{Fore.BLUE}📁 Files in Colab folder:{Style.RESET_ALL}")
        for root, dirs, files in os.walk(colab_folder):
            for file in files:
                print(os.path.join(root, file))
    except Exception as e:
        print(f"{Fore.RED}❌ Error syncing from Google Drive: {e}{Style.RESET_ALL}")

# Function to display a sync button
def display_sync_button():
    """Render the sync/refresh buttons in the cell output.

    The two sync buttons call ``google.colab.kernel.invokeFunction`` with the
    names 'sync_from_drive' and 'sync_to_drive'; the third reloads the page.
    """
    controls_markup = '''
        <div>
            <button onclick="syncFromDrive()">Sync from Google Drive to Colab</button>
            <button onclick="syncToDrive()">Sync from Colab to Google Drive</button>
            <button onclick="refreshPage()">Refresh</button>
        </div>
        <script>
            function syncFromDrive() {
                google.colab.kernel.invokeFunction('sync_from_drive', [], {});
            }
            function syncToDrive() {
                google.colab.kernel.invokeFunction('sync_to_drive', [], {});
            }
            function refreshPage() {
                window.location.reload();
            }
        </script>
    '''
    controls = HTML(controls_markup)
    display(controls)

# Register the Python sync functions under the same names that the buttons'
# JavaScript passes to google.colab.kernel.invokeFunction.
from google.colab import output
output.register_callback('sync_from_drive', sync_from_drive)
output.register_callback('sync_to_drive', sync_to_drive)


# Render the buttons in this cell's output
display_sync_button()

In [None]:
import sqlite3

# Open the project's SQLite database on the mounted Drive and keep the
# connection and a cursor as notebook-level variables.
# NOTE(review): none of the visible cells closes `conn`; confirm where it is released.
conn = sqlite3.connect('/content/drive/MyDrive/Colab_Projects/Phi3_FineTuning/db/data.db')
cursor = conn.cursor()

print("Connected to Database.")

In [None]:
import torch

# Ask PyTorch to release the CUDA memory it is caching from an earlier run

torch.cuda.empty_cache()

print("CUDA cache cleared.")

In [None]:
import transformers
import deepspeed

# Show which transformers / deepspeed releases the install cells resolved to
print(f"Transformers version: {transformers.__version__}")
print(f"DeepSpeed version: {deepspeed.__version__}")

In [None]:
#!/usr/bin/python3

# Loads Model from Google Drive

from google.colab import drive
from transformers import AutoTokenizer, AutoModelForCausalLM
from accelerate import Accelerator
import os
import torch

# Clear GPU memory
torch.cuda.empty_cache()

# Mount Google Drive
drive.mount('/content/drive')

# MODEL_NAME is the identifier handed to from_pretrained() when downloading;
# DRIVE_MODEL_DIR is where the downloaded tokenizer and model are stored and
# later loaded from.
MODEL_NAME = "microsoft/phi-3-mini-4k-instruct"
DRIVE_MODEL_DIR = "/content/drive/MyDrive/Colab_Projects/Phi3_FineTuning/models"

# Ensure the save directory exists (model_files_exist() lists it with os.listdir)
os.makedirs(DRIVE_MODEL_DIR, exist_ok=True)

# Function to check if the model files exist in the directory
def model_files_exist(directory):
    """Tell whether ``directory`` already holds a complete saved model.

    The directory counts as complete when it contains at least one weight shard
    named ``model-*.safetensors`` and every tokenizer/config file listed below.

    Args:
        directory: Folder to inspect; it must exist because it is listed with
            ``os.listdir``.

    Returns:
        True when the shard and all expected files are present, otherwise False.
    """
    expected_names = (
        "tokenizer_config.json",
        "tokenizer.model",
        "tokenizer.json",
        "added_tokens.json",
        "special_tokens_map.json",
        "config.json",
        "model.safetensors.index.json",
        "generation_config.json",
    )

    # At least one sharded weight file, e.g. model-00001-of-00002.safetensors
    has_weight_shard = any(
        entry.startswith("model-") and entry.endswith(".safetensors")
        for entry in os.listdir(directory)
    )
    if not has_weight_shard:
        return False

    return all(
        os.path.exists(os.path.join(directory, name)) for name in expected_names
    )

# Function to download and save the model
def download_and_save_model():
    """Load MODEL_NAME's tokenizer and model, then store both in DRIVE_MODEL_DIR.

    Both objects are obtained through ``from_pretrained(MODEL_NAME)`` and written
    with ``save_pretrained`` — tokenizer first, model second.
    """
    print(f"Downloading model: {MODEL_NAME}...")

    # Fetch both artifacts before anything is written to Drive
    fetched_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    fetched_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

    # Persist them to Google Drive in the same order they were fetched
    for artifact in (fetched_tokenizer, fetched_model):
        artifact.save_pretrained(DRIVE_MODEL_DIR)

    print(f"Model and tokenizer saved to Google Drive: {DRIVE_MODEL_DIR}")

# Download only when the Drive folder does not already hold the full file set
if not model_files_exist(DRIVE_MODEL_DIR):
    download_and_save_model()
else:
    print(f"Model already exists in Google Drive: {DRIVE_MODEL_DIR}")

# Always (re)load from the Drive copy so later cells use the saved files
print("Loading model and tokenizer from Google Drive...")
tokenizer = AutoTokenizer.from_pretrained(DRIVE_MODEL_DIR)
model = AutoModelForCausalLM.from_pretrained(DRIVE_MODEL_DIR)

print("Model and tokenizer loaded successfully!")

In [None]:
# Print the nvidia-smi report for the current runtime
!nvidia-smi

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and model from Google Drive.
# NOTE(review): this is the same directory the model-download cell loads from
# (its DRIVE_MODEL_DIR), so running both loads the model twice.
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/Colab_Projects/Phi3_FineTuning/models")
model = AutoModelForCausalLM.from_pretrained("/content/drive/MyDrive/Colab_Projects/Phi3_FineTuning/models")

print("Model and tokenizer loaded from Google Drive.")

In [None]:
# WARNING: deletes everything inside the Drive results folder — the same path
# the training cell uses as OUTPUT_DIR for checkpoints and saved models.
!rm -rf /content/drive/MyDrive/Colab_Projects/Phi3_FineTuning/results/*

In [None]:
# Remove the torch_extensions, huggingface and deepspeed folders under /root/.cache
!rm -rf /root/.cache/torch_extensions
!rm -rf /root/.cache/huggingface
!rm -rf /root/.cache/deepspeed

In [None]:
#!/usr/bin/python3

# Loads Model from Google Drive and Starts Training on TPU.

import os
import json
import torch
from google.colab import drive
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import Dataset
from colorama import Fore, Style, init
import re
import random
from packaging import version
from IPython.display import clear_output

# Initialize colorama
init()

# Mount Google Drive
drive.mount('/content/drive')

# Paths used by this cell:
#   MODEL_NAME      - identifier passed to from_pretrained() when downloading
#   DRIVE_MODEL_DIR - Drive folder the base model is loaded from / saved to
#   OUTPUT_DIR      - TrainingArguments.output_dir and target for saved models
#   DATA_PATH       - JSON export read by load_and_preprocess_data()
MODEL_NAME = "microsoft/phi-3-mini-4k-instruct"
DRIVE_MODEL_DIR = "/content/drive/MyDrive/Colab_Projects/Phi3_FineTuning/models"
OUTPUT_DIR = "/content/drive/MyDrive/Colab_Projects/Phi3_FineTuning/results"
DATA_PATH = "/content/drive/MyDrive/Colab_Projects/Phi3_FineTuning/db/exported_data.json"

# Ensure directories exist
os.makedirs(DRIVE_MODEL_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Install torch_xla for TPU support. This has to run before the torch_xla
# imports that follow it in this cell.
print(f"{Fore.CYAN}Installing torch_xla for TPU support...{Style.RESET_ALL}")
!pip install cloud-tpu-client torch torchvision torch_xla
print(f"{Fore.GREEN}torch_xla installed successfully!{Style.RESET_ALL}")

# Import torch_xla
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl
import torch_xla.distributed.xla_multiprocessing as xmp

# Set environment variable to reduce memory fragmentation.
# NOTE(review): PYTORCH_CUDA_ALLOC_CONF is a CUDA allocator setting, while this
# cell trains through torch_xla — confirm it has any effect here.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Function to load the model with checkpointing
def load_model():
    """Load the tokenizer and model, preferring the copy saved on Google Drive.

    If DRIVE_MODEL_DIR already contains a saved model (detected by its
    ``config.json`` and ``tokenizer_config.json``), both objects are loaded from
    there. Otherwise they are loaded via ``from_pretrained(MODEL_NAME)`` and
    written to DRIVE_MODEL_DIR for future runs. Gradient checkpointing is
    enabled on the model before it is returned.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    print(f"{Fore.CYAN}Loading model from Google Drive...{Style.RESET_ALL}")

    # The directory is created with os.makedirs before this function is called,
    # so its mere existence says nothing. Check for the files save_pretrained()
    # writes; otherwise an empty folder would never take the download branch.
    saved_copy_present = all(
        os.path.isfile(os.path.join(DRIVE_MODEL_DIR, marker))
        for marker in ("config.json", "tokenizer_config.json")
    )

    if saved_copy_present:
        print(f"{Fore.GREEN}Model found in Google Drive. Loading...{Style.RESET_ALL}")
        tokenizer = AutoTokenizer.from_pretrained(DRIVE_MODEL_DIR)
        model = AutoModelForCausalLM.from_pretrained(DRIVE_MODEL_DIR)
    else:
        print(f"{Fore.YELLOW}Model not found in Google Drive. Downloading from Hugging Face...{Style.RESET_ALL}")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
        # Save the model to Google Drive for future use
        os.makedirs(DRIVE_MODEL_DIR, exist_ok=True)
        tokenizer.save_pretrained(DRIVE_MODEL_DIR)
        model.save_pretrained(DRIVE_MODEL_DIR)
        print(f"{Fore.GREEN}Model saved to Google Drive at {DRIVE_MODEL_DIR}.{Style.RESET_ALL}")

    # Enable gradient checkpointing
    model.gradient_checkpointing_enable()
    return tokenizer, model

# Load the model and tokenizer.
# NOTE(review): load_model() has no code path that returns None, so this guard
# only fires if from_pretrained() itself hands back None — confirm it is needed.
tokenizer, model = load_model()
if tokenizer is None or model is None:
    raise Exception("Failed to load the model.")

# Function to clean the text data
def clean_text(text):
    """Reduce ``text`` to ASCII letters, digits and single spaces.

    Every character that is not ``a-z``, ``A-Z``, ``0-9`` or whitespace is
    dropped, runs of whitespace collapse to one space, and leading/trailing
    whitespace is trimmed.

    Args:
        text: The string to normalise.

    Returns:
        The cleaned string; empty if nothing survives.
    """
    alphanumeric_only = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return re.sub(r'\s+', ' ', alphanumeric_only).strip()

# Load and preprocess the data
def load_and_preprocess_data(data_path, sample_size=None):
    """Read the exported JSON records and return a shuffled list of cleaned texts.

    Each record's ``"content"`` value is handled as follows:

    * if it parses as a JSON object with a string ``"text"`` field, that field
      is used;
    * in every other case (not valid JSON, JSON that is not an object, no
      usable ``"text"`` field) the raw ``"content"`` string is used.

    The chosen text is passed through ``clean_text``; records that clean down
    to an empty string are skipped.

    Args:
        data_path: Path of a JSON file holding a list of records, each with a
            ``"content"`` key.
        sample_size: Optional cap on the number of samples returned. ``None``
            (the default) keeps the whole dataset.

    Returns:
        A list of cleaned strings in random order.
    """
    print(f"{Fore.CYAN}Loading and preprocessing data...{Style.RESET_ALL}")
    with open(data_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    training_data = []
    for item in data:
        raw_content = item["content"]
        text = raw_content
        try:
            parsed = json.loads(raw_content)
        except json.JSONDecodeError:
            parsed = None

        # json.loads may also yield a number, list or string; only a JSON object
        # with a string "text" field overrides the raw content. Without the type
        # checks those inputs raised an uncaught TypeError.
        if isinstance(parsed, dict) and isinstance(parsed.get("text"), str):
            text = parsed["text"]

        cleaned_text = clean_text(text)
        if cleaned_text:
            training_data.append(cleaned_text)

    # Shuffle the data
    random.shuffle(training_data)

    # Optionally truncate to sample_size samples; None keeps everything
    if sample_size and len(training_data) > sample_size:
        training_data = training_data[:sample_size]

    print(f"{Fore.GREEN}Data loaded and preprocessed with {len(training_data)} samples.{Style.RESET_ALL}")
    return training_data

# Load and preprocess data (no sample_size given, so every usable record is kept)
training_data = load_and_preprocess_data(DATA_PATH)  # Use full dataset

# Wrap the list of strings in a Hugging Face Dataset with a single "text" column
dataset = Dataset.from_dict({"text": training_data})

# Tokenize the dataset, padding or truncating every example to 512 tokens
def tokenize_function(examples):
    """Tokenize a batch's ``"text"`` entries with the notebook-level tokenizer.

    Every example is padded to, and truncated at, 512 tokens.

    Args:
        examples: Mapping with a ``"text"`` entry, as supplied by ``Dataset.map``.

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True, max_length=512)
    return encoded

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Add labels for causal language modeling.
# Every example is padded to max_length, so copying input_ids verbatim would make
# the loss train on padding. Positions where attention_mask is 0 are set to -100,
# the default ignore_index of torch.nn.CrossEntropyLoss, so they add nothing to
# the loss.
tokenized_dataset = tokenized_dataset.map(
    lambda x: {
        "labels": [
            token_id if mask == 1 else -100
            for token_id, mask in zip(x["input_ids"], x["attention_mask"])
        ]
    }
)

# Define training arguments.
# With save_steps=10_000 a checkpoint is only written every 10 000 optimizer
# steps, and save_total_limit=2 keeps at most two of them in OUTPUT_DIR.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=8,  # Adjust batch size for TPU
    num_train_epochs=3,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir="./logs",  # relative path: resolved against the current working directory
    logging_steps=1,  # Log every step
    learning_rate=5e-5,  # Set learning rate explicitly
    adam_beta1=0.9,  # Set beta1 explicitly
    adam_beta2=0.999,  # Set beta2 explicitly
    report_to="none",  # Disable wandb logging
)

# Custom Trainer class for TPU training
class TPUTrainer(Trainer):
    """Trainer subclass that records the XLA device and computes the causal-LM loss itself."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``Trainer`` and store the XLA device on the instance."""
        super().__init__(*args, **kwargs)
        self.device = xm.xla_device()

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the next-token cross-entropy loss for one batch.

        Args:
            model: The model to run; called as ``model(**inputs)``.
            inputs: Batch mapping. ``"labels"`` is removed from it (if present)
                before the forward pass so the loss is computed here.
            return_outputs: When True, return ``(loss, outputs)`` instead of
                just the loss.
            **kwargs: Accepted and ignored, so extra keyword arguments passed by
                the caller do not break the call.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is True.

        Raises:
            ValueError: If the batch has no labels and the model output carries
                no loss either, so there is nothing to optimise.
        """
        # Pull the labels out so the model itself does not compute a loss
        labels = inputs.pop("labels", None)

        outputs = model(**inputs)

        if labels is not None:
            # CrossEntropyLoss ignores targets equal to -100 by default
            loss_fct = torch.nn.CrossEntropyLoss()

            # Shift so that the logits at position t are scored against token t+1
            shift_logits = outputs.logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()

            # Take the class count from the logits themselves: it is the size the
            # reshape below actually needs and does not require ``model`` to
            # expose a ``config`` attribute.
            vocab_size = shift_logits.size(-1)

            # Debug: Print shapes and vocab size
            print(f"{Fore.BLUE}Shift logits shape: {shift_logits.shape}{Style.RESET_ALL}")
            print(f"{Fore.BLUE}Shift labels shape: {shift_labels.shape}{Style.RESET_ALL}")
            print(f"{Fore.BLUE}Vocab size: {vocab_size}{Style.RESET_ALL}")

            # Flatten to (tokens, vocab) and (tokens,) for the loss function
            shift_logits = shift_logits.view(-1, vocab_size)
            shift_labels = shift_labels.view(-1)

            # Debug: Print reshaped shapes
            print(f"{Fore.BLUE}Reshaped shift logits shape: {shift_logits.shape}{Style.RESET_ALL}")
            print(f"{Fore.BLUE}Reshaped shift labels shape: {shift_labels.shape}{Style.RESET_ALL}")

            # Calculate loss
            loss = loss_fct(shift_logits, shift_labels)

            # NOTE(review): loss.item() copies the value back to the host on every
            # step; consider dropping these debug prints once training is stable.
            print(f"{Fore.GREEN}Loss calculated: {loss.item()}{Style.RESET_ALL}")  # Debug: Print Loss
        else:
            # No labels were supplied, so fall back to whatever loss the model
            # produced. That is normally None in this situation; fail with a
            # clear message instead of an AttributeError on ``None.item()``.
            loss = getattr(outputs, "loss", None)
            if loss is None:
                raise ValueError(
                    "compute_loss received a batch without 'labels' and the model "
                    "returned no loss; add a 'labels' column to the dataset."
                )
            print(f"{Fore.YELLOW}No labels provided. Using original loss {loss.item()}{Style.RESET_ALL}")  # Debug: Print Loss

        return (loss, outputs) if return_outputs else loss

# Initialize the Trainer with the custom class
trainer = TPUTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Clear GPU cache before training
torch.cuda.empty_cache()

# Train the model. An interrupted run is saved to 'interrupted_model'; a completed
# run is saved to 'final_model'. Without the explicit save after train(), a run
# shorter than save_steps would finish without writing any model to Drive.
try:
    print(f"{Fore.CYAN}=== Starting Training ==={Style.RESET_ALL}")
    trainer.train()
except KeyboardInterrupt:
    print(f"{Fore.YELLOW}Training interrupted by the user. Saving the model...{Style.RESET_ALL}")
    trainer.save_model(f"{OUTPUT_DIR}/interrupted_model")
    print(f"{Fore.GREEN}Model saved to '{OUTPUT_DIR}/interrupted_model'.{Style.RESET_ALL}")
else:
    # Reached only when train() returned normally
    trainer.save_model(f"{OUTPUT_DIR}/final_model")
    # The trainer was not given the tokenizer, so store it next to the weights
    tokenizer.save_pretrained(f"{OUTPUT_DIR}/final_model")
    print(f"{Fore.GREEN}Training complete. Model saved to '{OUTPUT_DIR}/final_model'.{Style.RESET_ALL}")

In [None]:
from google.colab import files
import shutil

# Create fine_tuned_phi3.zip in the current working directory from the given folder.
# NOTE(review): the folder here is '.../Phi3_FineTuning/model', while the other
# cells use '.../models' (base model) and '.../results' (training output) —
# confirm this path exists and is the intended one.
shutil.make_archive("fine_tuned_phi3", 'zip', "/content/drive/MyDrive/Colab_Projects/Phi3_FineTuning/model")

# Download the zip file
files.download("fine_tuned_phi3.zip")

In [None]:
# Report the runtime's total RAM (virtual_memory().total, scaled by 1e9)
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this cell uses to call the runtime "high-RAM"
if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

# Capture nvidia-smi's output lines and join them into one string
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)

# The word 'failed' in that output is taken to mean no GPU is attached
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

# NOTE(review): the remaining lines repeat the RAM report and nvidia-smi call above.
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print(f"Your runtime has {ram_gb:.1f} gigabytes of available RAM")

!nvidia-smi

In [None]:
# prompt: A nice welcome banner

import colorama
from colorama import Fore, Style

colorama.init()

def print_banner():
  """Print the green ASCII-art banner followed by a yellow status line."""
  logo = r"""
  ██████╗ ██╗   ██╗██╗   ██╗███████╗██████╗
  ██╔══██╗██║   ██║██║   ██║██╔════╝██╔══██╗
  ██████╔╝██║   ██║██║   ██║█████╗  ██████╔╝
  ██╔═══╝ ██║   ██║╚██╗ ██╔╝██╔══╝  ██╔══██╗
  ██║     ╚██████╔╝ ╚████╔╝ ███████╗██║  ██║
  ╚═╝      ╚═════╝   ╚═══╝  ╚══════╝╚═╝  ╚═╝
  """

  # Colour codes go in front of the text, the reset code after it
  styled_logo = "".join((Fore.GREEN, Style.BRIGHT, logo, Style.RESET_ALL))
  status_line = "".join((Fore.YELLOW, "Dependencies installed successfully!", Style.RESET_ALL))

  print(styled_logo)
  print(status_line)


print_banner()

In [None]:
import os

# Create the working folders (relative to the current directory); folders that
# already exist are left untouched.
for folder in ("inputs", "outputs/phi3_converted", "db", "db/backups"):
    os.makedirs(folder, exist_ok=True)

print("Folder structure created.")

In [None]:
# Load a pre-trained model.
# NOTE(review): AutoTokenizer and AutoModelForCausalLM are not imported in this
# cell; it only works after a cell that imports them from transformers has run.
model_name = "microsoft/phi-3-mini-4k-instruct"  # Replace with your desired model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Or, define your custom PyTorch model
# class MyModel(torch.nn.Module):
#     # ... your model definition ...
# model = MyModel()

In [None]:
# Remove transformers and deepspeed without prompting.
# NOTE(review): the next cell runs training_script.py; if that script needs these
# packages they must be reinstalled first — confirm the intended order.
!pip uninstall -y transformers
!pip uninstall -y deepspeed

In [None]:
# Run the project's training_script.py from the mounted Drive as a shell command
!python /content/drive/MyDrive/Colab_Projects/Phi3_FineTuning/training_script.py

In [None]:
# Run the project's database.py from the mounted Drive as a shell command
!python /content/drive/MyDrive/Colab_Projects/Phi3_FineTuning/database.py

In [None]:
# The `Crypto` and `docx` modules are provided by the pycryptodome and python-docx
# distributions. The PyPI packages literally named "Crypto", "docx" and
# "exceptions" are different projects; installing "docx" would shadow
# python-docx's module of the same name.
%pip install pycryptodome python-docx

In [None]:
# Run the project's convert_to_phi3_unified.py from the mounted Drive as a shell command
!python /content/drive/MyDrive/Colab_Projects/Phi3_FineTuning/convert_to_phi3_unified.py