In [None]:
# Install necessary libraries
!pip install sentence-transformers datasets huggingface_hub pandas scikit-learn

In [None]:
# Install necessary libraries
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
!pip install sentence-transformers datasets huggingface_hub pandas scikit-learn accelerate transformers[torch] -U

In [None]:
!pip install accelerate -U

In [None]:
# Import required libraries
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from huggingface_hub import notebook_login
import pandas as pd
import torch

# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using device: {device}')

# Read the fine-tuning pairs from disk and preview the first rows
df = pd.read_csv("ai_regulation_finetuning_dataset.csv")
print(df.head())

# Required columns, each paired with the message raised when it is absent
required_columns = {
    'question': "The dataset must have a 'question' column.",
    'content': "The dataset must have a 'content' column.",
    'score': "The dataset must have a 'score' column representing similarity scores.",
}
for column_name, missing_message in required_columns.items():
    assert column_name in df.columns, missing_message

# Cast the similarity labels to float
df = df.astype({'score': float})

# Split the dataset into train, validation, and test sets (80/10/10 split).
# The fixed random_state keeps the partition identical across runs.
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Every row must land in exactly one of the three partitions
assert len(train_df) + len(val_df) + len(test_df) == len(df)


def _to_input_examples(frame):
    """Turn a DataFrame of scored pairs into SentenceTransformers examples.

    Args:
        frame: DataFrame with 'question', 'content' and 'score' columns.

    Returns:
        A list with one InputExample per row, in row order, whose texts are
        [question, content] and whose label is the row's score as a float.
    """
    # Walking the three columns in lockstep avoids building a Series per row,
    # which is what DataFrame.iterrows() does.
    return [
        InputExample(texts=[question, content], label=float(score))
        for question, content, score in zip(frame['question'], frame['content'], frame['score'])
    ]


# Prepare the dataset for SentenceTransformers
# Each InputExample includes texts and a label (similarity score)
train_samples = _to_input_examples(train_df)
val_samples = _to_input_examples(val_df)
test_samples = _to_input_examples(test_df)

# Base checkpoint: the Snowflake arctic-embed-m model, loaded through SentenceTransformers
model_id = "Snowflake/snowflake-arctic-embed-m"
model = SentenceTransformer(model_id, device=device)

# Training objective: shuffled batches of 16 training pairs scored with
# the cosine-similarity loss
train_dataloader = DataLoader(train_samples, batch_size=16, shuffle=True)
train_loss = losses.CosineSimilarityLoss(model=model)
train_objective = (train_dataloader, train_loss)

# Evaluator built from the validation pairs
evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(val_samples, name='val')

# Training schedule: warm up over the first 10% of all training steps
num_epochs = 3
total_steps = len(train_dataloader) * num_epochs
warmup_steps = int(total_steps * 0.1)

# Fine-tune the model
fit_kwargs = {
    "train_objectives": [train_objective],
    "evaluator": evaluator,
    "epochs": num_epochs,
    "warmup_steps": warmup_steps,
    "evaluation_steps": 100,  # Adjust based on dataset size
    "output_path": "./snowflake-arctic-embed-m-finetuned",
}
model.fit(**fit_kwargs)

# Directory that receives both the saved model and the test-set evaluation output
finetuned_dir = "./snowflake-arctic-embed-m-finetuned"

# Persist the fine-tuned model locally.
# NOTE(review): fit() above was given this same path as output_path together
# with an evaluator — confirm that overwriting whatever it wrote there is intended.
model.save(finetuned_dir)

# Score the model on the held-out test pairs; the call stays last in the cell
# so that its return value is shown as the cell output
test_evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='test')
test_evaluator(model, output_path=finetuned_dir)


In [None]:
# Authenticate with the Hugging Face Hub using an access token
# (interactive alternative: notebook_login())

import getpass
import os

# Prompt without echoing so the token never appears in the notebook output;
# strip stray whitespace that often comes along when a token is pasted.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass.getpass('Enter your Hugging Face access token: ').strip()

# huggingface_hub looks for the token in HF_TOKEN, not in
# HUGGINGFACEHUB_API_TOKEN, so mirror the value there; the original variable
# stays set for any code that already reads it.
os.environ["HF_TOKEN"] = os.environ["HUGGINGFACEHUB_API_TOKEN"]




In [12]:
from huggingface_hub import notebook_login

# Open the interactive Hugging Face login widget in the notebook
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [13]:
# Publish the fine-tuned model to the Hugging Face Hub under this repository id;
# the call stays last in the cell so its return value is shown as the output
hub_repo_id = "gmedrano/snowflake-arctic-embed-m-finetuned"
model.push_to_hub(hub_repo_id)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

'https://huggingface.co/gmedrano/snowflake-arctic-embed-m-finetuned/commit/ef5dd989eebc5abbdb48b04229cb4685c5e66e8f'