# Step-by-Step Example: Fine-Tuning BERT on Azure ML


#### Step 1: Set Up Azure Machine Learning Workspace

1. **Create an Azure ML Workspace**:
   - Go to the [Azure Portal](https://portal.azure.com/).
   - Create a new resource group if you don't have one.
   - Search for "Machine Learning" and create a new Azure Machine Learning workspace.

2. **Install Azure ML SDK**:
   - Install the Azure ML SDK on your local machine or in an Azure ML Notebook.

In [None]:
# Install the Azure ML Python packages; the cells below import from `azureml.core`.
pip install azureml-core azureml-sdk azureml-widgets

#### Step 2: Configure the Workspace

In [None]:
from azureml.core import Workspace

# Connect to the workspace via Workspace.from_config()
ws = Workspace.from_config()

# Show the workspace name, resource group, location, and subscription ID, one per line
for workspace_detail in (ws.name, ws.resource_group, ws.location, ws.subscription_id):
    print(workspace_detail)

#### Step 3: Prepare the Data

1. **Upload Data to Azure Blob Storage**:
   - Upload your dataset (e.g., a CSV file) to Azure Blob Storage. The training script in Step 5 expects the file to contain a `text` column and a `label` column.

In [None]:
from azureml.core import Datastore, Dataset

# Get the default datastore
datastore = ws.get_default_datastore()

# Upload the local data/ folder to datasets/ on the datastore
datastore.upload(src_dir='data/', target_path='datasets/', overwrite=True)

# Create a FileDataset (not a TabularDataset): the training job mounts the
# dataset and train.py reads it from a file path with pandas, and mounting is
# only available for file datasets. Because the dataset points at a single
# file, the mount path handed to the script is the path of that CSV file.
dataset = Dataset.File.from_files(path=(datastore, 'datasets/your_dataset.csv'))

#### Step 4: Create a Compute Cluster

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Cluster settings: name, VM size, and autoscale range
compute_name = "cpu-cluster"
compute_min_nodes = 0
compute_max_nodes = 4
vm_size = "STANDARD_D2_V2"

# Reuse the cluster if it already exists in the workspace; otherwise provision it
try:
    compute_target = ComputeTarget(workspace=ws, name=compute_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size=vm_size,
        min_nodes=compute_min_nodes,
        max_nodes=compute_max_nodes,
    )
    compute_target = ComputeTarget.create(
        workspace=ws,
        name=compute_name,
        provisioning_configuration=compute_config,
    )
else:
    print("Found existing compute target.")

# Block until the cluster is ready, streaming provisioning output
compute_target.wait_for_completion(show_output=True)

#### Step 5: Define the Training Script

In [None]:
# train.py
import argparse
import os

import pandas as pd
import torch
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, BertTokenizer

class _EncodedTextDataset(torch.utils.data.Dataset):
    """Expose tokenizer encodings and labels as per-example tensor dicts."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # One dict per example: every encoding field plus the 'labels' entry
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


def _resolve_csv_path(data_path):
    """Return the CSV file to read for *data_path*.

    A file path is returned unchanged. A directory (for example a mounted
    folder) is accepted only when it contains exactly one ``.csv`` file.

    Raises:
        FileNotFoundError: If *data_path* is a directory that does not contain
            exactly one CSV file.
    """
    if not os.path.isdir(data_path):
        return data_path

    csv_files = sorted(
        os.path.join(data_path, name)
        for name in os.listdir(data_path)
        if name.lower().endswith('.csv')
    )
    if len(csv_files) != 1:
        raise FileNotFoundError(
            f"Expected exactly one CSV file in {data_path!r}, found {len(csv_files)}"
        )
    return csv_files[0]


def main(args):
    """Fine-tune bert-base-uncased on a CSV of texts and labels, then save it.

    Args:
        args: Parsed command-line arguments with two attributes:
            ``data_path`` (CSV file, or a folder holding a single CSV, with
            ``text`` and ``label`` columns) and ``output_dir`` (folder the
            fine-tuned model is written to).
    """
    # Load dataset
    df = pd.read_csv(_resolve_csv_path(args.data_path))
    texts = df['text'].tolist()
    labels = df['label'].tolist()

    # Tokenize data
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    encodings = tokenizer(texts, truncation=True, padding=True)

    # Create dataset
    dataset = _EncodedTextDataset(encodings, labels)

    # Load model with a two-class classification head
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
    )

    # Train model
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
    )

    trainer.train()

    # Save model
    model.save_pretrained(args.output_dir)

if __name__ == "__main__":
    # Both paths are mandatory: without them main() would receive None and
    # fail later with an unrelated-looking error instead of a usage message.
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', type=str, required=True,
                        help='Path to the training data')
    parser.add_argument('--output_dir', type=str, required=True,
                        help='Path to save the trained model')
    args = parser.parse_args()
    main(args)

#### Step 6: Create an Environment

In [None]:
from azureml.core import Environment

# Build the 'bert-env' environment from the conda specification in environment.yml
env = Environment.from_conda_specification(
    name='bert-env',
    file_path='environment.yml',
)

# environment.yml
name: bert-env
channels:
  - defaults
dependencies:
  - python=3.8
  # pip itself must be a conda dependency so the pip section below can be installed
  - pip
  - pip:
    # Required because this environment is also used to deploy the model as a web service
    - azureml-defaults
    - transformers
    - torch
    - pandas
    - scikit-learn

#### Step 7: Submit the Training Job

In [None]:
from azureml.core import ScriptRunConfig, Experiment

# Arguments forwarded to train.py: the mounted dataset and the model output folder
script_arguments = [
    '--data_path', dataset.as_named_input('input').as_mount(),
    '--output_dir', './outputs',
]

# Describe what to run (script and arguments) and where (compute target and environment)
src = ScriptRunConfig(
    source_directory='./scripts',
    script='train.py',
    arguments=script_arguments,
    compute_target=compute_target,
    environment=env,
)

# Submit under the 'bert-fine-tuning' experiment and stream output until the run finishes
experiment = Experiment(workspace=ws, name='bert-fine-tuning')
run = experiment.submit(src)
run.wait_for_completion(show_output=True)

#### Step 8: Deploy the Model

In [None]:
from azureml.core.model import Model
from azureml.core.webservice import AciWebservice, Webservice
from azureml.core.model import InferenceConfig

# Register the model from the training run's outputs.
# The job ran on the remote compute cluster, so the files train.py wrote to
# ./outputs are stored with the run, not in a local ./outputs folder.
model = run.register_model(model_name='bert-model', model_path='outputs')

# Define inference configuration (score.py must exist before deploying)
inference_config = InferenceConfig(entry_script='score.py', environment=env)

# Define deployment configuration.
# BERT-base weights plus the PyTorch runtime do not fit comfortably in 1 GB,
# so the container is given more memory.
aci_config = AciWebservice.deploy_configuration(cpu_cores=1, memory_gb=4)

# Deploy the model as an Azure Container Instances web service
service = Model.deploy(workspace=ws,
                       name='bert-service',
                       models=[model],
                       inference_config=inference_config,
                       deployment_config=aci_config)
service.wait_for_deployment(show_output=True)

# Endpoint that accepts scoring requests
print(service.scoring_uri)

#### Step 9: Create the Scoring Script

Create the scoring script (`score.py`) referenced by the inference configuration in Step 8. The file must exist before you run the deployment cell in Step 8.

In [None]:
# score.py
import json
import torch
from transformers import BertForSequenceClassification, BertTokenizer

def init():
    """Load the registered model and the tokenizer into module globals.

    Sets the globals ``model`` and ``tokenizer`` that ``run`` uses.
    """
    global model
    global tokenizer
    # score.py has no module-level import of Model, so bring it into scope here;
    # without it the lookup below raises NameError.
    from azureml.core.model import Model

    model_path = Model.get_model_path('bert-model')
    model = BertForSequenceClassification.from_pretrained(model_path)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def run(data):
    """Classify the texts in a JSON request and return the predictions as JSON.

    Args:
        data: JSON string of the form ``{"texts": [...]}``.

    Returns:
        A JSON string: a list with one predicted class index per text, or
        ``{"error": "<message>"}`` if anything fails while handling the request.
    """
    try:
        inputs = json.loads(data)
        texts = inputs['texts']
        encodings = tokenizer(texts, truncation=True, padding=True, return_tensors='pt')
        # Inference only: skip autograd bookkeeping to save memory and time
        with torch.no_grad():
            outputs = model(**encodings)
        predictions = torch.argmax(outputs.logits, dim=1).tolist()
        return json.dumps(predictions)
    except Exception as e:
        # Report the failure to the caller as a JSON payload instead of crashing the service
        error = str(e)
        return json.dumps({"error": error})

## Summary

This step-by-step example demonstrates how to fine-tune a BERT model for text classification using Azure Machine Learning. It covers setting up the environment, preparing the data, training the model, and deploying the model as a web service.