In [2]:
# Install the pinned `datasets` release; -U upgrades an already-installed copy.
%pip install -U datasets==2.17.0

# Upgrade pip itself before the remaining installs.
%pip install --upgrade pip
# Pinned PyTorch packages (quiet install, pip's own version check disabled).
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet

# Pinned Hugging Face, ROUGE evaluation and LoRA/PEFT packages (quiet install).
%pip install \
    transformers==4.27.2 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet

Note: you may need to restart the kernel to use updated packages.
Collecting pip
  Downloading pip-24.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m45.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-24.0
Note: you may need to restart the kernel to use updated packages.


In [12]:
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml import MLClient

try:
    credential = DefaultAzureCredential()
    # Request a management-plane token right away so an unusable credential
    # is detected here instead of on the first workspace call.
    credential.get_token("https://management.azure.com/.default")
except Exception as ex:
    # Deliberate best-effort fallback: say why the default credential failed
    # (previously swallowed silently), then switch to interactive browser login.
    print(f"DefaultAzureCredential failed ({type(ex).__name__}: {ex}); "
          "falling back to InteractiveBrowserCredential.")
    credential = InteractiveBrowserCredential()

# Get a handle to the workspace described by the discovered config file
ml_client = MLClient.from_config(credential=credential)

Found the config file in: /config.json


In [13]:
import os

# Locations (relative to the notebook) for the generated scripts and run outputs.
script_folder, output_folder = '../src', '../output'

# Create both folders; exist_ok=True keeps the cell safe to re-run.
for folder in (script_folder, output_folder):
    os.makedirs(folder, exist_ok=True)

print(f"{script_folder} folder created")

../src folder created


In [16]:
%%writefile $script_folder/sentiment_prediction.py
import mlflow
import argparse
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

def predict_sentiment(dialogue, output_path, model_name='google/flan-t5-base'):
    """
    Classify the sentiment of `dialogue` with a few-shot prompt and write the result to a file.

    Args:
        dialogue (str): Comment or conversation text to classify.
        output_path (str): File that receives the input text and the predicted
            sentiment. It is overwritten if it exists; a missing parent
            directory is created.
        model_name (str): Hugging Face model id or local path of a seq2seq model.

    Returns:
        None. The prediction is only written to `output_path`.
    """
    import os  # function-scope import: the script's module-level imports do not include os

    # Initialize tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    # Few-shot prompt: five labelled examples followed by an open slot for the
    # user's text. The indentation of the continuation lines is inside the
    # string literal, i.e. part of the prompt; it is kept unchanged so the
    # model input stays exactly as before.
    start_prompt = '''Provide Sentiment for the following comment/conversation (possible sentiments: Positive, Negative, Neutral):

    Comment: "I love sunny days, they make me feel so happy!"
    Sentiment: Positive

    Comment: "This is the worst experience of my life, I'm so disappointed."
    Sentiment: Negative

    Comment: "I'm not sure how I feel about this new policy. It might be good or bad."
    Sentiment: Neutral

    Comment: "The service at this restaurant was fantastic, best dinner ever!"
    Sentiment: Positive

    Comment: "I waited for an hour and my order was still wrong."
    Sentiment: Negative

    Comment: '''

    end_prompt = '\nSentiment: '

    # Construct the full prompt with the user-provided dialogue
    prompt = start_prompt + '"' + dialogue + '"' + end_prompt

    # Tokenize the full prompt
    inputs = tokenizer(prompt, return_tensors='pt')

    # Generate the prediction; passing the whole encoding forwards the
    # attention mask together with the input ids.
    output = model.generate(**inputs, max_new_tokens=50)

    # Decode the prediction
    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

    # Make sure the destination directory exists before writing.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Write the predicted sentiment to the specified output file. UTF-8 is set
    # explicitly so non-ASCII text does not depend on the platform default encoding.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write('Text: ' + dialogue + '\nPredicted Sentiment: ' + decoded_output + '\n')

def main():
    """Read --dialogue and --output from the command line and run one prediction."""
    # enable autologging
    mlflow.autolog()

    cli = argparse.ArgumentParser(description="Predict sentiment from input dialogue")
    # Both options are mandatory strings; only the flag and help text differ.
    for flag, help_text in (
        ("--dialogue", "Input dialogue for sentiment prediction"),
        ("--output", "Output file path for sentiment prediction"),
    ):
        cli.add_argument(flag, type=str, required=True, help=help_text)

    options = cli.parse_args()

    # Predict sentiment and write to output
    predict_sentiment(options.dialogue, options.output)


if __name__ == "__main__":
    main()


Overwriting ../src/sentiment_prediction.py


In [4]:
# Run the generated script locally; {name} interpolates notebook variables into the shell command.
!python {script_folder}/sentiment_prediction.py --dialogue "I love this book!" --output {output_folder}/output.txt


2024-02-24 06:28:48.480926: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-24 06:28:52.141555: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-24 06:28:53.196121: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-02-24 06:28:53.196169: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore 

In [14]:
%%writefile $script_folder/dialogue_summarization_peft.py
import mlflow
import argparse
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer
import torch
import evaluate
from peft import LoraConfig, get_peft_model, TaskType

def tokenize_and_prepare_data(dataset_name, model_name):
    """
    Loads the dataset, subsamples it, and tokenizes it for training and evaluation.

    Args:
        dataset_name (str): Name passed to `load_dataset`. The dataset must
            provide the columns 'id', 'topic', 'dialogue' and 'summary'.
        model_name (str): Model id or path whose tokenizer is used.

    Returns:
        The dataset splits reduced to every 100th row, with the raw text
        columns removed and 'input_ids' / 'labels' added. Padding positions in
        'labels' are set to -100.
    """
    dataset = load_dataset(dataset_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Subsample BEFORE tokenizing: keeping rows whose index is a multiple of
    # 100 selects the same rows as filtering afterwards (map does not add,
    # drop or reorder rows) but avoids tokenizing the 99% that is discarded.
    dataset = dataset.filter(lambda example, index: index % 100 == 0, with_indices=True)

    def tokenize_function(example):
        # Batched call: example["dialogue"] / example["summary"] are lists.
        start_prompt = 'Summarize the following conversation.\n\n'
        end_prompt = '\n\nSummary: '
        prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]
        example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
        labels = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids
        # Labels equal to -100 are ignored by the loss; without this the model
        # is also trained to predict the padding that fills each target.
        labels[labels == tokenizer.pad_token_id] = -100
        example['labels'] = labels
        return example

    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    tokenized_datasets = tokenized_datasets.remove_columns(['id', 'topic', 'dialogue', 'summary'])

    return tokenized_datasets

def fine_tune_with_peft(tokenized_datasets, model_name, output_dir, num_train_epochs, train_batch_size, eval_batch_size, learning_rate, lora_r, lora_alpha, lora_dropout):
    """
    Fine-tunes the model using PEFT (LoRA) on the tokenized dataset.

    Args:
        tokenized_datasets: Mapping with "train" and "validation" splits.
        model_name (str): Base seq2seq model id or path.
        output_dir (str): Directory handed to the Trainer for its outputs.
        num_train_epochs (int): Number of training epochs.
        train_batch_size (int): Per-device training batch size.
        eval_batch_size (int): Per-device evaluation batch size.
        learning_rate (float): Learning rate for training.
        lora_r (int): LoRA rank.
        lora_alpha (int): LoRA scaling parameter.
        lora_dropout (float): Dropout applied in the LoRA layers.

    Returns:
        The fine-tuned PEFT model held by the Trainer, so that callers can
        evaluate the trained weights rather than the base checkpoint.
    """
    original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
    lora_config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        target_modules=["q", "v"],
        lora_dropout=lora_dropout,
        bias="none",
        task_type=TaskType.SEQ_2_SEQ_LM
    )
    peft_model = get_peft_model(original_model, lora_config)

    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=learning_rate,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=train_batch_size,
        per_device_eval_batch_size=eval_batch_size,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=peft_model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"]
    )

    trainer.train()

    # Log training parameters
    training_params = {
        "learning_rate": learning_rate,
        "lora_r": lora_r,
        "lora_alpha": lora_alpha,
        "lora_dropout": lora_dropout,
        "num_train_epochs": num_train_epochs,
        "train_batch_size": train_batch_size,
        "eval_batch_size": eval_batch_size
    }
    mlflow.log_params(training_params)

    return trainer.model

def evaluate_model(dataset_name, model_name, num_samples=10, model=None):
    """
    Evaluates a model on the first `num_samples` test examples and prints ROUGE scores.

    Args:
        dataset_name (str): Name passed to `load_dataset`; its 'test' split
            must provide 'dialogue' and 'summary' columns.
        model_name (str): Model id or path. Always used for the tokenizer, and
            for the model as well when `model` is None.
        num_samples (int): Number of leading test examples to summarize.
        model: Optional already-loaded model (for example the fine-tuned model
            returned by `fine_tune_with_peft`). When None, the checkpoint named
            by `model_name` is loaded, which is the previous behavior.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if model is None:
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    # Inference mode (disables dropout), and remember where the weights live so
    # the inputs can be placed on the same device.
    model.eval()
    device = next(model.parameters()).device

    dataset = load_dataset(dataset_name)
    # Slice the test split once and read both columns from the same slice.
    test_samples = dataset['test'][:num_samples]
    dialogues = test_samples['dialogue']
    human_baseline_summaries = test_samples['summary']
    model_summaries = []

    for dialogue in dialogues:
        prompt = f"Summarize the following conversation.\n\n{dialogue}\n\nSummary: "
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
        model_output = model.generate(input_ids=input_ids, max_new_tokens=200)
        model_summary = tokenizer.decode(model_output[0], skip_special_tokens=True)
        model_summaries.append(model_summary)

    rouge = evaluate.load('rouge')
    results = rouge.compute(predictions=model_summaries, references=human_baseline_summaries, use_stemmer=True)

    print('MODEL ROUGE SCORES:')
    print(results)

    # Log evaluation parameters
    evaluation_params = {
        "evaluation_num_samples": num_samples
    }
    mlflow.log_params(evaluation_params)

def main():
    """Parse the command-line options, fine-tune with PEFT, then evaluate the fine-tuned model."""
    # enable autologging
    mlflow.autolog()

    parser = argparse.ArgumentParser(description="Fine-tune and evaluate a dialogue summarization model with PEFT")
    parser.add_argument("--dataset_name", type=str, default='knkarthick/dialogsum', help="Dataset name to use for training and evaluation")
    parser.add_argument("--model_name", type=str, default='google/flan-t5-base', help="Model name or path")
    parser.add_argument("--output_dir", type=str, default='./peft_model', help="Output directory for saving the model")
    parser.add_argument("--learning_rate", type=float, default=1e-3, help="Learning rate for training")
    parser.add_argument("--lora_r", type=int, default=32, help="Rank of LoRA")
    parser.add_argument("--lora_alpha", type=int, default=32, help="Scale parameter for LoRA")
    parser.add_argument("--lora_dropout", type=float, default=0.05, help="Dropout rate for LoRA layers")
    parser.add_argument("--num_train_epochs", type=int, default=1, help="Number of training epochs")
    parser.add_argument("--train_batch_size", type=int, default=4, help="Training batch size")
    parser.add_argument("--eval_batch_size", type=int, default=4, help="Evaluation batch size")
    parser.add_argument("--num_samples", type=int, default=10, help="Number of samples to use for evaluation")

    args = parser.parse_args()

    tokenized_datasets = tokenize_and_prepare_data(args.dataset_name, args.model_name)
    fine_tuned_model = fine_tune_with_peft(tokenized_datasets, args.model_name, args.output_dir, args.num_train_epochs, args.train_batch_size, args.eval_batch_size, args.learning_rate, args.lora_r, args.lora_alpha, args.lora_dropout)
    # Evaluate the model that was just trained; reloading `model_name` here
    # would score the untouched base checkpoint instead.
    evaluate_model(args.dataset_name, args.model_name, args.num_samples, model=fine_tuned_model)

if __name__ == "__main__":
    main()


Overwriting ../src/dialogue_summarization_peft.py


In [None]:
from azure.ai.ml.entities import ComputeInstance


# Describe the compute instance to provision: its name and VM size.
ci = ComputeInstance(name="compute-instance", size="Standard_E4ds_v4")

# Start the create/update operation, then wait for it to finish; the bare
# last expression shows the returned result as the cell output.
provisioning = ml_client.begin_create_or_update(ci)
provisioning.result()

In [9]:
%%writefile ../sentiment_prediction.yml
# Command component that runs sentiment_prediction.py on one piece of text.
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
name: sentiment_prediction_merged
display_name: Sentiment Prediction with Integrated Tokenization
version: 3
type: command
inputs:
  # Free-form text whose sentiment is predicted.
  dialogue:
    type: string
outputs:
  # File the script writes the text and its predicted sentiment to.
  sentiment_output:
    type: uri_file
# Folder holding sentiment_prediction.py.
code: ./src
environment: azureml:AzureML-ACPT-pytorch-1.13-py38-cuda11.7-gpu@latest
# NOTE(review): `compute` is normally a job-level setting - confirm the component schema accepts it here.
compute: azureml:cpu-cluster
# The dialogue is quoted so that text containing spaces reaches the script as
# ONE --dialogue argument instead of being split into several argv tokens.
command: >-
  python sentiment_prediction.py
  --dialogue "${{inputs.dialogue}}"
  --output ${{outputs.sentiment_output}}


Overwriting ../sentiment_prediction.yml


In [15]:
%%writefile ../dialogue_summarization_peft.yml
# Command component that fine-tunes a summarization model with PEFT/LoRA and
# evaluates it by running dialogue_summarization_peft.py.
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
name: dialogue_summarization_peft
display_name: Dialogue Summarization with PEFT
version: 1
type: command
inputs:
  # Dataset name handed to the script (required: no default).
  dataset_name:
    type: string
  model_name:
    type: string
    default: google/flan-t5-base
  output_dir:
    type: string
    default: ./peft_model
  learning_rate:
    type: number
    default: 0.001
  # LoRA hyper-parameters.
  lora_r:
    type: integer
    default: 32
  lora_alpha:
    type: integer
    default: 32
  lora_dropout:
    type: number
    default: 0.05
  num_train_epochs:
    type: integer
    default: 1
  train_batch_size:
    type: integer
    default: 4
  eval_batch_size:
    type: integer
    default: 4
  # Number of test examples used for the ROUGE evaluation.
  num_samples:
    type: integer
    default: 10
# The script is written to the src folder, so the code root must be ./src
# (as in the sentiment component) for `python dialogue_summarization_peft.py`
# to find it.
code: ./src
# `azureml:` prefix added to match the reference format of the sentiment component.
environment: azureml:AzureML-ACPT-pytorch-1.13-py38-cuda11.7-gpu@latest
# String inputs are quoted so a value containing spaces stays a single argument.
command: >-
  python dialogue_summarization_peft.py
  --dataset_name "${{inputs.dataset_name}}"
  --model_name "${{inputs.model_name}}"
  --output_dir "${{inputs.output_dir}}"
  --learning_rate ${{inputs.learning_rate}}
  --lora_r ${{inputs.lora_r}}
  --lora_alpha ${{inputs.lora_alpha}}
  --lora_dropout ${{inputs.lora_dropout}}
  --num_train_epochs ${{inputs.num_train_epochs}}
  --train_batch_size ${{inputs.train_batch_size}}
  --eval_batch_size ${{inputs.eval_batch_size}}
  --num_samples ${{inputs.num_samples}}


Overwriting ../dialogue_summarization_peft.yml


In [6]:
from azure.ai.ml import load_component
# Optional prefix for locating the component spec; empty means the path below is used as written.
parent_dir = ""

# Load the component definition from its YAML spec.
predict_sentiment_segment = load_component(source=parent_dir + "../sentiment_prediction.yml")

# Register the component under the version declared in the YAML spec. The
# previous hard-coded version='2' override contradicted the spec (version: 3),
# so the registered version did not match the file it was built from.
prep = ml_client.components.create_or_update(predict_sentiment_segment)

In [11]:
from azure.ai.ml import Input
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.dsl import pipeline

# Pipeline with a single step: run the sentiment component on the text passed
# as `pipeline_job_input` and expose the component's output file as the
# pipeline output "pipeline_job_predict_sentiment_data".
# NOTE(review): documented with comments rather than a docstring because the
# DSL may use the docstring as the pipeline description - confirm before adding one.
@pipeline()
def sentiment_prediction(pipeline_job_input):
    # NOTE(review): the DSL may derive the step's node name from this variable
    # name - confirm before renaming it.
    sentiment = predict_sentiment_segment(dialogue=pipeline_job_input)

    return {
        "pipeline_job_predict_sentiment_data": sentiment.outputs.sentiment_output,
        
    }

# Example usage of the pipeline with a direct string input for the dialogue
pipeline_job = sentiment_prediction(pipeline_job_input="This is an example dialogue text.")
# Run on "compute-instance", the name given to the ComputeInstance created earlier in this notebook.
pipeline_job.settings.default_compute = "compute-instance"

In [12]:
# Submit the pipeline job to the workspace under the "sentiment_prediction"
# experiment; the name is rebound to the job object returned by the service.
pipeline_job = ml_client.jobs.create_or_update(pipeline_job, experiment_name="sentiment_prediction")

# Bare last expression: displays the returned job as the cell output.
pipeline_job

Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseIntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


Experiment,Name,Type,Status,Details Page
sentiment_prediction,sharp_fennel_82r1k1c1x6,pipeline,Preparing,Link to Azure Machine Learning studio


In [None]:
# #CLI2 version of creating component and pipeline
# !az extension add --name ml -y
# output = %sx az ml component list \
#         --resource-group "cloud-shell-storage-southeastasia" \
#         --workspace-name "oksana_ml"
# print(output)
# !az ml component create --file ../sentiment_prediction.yml
# !az ml job create --file ../pipeline_sentiment_prediction.yml