In [None]:
# Install the pinned Hugging Face `datasets` release.
%pip install -U datasets==2.17.0

# Upgrade pip itself, then install the pinned PyTorch packages with reduced output.
%pip install --upgrade pip
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet

# Pinned libraries imported by the scripts written later in this notebook
# (transformers, evaluate, peft) plus rouge_score and loralib.
%pip install \
    transformers==4.27.2 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet

In [None]:
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml import MLClient

try:
    credential = DefaultAzureCredential()
    # Request a token right away so an unusable credential is detected here
    # rather than on the first workspace call.
    credential.get_token("https://management.azure.com/.default")
except Exception:
    # Fall back to InteractiveBrowserCredential if DefaultAzureCredential does not work.
    credential = InteractiveBrowserCredential()

# Get a handle to the workspace.
ml_client = MLClient.from_config(credential=credential)

In [None]:
import os

# Make sure the directory intended for the script files exists.
script_folder = "src"
os.makedirs(script_folder, exist_ok=True)
print(f"{script_folder} folder created")

In [None]:
%%writefile tokenization.py
import argparse
from transformers import AutoTokenizer

def tokenize_dialogue(dialogue, model_name='google/flan-t5-base', output_path="tokenized_text.pt"):
    """Tokenize a dialogue and serialize the resulting encoding to disk.

    Args:
        dialogue: Text to tokenize.
        model_name: Name or path handed to ``AutoTokenizer.from_pretrained``.
        output_path: File the encoding is written to with ``torch.save``.
            Defaults to ``tokenized_text.pt`` in the current working directory.

    Returns:
        The path the encoding was written to.
    """
    # The script's top-level imports do not include torch, so it is imported
    # here; without it the torch.save call below raises NameError.
    import torch

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Tokenize input; return_tensors='pt' asks for PyTorch tensors.
    inputs = tokenizer(dialogue, return_tensors='pt')

    # Persist the encoding so a later step can read it back with torch.load.
    with open(output_path, "wb") as f:
        torch.save(inputs, f)

    return output_path

def main():
    """Read ``--dialogue`` from the command line and tokenize it."""
    cli = argparse.ArgumentParser(description="Tokenize input dialogue")
    cli.add_argument(
        "--dialogue",
        type=str,
        required=True,
        help="Input dialogue for tokenization",
    )
    options = cli.parse_args()

    # Hand the dialogue text to the tokenization step.
    tokenize_dialogue(options.dialogue)


if __name__ == "__main__":
    main()


In [None]:
%%writefile sentiment_prediction.py
import argparse
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

def predict_sentiment(tokenized_input_path, model_name='google/flan-t5-base'):
    """Generate text from a saved tokenized input and print the decoded result.

    Args:
        tokenized_input_path: Path to a file readable by ``torch.load`` whose
            content exposes an ``"input_ids"`` entry.
        model_name: Name or path used for both the seq2seq model and the tokenizer.
    """
    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    # Read back the encoding produced by the tokenization step.
    with open(tokenized_input_path, "rb") as encoded_file:
        encoded = torch.load(encoded_file)

    generated = seq2seq_model.generate(encoded["input_ids"], max_new_tokens=50)

    # The tokenizer is only needed here to turn generated ids back into text.
    decoder = AutoTokenizer.from_pretrained(model_name)
    text = decoder.decode(generated[0], skip_special_tokens=True)

    print(text)

def main():
    """Read ``--tokenized_input_path`` from the command line and run the prediction."""
    cli = argparse.ArgumentParser(description="Predict sentiment from tokenized input")
    cli.add_argument(
        "--tokenized_input_path",
        type=str,
        required=True,
        help="Path to tokenized input file",
    )
    options = cli.parse_args()

    # Run generation on the saved encoding.
    predict_sentiment(options.tokenized_input_path)


if __name__ == "__main__":
    main()


In [None]:
%%writefile dialogue_summarization_peft.py
import argparse
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer
import torch
import evaluate
from peft import LoraConfig, get_peft_model, TaskType

def tokenize_and_prepare_data(dataset_name, model_name):
    """
    Load a dataset, tokenize prompts and summaries, and keep every 100th example.

    Each dialogue is wrapped in a summarization prompt and encoded into
    ``input_ids``; each summary is encoded into ``labels``. The raw text columns
    are dropped afterwards.
    """
    raw_dataset = load_dataset(dataset_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    prompt_prefix = 'Summarize the following conversation.\n\n'
    prompt_suffix = '\n\nSummary: '

    def encode_batch(example):
        # `example` is a batch here because map() is called with batched=True.
        prompts = [prompt_prefix + dialogue + prompt_suffix for dialogue in example["dialogue"]]
        encoded_prompts = tokenizer(prompts, padding="max_length", truncation=True, return_tensors="pt")
        encoded_summaries = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt")
        example['input_ids'] = encoded_prompts.input_ids
        example['labels'] = encoded_summaries.input_ids
        return example

    def is_hundredth(example, index):
        # Subsample: keep only rows whose index is a multiple of 100.
        return index % 100 == 0

    return (
        raw_dataset.map(encode_batch, batched=True)
        .remove_columns(['id', 'topic', 'dialogue', 'summary'])
        .filter(is_hundredth, with_indices=True)
    )

def fine_tune_with_peft(tokenized_datasets, model_name, output_dir, num_train_epochs, train_batch_size, eval_batch_size):
    """
    Fine-tunes the model using PEFT (LoRA) on the tokenized dataset.

    Args:
        tokenized_datasets: Mapping that provides "train" and "validation" splits.
        model_name: Name or path of the base seq2seq model.
        output_dir: Directory handed to ``TrainingArguments`` as ``output_dir``.
        num_train_epochs: Number of training epochs.
        train_batch_size: Per-device training batch size.
        eval_batch_size: Per-device evaluation batch size.

    Returns:
        The LoRA-wrapped model after ``trainer.train()`` has finished, so that
        callers can evaluate it.
    """
    original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
    lora_config = LoraConfig(
        r=32,
        lora_alpha=32,
        target_modules=["q", "v"],
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.SEQ_2_SEQ_LM
    )
    peft_model = get_peft_model(original_model, lora_config)

    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=1e-3,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=train_batch_size,
        per_device_eval_batch_size=eval_batch_size,
        # Evaluation and save strategies are kept identical because
        # load_best_model_at_end is enabled.
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=peft_model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"]
    )

    trainer.train()

    # Hand the trained model back; it is otherwise unreachable from the caller.
    return peft_model

def evaluate_model(dataset_name, model_name, tokenizer, num_samples=10, peft_model=None):
    """
    Evaluates a model on a subset of the test split and prints ROUGE scores.

    Args:
        dataset_name: Dataset to load; its "test" split must provide
            "dialogue" and "summary" columns.
        model_name: Name or path of the base model. Only used to load a model
            when ``peft_model`` is not supplied.
        tokenizer: Tokenizer used to encode prompts and decode generations.
        num_samples: Number of leading test examples to evaluate.
        peft_model: Model to evaluate (typically the value returned by
            ``fine_tune_with_peft``). When omitted, the base model named by
            ``model_name`` is loaded and evaluated instead.

    Returns:
        The ROUGE results produced by ``rouge.compute``.
    """
    if peft_model is None:
        peft_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    dataset = load_dataset(dataset_name)
    test_samples = dataset['test'][:num_samples]
    dialogues = test_samples['dialogue']
    human_baseline_summaries = test_samples['summary']
    model_summaries = []

    # Generation should run in inference mode (the LoRA config uses dropout),
    # and the inputs must live on the same device as the model weights.
    peft_model.eval()
    device = next(peft_model.parameters()).device

    for dialogue in dialogues:
        prompt = f"Summarize the following conversation.\n\n{dialogue}\n\nSummary: "
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
        model_output = peft_model.generate(input_ids=input_ids, max_new_tokens=200)
        model_summary = tokenizer.decode(model_output[0], skip_special_tokens=True)
        model_summaries.append(model_summary)

    rouge = evaluate.load('rouge')
    results = rouge.compute(predictions=model_summaries, references=human_baseline_summaries, use_stemmer=True)

    print('MODEL ROUGE SCORES:')
    print(results)

    return results

def main():
    """Parse command-line options, fine-tune the model, then evaluate the result."""
    parser = argparse.ArgumentParser(description="Fine-tune and evaluate a dialogue summarization model with PEFT")
    parser.add_argument("--dataset_name", type=str, required=True, help="Dataset name to use for training and evaluation")
    parser.add_argument("--model_name", type=str, default='google/flan-t5-base', help="Model name or path")
    parser.add_argument("--output_dir", type=str, default='./peft_model', help="Output directory for saving the model")
    parser.add_argument("--num_train_epochs", type=int, default=1, help="Number of training epochs")
    parser.add_argument("--train_batch_size", type=int, default=4, help="Training batch size")
    parser.add_argument("--eval_batch_size", type=int, default=4, help="Evaluation batch size")
    parser.add_argument("--num_samples", type=int, default=10, help="Number of samples to use for evaluation")

    args = parser.parse_args()

    tokenized_datasets = tokenize_and_prepare_data(args.dataset_name, args.model_name)
    peft_model = fine_tune_with_peft(tokenized_datasets, args.model_name, args.output_dir, args.num_train_epochs, args.train_batch_size, args.eval_batch_size)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    # Evaluate the model that was just trained rather than an undefined global.
    evaluate_model(args.dataset_name, args.model_name, tokenizer, args.num_samples, peft_model=peft_model)

if __name__ == "__main__":
    main()


In [None]:
%%writefile tokenization.yml
# Azure ML command component that runs tokenization.py on a dialogue string.
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
name: tokenization
display_name: Tokenization of Dialogue
version: 1
type: command
inputs:
  dialogue:
    type: string
outputs:
  # NOTE(review): this output is declared but not referenced by the command
  # below; tokenization.py writes to a fixed file name by default. Confirm how
  # the file is meant to reach the next component.
  tokenized_text_path:
    type: uri_file
code: ./
environment: azureml:AzureML-pytorch-1.13-ubuntu20.04-py38-cpu@latest
# The dialogue is quoted so that text containing spaces reaches argparse as a
# single --dialogue value instead of being split into several arguments.
command: >-
  python tokenization.py
  --dialogue "${{inputs.dialogue}}"


In [None]:
%%writefile sentiment_prediction.yml
# Azure ML command component that runs sentiment_prediction.py on a tokenized input file.
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
name: sentiment_prediction
display_name: Sentiment Prediction
version: 1
type: command
inputs:
  # File passed to the script's --tokenized_input_path argument.
  tokenized_input_path: 
    type: uri_file
outputs:
  # NOTE(review): this output is not referenced by the command below, and the
  # script only prints its result. Confirm that `string` is an accepted output
  # type for command components and how the value is meant to be produced.
  sentiment_output:
    type: string
# The component uploads the current directory as its code snapshot.
code: ./
environment: azureml:AzureML-pytorch-1.13-ubuntu20.04-py38-cpu@latest
# Folded scalar (>-): the lines below are joined into a single command line.
command: >-
  python sentiment_prediction.py 
  --tokenized_input_path ${{inputs.tokenized_input_path}}


In [None]:
%%writefile dialogue_summarization_peft.yml
# Azure ML command component that runs dialogue_summarization_peft.py
# (fine-tuning followed by ROUGE evaluation).
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
name: dialogue_summarization_peft
display_name: Dialogue Summarization with PEFT
version: 1
type: command
# Each input maps one-to-one onto a command-line argument of the script; the
# defaults below mirror the script's own argparse defaults.
inputs:
  # Required: the script declares --dataset_name without a default.
  dataset_name: 
    type: string
  model_name:
    type: string
    default: google/flan-t5-base
  output_dir:
    type: string
    default: ./peft_model
  num_train_epochs:
    type: integer
    default: 1
  train_batch_size:
    type: integer
    default: 4
  eval_batch_size:
    type: integer
    default: 4
  num_samples:
    type: integer
    default: 10
code: ./
# Placeholder: replace with the name of an environment that provides the
# packages the script imports (datasets, transformers, torch, evaluate, peft).
environment: <your-custom-environment-name>
# Folded scalar (>-): the lines below are joined into a single command line.
command: >-
  python dialogue_summarization_peft.py 
  --dataset_name ${{inputs.dataset_name}}
  --model_name ${{inputs.model_name}}
  --output_dir ${{inputs.output_dir}}
  --num_train_epochs ${{inputs.num_train_epochs}}
  --train_batch_size ${{inputs.train_batch_size}}
  --eval_batch_size ${{inputs.eval_batch_size}}
  --num_samples ${{inputs.num_samples}}
