### Create prompt dataset

In [2]:
! pip install -U sagemaker

[notice] A new release of pip is available: 23.0.1 -> 23.1.2
[notice] To update, run: pip install --upgrade pip


In [3]:
! pip install transformers

[notice] A new release of pip is available: 23.0.1 -> 23.1.2
[notice] To update, run: pip install --upgrade pip


In [4]:
import pandas as pd
import json
  
# Load the raw Q&A dump (one JSON record per line). The context manager
# guarantees the handle is closed once the lines are in memory.
with open('data-distributed-qanda-alt/all.jsonl', encoding="utf-8") as f1:
    Lines1 = f1.readlines()

# The first 20,000 records become prompt/response pairs for training and
# validation. Every answer (human-written first, then ChatGPT-written) yields
# its own pair, so a question is repeated once per answer.
questions, answers = [], []
for line in Lines1[:20000]:
    row = json.loads(line)
    for answer_field in ("human_answers", "chatgpt_answers"):
        for answer in row[answer_field]:
            questions.append("Prompt: " + row["question"])
            answers.append("Response: " + answer)

# Everything after the first 20,000 records is written out unchanged as the
# held-out test set.
with open("data-distributed-qanda-alt/test.jsonl", "w", encoding="utf-8") as test_file:
    test_file.writelines(Lines1[20000:])

# Shuffle with a fixed seed so the train/validation split is identical on
# every run, then take the first 60,000 pairs for training and the remainder
# for validation.
# NOTE(review): if fewer than 60,000 pairs are produced, df_val will be empty.
df = pd.DataFrame({"question": questions, "answer": answers})
df = df.sample(frac=1, random_state=42)
df_train = df.iloc[:60000, :]
df_val = df.iloc[60000:, :]

df_train.to_csv("data-distributed-qanda-alt/train.csv", index=False)
df_val.to_csv("data-distributed-qanda-alt/val.csv", index=False)

### Upload data to S3

In [5]:
import sagemaker
from sagemaker.pytorch import PyTorch

In [6]:
# Bucket SageMaker uses for uploaded data, models, and logs. Leave it as None
# to fall back to the session's default bucket (SageMaker creates that bucket
# automatically if it does not exist yet).
sagemaker_session_bucket = None

sess = sagemaker.Session()
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# IAM role the training job will run under.
role = sagemaker.get_execution_role()

# Re-create the session so it is explicitly bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::079002598131:role/service-role/AmazonSageMaker-ExecutionRole-20220804T150518
sagemaker bucket: sagemaker-us-east-1-079002598131
sagemaker session region: us-east-1


In [7]:
# Upload both CSV splits under the same S3 key prefix and keep the returned
# S3 URIs for use as training-job input channels.
s3_key_prefix = "promptsds"
train_data_url = sess.upload_data(path="data-distributed-qanda-alt/train.csv", key_prefix=s3_key_prefix)
valid_data_url = sess.upload_data(path="data-distributed-qanda-alt/val.csv", key_prefix=s3_key_prefix)

In [8]:
# Show where each split landed in S3.
print(
    f"training file path {train_data_url}",
    f"validation file path {valid_data_url}",
    sep="\n",
)

training file path s3://sagemaker-us-east-1-079002598131/promptsds/train.csv
validation file path s3://sagemaker-us-east-1-079002598131/promptsds/val.csv


### Fine-tune FLAN-T5 XXL (11B) on Seq2Seq

In [9]:
# Prefix for the training job name; SageMaker appends a timestamp to it.
base_job_name = "sft-flan-t5-11b"

In [10]:
# Arguments handed to the training entry point. The train/validation paths
# point at the "train" and "valid" input channels inside the training container.
hyperparameters = {
    "model_name_or_path": "google/flan-t5-xxl",
    "train_file": "/opt/ml/input/data/train/train.csv",
    "validation_file": "/opt/ml/input/data/valid/val.csv",
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    "block_size": 512,
    "model_dir": "/opt/ml/model",
    "num_train_epochs": 1,
    "max_train_steps": 100,
}

In [11]:
# Settings for the SageMaker model-parallel library.
smp_parameters = {
    "pipeline_parallel_degree": 1,            # required
    "ddp": True,
    # Sharded data parallelism is activated by setting its degree.
    "sharded_data_parallel_degree": 16,
    "partitions": 1,
    "offload_activations": True,
    "delayed_parameter_initialization": True,
    # Optional sharded-data-parallel tuning knobs.
    "sdp_reduce_bucket_size": 500_000_000,
    "sdp_param_persistence_threshold": 1_000_000,
    "sdp_max_live_parameters": 1_000_000_000,
    "sdp_hierarchical_allgather": True,
    "sdp_gradient_clipping": 1.0,
    "bf16": True,
    "skip_tracing": True,
}
smp_options = {"enabled": True, "parameters": smp_parameters}  # "parameters" is required

# MPI launcher settings; both keys are required.
mpi_options = {"enabled": True, "processes_per_host": 8}

In [12]:
# Define the training job: a PyTorch estimator on two ml.p4d.24xlarge
# instances, launched through MPI with the SageMaker model-parallel library.
distribution_config = {
    "smdistributed": {"modelparallel": smp_options},
    "mpi": mpi_options,
}

estimator = PyTorch(
    # what to run
    entry_point="train.py",
    source_dir="src-distributed-qanda-alt/",
    hyperparameters=hyperparameters,
    # where and how to run it
    base_job_name=base_job_name,
    role=role,
    instance_type="ml.p4d.24xlarge",
    instance_count=2,
    framework_version="1.13.1",
    py_version="py39",
    distribution=distribution_config,
    # profiler and debugger hooks are switched off for this job
    disable_profiler=True,
    debugger_hook_config=False,
)

In [None]:
# Start the training job without blocking the notebook (wait=False). The
# channel names "train" and "valid" match the /opt/ml/input/data/<channel>
# paths referenced in the hyperparameters.
input_channels = {"train": train_data_url, "valid": valid_data_url}
estimator.fit(input_channels, wait=False)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.


Using provided s3_resource


INFO:sagemaker:Creating training-job with name: sft-flan-t5-11b-2023-04-29-05-59-22-119


2023-04-29 05:59:22 Starting - Starting the training job......
2023-04-29 06:00:11 Starting - Preparing the instances for training.........
2023-04-29 06:01:51 Downloading - Downloading input data...
2023-04-29 06:02:06 Training - Downloading the training image............
2023-04-29 06:04:02 Training - Training image download completed. Training in progress..