In [2]:
# Install/upgrade the SageMaker SDK and the Hugging Face `datasets` library.
# NOTE(review): versions are unpinned, so a re-run may pick up different releases — consider pinning.
%pip install --upgrade sagemaker datasets

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [15]:
from datasets import load_dataset

# Load the "zeroshot/twitter-financial-news-sentiment" dataset by name; the
# following cell reads its 'train' and 'validation' splits.
dataset = load_dataset("zeroshot/twitter-financial-news-sentiment")

In [16]:
# Training split, shuffled with a fixed seed so the ordering is repeatable
train_dataset = dataset["train"].shuffle(seed=42)

# Validation split, kept in its original order, serves as the evaluation set
eval_dataset = dataset["validation"]

# Report the size of each split
print('Number of training samples: ', len(train_dataset))
print('Number of evaluation samples: ', len(eval_dataset))

Number of training samples:  9543
Number of evaluation samples:  2388


In [18]:
# Class names indexed by the dataset's integer label (e.g. label 2 -> "Neutral").
LABEL = ["Bearish","Bullish","Neutral"]

In [19]:
# Instruction shared by every training record.
prompt_text = "Please categorize the following Twitter financial news into one of these three categories: Bearish, Bullish, or Neutral"


def json_dataset(dataset, labels=None):
    """Convert a labelled dataset into instruction-tuning records.

    Args:
        dataset: Iterable of rows, each a mapping with a "text" entry and an
            integer-like "label" entry.
        labels: Optional sequence mapping a label index to its class name.
            Defaults to the notebook-level ``LABEL`` list.

    Returns:
        A list of dicts with "instruction", "context" and "response" keys,
        one per row and in the same order as ``dataset``.
    """
    if labels is None:
        labels = LABEL
    # Take the text from the dataset that was passed in. Reading it from the
    # global ``train_dataset`` would pair the wrong text with each label for
    # any other dataset (e.g. the evaluation split).
    return [
        {
            "instruction": prompt_text,
            "context": row["text"],
            "response": labels[int(row["label"])],
        }
        for row in dataset
    ]

In [22]:
# Inspect the first example of the shuffled training split.
train_dataset[0]

{'text': 'Stocks - Tiffany, Disney, Tesla Rise Premarket; Uber Falls',
 'label': 2}

In [23]:
# Build the instruction-tuning records for the training split and show the first one.
training_record = json_dataset(train_dataset)
training_record[0]

{'instruction': 'Please categorize the following Twitter financial news into one of these three categories: Bearish, Bullish, or Neutral',
 'context': 'Stocks - Tiffany, Disney, Tesla Rise Premarket; Uber Falls',
 'response': 'Neutral'}

In [26]:
import json
import os

# Local JSON Lines file that will hold the training records
output_file_name = './data/jumpstart-training.jsonl'
# Create the target directory if it is missing so the cell also works on a fresh checkout
os.makedirs(os.path.dirname(output_file_name), exist_ok=True)
# Use 'with' to ensure the file gets closed after writing
with open(output_file_name, 'w', encoding='utf-8') as outfile:
    # Write one JSON object per line (JSON Lines format)
    for entry in training_record:
        json.dump(entry, outfile)
        outfile.write('\n')

### Upload the dataset to S3


In [27]:
import boto3
import sagemaker

# Local file to upload, and the S3 key prefix it is stored under
local_path = './data/jumpstart-training.jsonl'
file_name = 'bedrock/jumpstart/fine-tuning'

# The SageMaker session supplies the default bucket used as the destination
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

# Push the local file to the bucket; file_name is passed as the key prefix
sagemaker_session.upload_data(
    path=local_path,
    bucket=bucket,
    key_prefix=file_name,
)

print('Uploaded data to s3://{}/{}'.format(bucket, file_name))

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
Uploaded data to s3://sagemaker-us-east-1-707684582322/bedrock/jumpstart/fine-tuning


In [28]:
import json
import os

# Prompt/completion template; the {instruction}, {context} and {response}
# placeholders match the keys of the training records built earlier.
template = {
    "prompt": "Below is an instruction that describes a task, paired with an input that provides further context. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n### Input:\n{context}\n\n",
    "completion": " {response}",
}
template_path = "./data/template.json"
# Create the target directory if it is missing so the cell also works on a fresh checkout
os.makedirs(os.path.dirname(template_path), exist_ok=True)
with open(template_path, "w", encoding="utf-8") as f:
    json.dump(template, f)
# NOTE(review): no visible cell uploads this file to S3 — confirm whether it
# should be placed next to the training data.

In [29]:
# JumpStart model to fine-tune.
model_id = "meta-textgeneration-llama-2-7b"
# Wildcard version spec — presumably resolves to the latest 3.x release; confirm against the SDK docs.
model_version = "3.*"


In [30]:
# Derive the S3 URI from the bucket and key prefix used for the upload rather
# than hard-coding an account-specific bucket name.
train_data_location = "s3://{}/{}/jumpstart-training.jsonl".format(bucket, file_name)

In [31]:
from sagemaker.jumpstart.estimator import JumpStartEstimator

# Estimator for the selected JumpStart model. The EULA is accepted through the
# container environment. For Llama-2-70b, add instance_type = "ml.g5.48xlarge"
estimator = JumpStartEstimator(
    environment={"accept_eula": "true"},
    disable_output_compression=True,
    model_id=model_id,
    model_version=model_version,
)

# Instruction tuning is off by default, so switch it on explicitly for the
# instruction-formatted dataset, and set the epoch count and maximum input length.
estimator.set_hyperparameters(
    **{"instruction_tuned": "True", "epoch": "2", "max_input_length": "1024"}
)

INFO:sagemaker.jumpstart:Model 'meta-textgeneration-llama-2-7b' requires accepting end-user license agreement (EULA). See https://jumpstart-cache-prod-us-east-1.s3.us-east-1.amazonaws.com/fmhMetadata/eula/llamaEula.txt for terms of use.
INFO:sagemaker.jumpstart:No instance type selected for training job. Defaulting to ml.g5.12xlarge.


In [32]:
# Start the fine-tuning job, feeding the uploaded data through the "training" channel.
# wait=True blocks the cell until the training job finishes.
estimator.fit({"training": train_data_location},wait=True)

INFO:sagemaker:Creating training-job with name: meta-textgeneration-llama-2-7b-2023-12-05-06-36-22-790


2023-12-05 06:36:22 Starting - Starting the training job...
2023-12-05 06:36:51 Starting - Preparing the instances for training..........................................
2023-12-05 06:43:35 Downloading - Downloading input data..................
2023-12-05 06:46:56 Training - Downloading the training image..................
2023-12-05 06:49:32 Training - Training image download completed. Training in progress.....[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-12-05 06:50:28,199 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-12-05 06:50:28,273 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2023-12-05 06:50:28,282 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-12-05 06:50:28,284 sagemaker_pytorch_container.training INFO     Invoking us

In [34]:
# Deploy the fine-tuned model to an endpoint and keep the returned predictor handle.
# NOTE(review): no endpoint cleanup is visible in this section — confirm it is deleted later to avoid ongoing charges.
finetuned_predictor = estimator.deploy()

INFO:sagemaker.jumpstart:No instance type selected for inference hosting endpoint. Defaulting to ml.g5.2xlarge.
INFO:sagemaker.jumpstart:No instance type selected for inference hosting endpoint. Defaulting to ml.g5.2xlarge.
INFO:sagemaker:Creating model with name: meta-textgeneration-llama-2-7b-2023-12-05-07-40-44-908
INFO:sagemaker:Creating endpoint-config with name meta-textgeneration-llama-2-7b-2023-12-05-07-40-44-906
INFO:sagemaker:Creating endpoint with name meta-textgeneration-llama-2-7b-2023-12-05-07-40-44-906


-------!