### Set up

#### 1. Set up accounts and role

In [1]:
import sagemaker
import boto3
from uuid import uuid4

# SageMaker SDK session created with default settings.
sagemaker_session = sagemaker.Session()

# Account id reported by STS for the current caller, and the region of the
# default boto3 session.
account_id = boto3.client('sts').get_caller_identity().get('Account')
region = boto3.session.Session().region_name

# The execution role is pinned to an explicit ARN in the caller's account
# instead of being resolved dynamically:
#role = sagemaker.get_execution_role()
role = f"arn:aws:iam::{account_id}:role/service-role/AmazonSageMaker-ExecutionRole-20190118T115449"
step_func_role = f"arn:aws:iam::{account_id}:role/AmazonSageMaker-StepFunctionsWorkflowExecutionRole"
max_runs = 1



In [2]:
# docker_repo = "{}.dkr.ecr.{}.amazonaws.com/{}".format(account_id, region, pytorch_custom_image_name)

In [3]:
# Training job names for the IMDB experiments; each name is later substituted
# into `output_prefix` and passed to `describe_training_job`.
# NOTE: despite the variable name, this list holds both the imdb5h and the
# imdb2k jobs.
# NOTE(review): names containing "-base-" are presumably the baseline runs
# without augmentation — confirm against the jobs' `do_aug` hyperparameter.
jobs_imdb5h = [
 "hiddencut-imdb5h-05-2023-09-10-05-26-10-644",
"hiddencut-imdb5h-04-2023-09-10-05-26-08-989",
"hiddencut-imdb5h-03-2023-09-10-05-26-07-329",
"hiddencut-imdb5h-02-2023-09-10-05-26-04-684",
"hiddencut-imdb5h-01-2023-09-10-05-26-01-739",
"hiddencut-imdb5h-base-05-2023-09-10-05-25-59-853",
"hiddencut-imdb5h-base-04-2023-09-10-05-25-58-199",
"hiddencut-imdb5h-base-03-2023-09-10-05-25-56-549",
"hiddencut-imdb5h-base-02-2023-09-10-05-25-53-654",
"hiddencut-imdb5h-base-01-2023-09-10-05-25-50-657",
"hiddencut-hiddencut-imdb2k-base-05-2023-09-11-04-41-43-209",
"hiddencut-hiddencut-imdb2k-base-04-2023-09-11-04-41-41-355",
"hiddencut-hiddencut-imdb2k-base-03-2023-09-11-04-41-39-394",
"hiddencut-hiddencut-imdb2k-base-02-2023-09-11-04-41-36-540",
"hiddencut-hiddencut-imdb2k-base-01-2023-09-11-04-41-34-574",
"hiddencut-hiddencut-imdb2k-05-2023-09-12-01-14-41-067",
"hiddencut-hiddencut-imdb2k-04-2023-09-12-01-14-39-180",
"hiddencut-hiddencut-imdb2k-03-2023-09-12-01-14-36-090",
"hiddencut-hiddencut-imdb2k-02-2023-09-12-01-14-32-661",
"hiddencut-hiddencut-imdb2k-01-2023-09-12-01-14-28-395"
]

# Training job names for the SST contrast-set experiments.
jobs_sst = ["hiddencut-sst-contrast-base-2023-09-10-18-46-14-367",
           "hiddencut-sst-contrast-2023-09-10-18-46-10-526"
           ]

# NOTE(review): `jobs` is not referenced by the cells visible in this notebook
# section — confirm whether it is still needed.
jobs = jobs_sst

# S3 URI template for a job's result archive; `{}` is filled with the job name.
output_prefix = "s3://aegovan-data/hiddencut_sagemakerresults/{}/output/output.tar.gz"

        
    

In [4]:
import io

def get_bucketname_key(uripath):
    """Split an ``s3://`` URI into ``(bucket_name, key)``.

    The bucket is everything between the scheme and the first ``/``; the key is
    whatever follows that ``/``. A URI without any ``/`` after the scheme is
    treated as bucket-only and gets the key ``"/"``.

    Raises:
        AssertionError: if ``uripath`` does not start with ``s3://``.
    """
    assert uripath.startswith("s3://")

    bucket_name, separator, remainder = uripath[len("s3://"):].partition("/")
    # An empty separator means no "/" was found, i.e. only a bucket was given.
    key = remainder if separator else "/"
    return bucket_name, key

def download_binary_object(s3path):
    """Download the S3 object at ``s3path`` and return it as an ``io.BytesIO``.

    ``s3path`` is an ``s3://bucket/key`` URI, split with ``get_bucketname_key``.
    """
    bucket_name, object_key = get_bucketname_key(s3path)
    response = boto3.client('s3').get_object(Bucket=bucket_name, Key=object_key)
    # The body is read in a single call, so the whole object is held in memory.
    return io.BytesIO(response['Body'].read())


In [5]:
import tarfile, json


def extract_tar_json(fileobj):
    """Yield the parsed JSON document of every data-bearing member of a tar archive.

    Args:
        fileobj: Binary file-like object containing the tar archive. It is
            opened with ``tarfile``'s default read mode.

    Yields:
        The value returned by ``json.loads`` for each member, in archive order.
        Members for which ``extractfile`` returns ``None`` (e.g. directory
        entries) are skipped.

    Raises:
        UnicodeDecodeError: if a member's content is not valid UTF-8.
        json.JSONDecodeError: if a member's content is not valid JSON.
    """
    with tarfile.open(fileobj=fileobj) as archive:
        for member in archive:
            # Pass the TarInfo itself rather than its name so the exact member
            # being iterated is read, without a second lookup by name.
            member_file = archive.extractfile(member)
            if member_file is None:
                # Directories and other members without data have nothing to parse.
                continue
            # Read and close the member before yielding so no handle stays open
            # while the consumer holds the generator suspended.
            with member_file:
                raw_bytes = member_file.read()
            yield json.loads(raw_bytes.decode("utf-8"))


In [6]:
import sagemaker

def get_training_job_details(job_name):
    """Describe a SageMaker training job and derive its data and model labels.

    Args:
        job_name: Name passed to ``describe_training_job`` as ``TrainingJobName``.

    Returns:
        ``(train_data, model_name)`` where ``train_data`` is the S3 URI of the
        ``all`` input channel with the ``s3://aegovan-data/glue_full_set/``
        prefix removed and then truncated to 14 characters, and ``model_name``
        is ``"roberta-base"``, with ``"hidden-cut"`` appended directly (no
        separator) when the ``do_aug`` hyperparameter is non-zero.

    Raises:
        IndexError: if the job has no input channel named ``all``
            (matched case-insensitively).
    """
    sm_client = boto3.client('sagemaker')
    job_description = sm_client.describe_training_job(TrainingJobName=job_name)

    channels = job_description["InputDataConfig"]
    # A missing "do_aug" hyperparameter is treated as "0", i.e. no augmentation.
    augmented = int(job_description["HyperParameters"].get("do_aug", "0"))
    model_name = "roberta-base" + ("hidden-cut" if augmented else "")

    all_channels = [channel for channel in channels if channel["ChannelName"].lower() == 'all']
    s3_uri = all_channels[0]["DataSource"]["S3DataSource"]["S3Uri"]
    train_data = s3_uri.replace("s3://aegovan-data/glue_full_set/", "")[:14]
    return train_data, model_name



In [7]:
import pandas as pd
# Let pandas display up to 100 rows before truncating a frame's output.
pd.set_option('display.max_rows', 100)

def download_results(job_name):
    """Load one job's result archive into a single annotated DataFrame.

    The archive at ``output_prefix.format(job_name)`` must contain exactly two
    JSON result files. Both are turned into DataFrames and outer-joined on
    ``step``; the frame is then tagged with ``output_uri``, ``model`` and
    ``train_data`` columns. The archive URI is printed as a side effect.

    Raises:
        AssertionError: if the archive does not yield exactly two results.
    """
    output_uri = output_prefix.format(job_name)
    print(output_uri)

    payloads = list(extract_tar_json(download_binary_object(output_uri)))
    assert len(payloads) == 2, "Expect only 2 result files"

    first_frame, second_frame = (pd.DataFrame(payload) for payload in payloads)
    # Outer join keeps steps that appear in only one of the two result files.
    merged = first_frame.merge(second_frame, how='outer', on="step")

    train_data, model = get_training_job_details(job_name)
    return merged.assign(output_uri=output_uri, model=model, train_data=train_data)
    
def download_all_results(jobs):
    """Download the results of every job in ``jobs`` and stack them into one DataFrame.

    Each per-job frame keeps its own index, so index values can repeat in the
    concatenated result.
    """
    per_job_frames = [download_results(job_name) for job_name in jobs]
    return pd.concat(per_job_frames)
        
# Fetch and combine the per-job results for each experiment family.
df_sst = download_all_results(jobs_sst)
df_imdb = download_all_results(jobs_imdb5h)

s3://aegovan-data/hiddencut_sagemakerresults/hiddencut-sst-contrast-base-2023-09-10-18-46-14-367/output/output.tar.gz
s3://aegovan-data/hiddencut_sagemakerresults/hiddencut-sst-contrast-2023-09-10-18-46-10-526/output/output.tar.gz
s3://aegovan-data/hiddencut_sagemakerresults/hiddencut-imdb5h-05-2023-09-10-05-26-10-644/output/output.tar.gz
s3://aegovan-data/hiddencut_sagemakerresults/hiddencut-imdb5h-04-2023-09-10-05-26-08-989/output/output.tar.gz
s3://aegovan-data/hiddencut_sagemakerresults/hiddencut-imdb5h-03-2023-09-10-05-26-07-329/output/output.tar.gz
s3://aegovan-data/hiddencut_sagemakerresults/hiddencut-imdb5h-02-2023-09-10-05-26-04-684/output/output.tar.gz
s3://aegovan-data/hiddencut_sagemakerresults/hiddencut-imdb5h-01-2023-09-10-05-26-01-739/output/output.tar.gz
s3://aegovan-data/hiddencut_sagemakerresults/hiddencut-imdb5h-base-05-2023-09-10-05-25-59-853/output/output.tar.gz
s3://aegovan-data/hiddencut_sagemakerresults/hiddencut-imdb5h-base-04-2023-09-10-05-25-58-199/output/out

In [8]:
# Preview the first rows of the combined SST results.
df_sst.head()

Unnamed: 0,step,constrastsetimdb_acc,constrastsetimdb_loss,constrastsetimdboriginal_acc,constrastsetimdboriginal_loss,eval_acc,eval_loss,output_uri,model,train_data
0,2650,0.907787,0.23636,0.922131,0.247688,0.949541,0.18016,s3://aegovan-data/hiddencut_sagemakerresults/h...,roberta-base,sst-2-contrast
1,50,,,,,0.490826,0.699888,s3://aegovan-data/hiddencut_sagemakerresults/h...,roberta-base,sst-2-contrast
2,100,,,,,0.490826,0.696311,s3://aegovan-data/hiddencut_sagemakerresults/h...,roberta-base,sst-2-contrast
3,150,,,,,0.490826,0.69269,s3://aegovan-data/hiddencut_sagemakerresults/h...,roberta-base,sst-2-contrast
4,200,,,,,0.509174,0.690357,s3://aegovan-data/hiddencut_sagemakerresults/h...,roberta-base,sst-2-contrast


In [9]:
# Keep only the rows that have a contrast-set accuracy, transposed so each
# such row is shown as a column.
df_sst[df_sst["constrastsetimdb_acc"].notna()].T

Unnamed: 0,0,109
step,2650,5260
constrastsetimdb_acc,0.907787,0.918033
constrastsetimdb_loss,0.23636,0.252119
constrastsetimdboriginal_acc,0.922131,0.915984
constrastsetimdboriginal_loss,0.247688,0.316355
eval_acc,0.949541,0.951835
eval_loss,0.18016,0.181109
output_uri,s3://aegovan-data/hiddencut_sagemakerresults/h...,s3://aegovan-data/hiddencut_sagemakerresults/h...
model,roberta-base,roberta-basehidden-cut
train_data,sst-2-contrast,sst-2-contrast


In [10]:
# Summary statistics of the `step` column across the SST results.
df_sst["step"].describe()

count     267.000000
mean     3368.449438
std      2060.446607
min        50.000000
25%      1650.000000
50%      3200.000000
75%      4825.000000
max      7700.000000
Name: step, dtype: float64

In [11]:
# Summary statistics of the `eval_acc` column across the SST results.
df_sst["eval_acc"].describe()

count    267.000000
mean       0.920854
std        0.088231
min        0.490826
25%        0.935206
50%        0.941514
75%        0.944954
max        0.951835
Name: eval_acc, dtype: float64

In [12]:
# Rows that have a Yelp polarity accuracy, restricted to the accuracy columns
# plus the model / data / step identifiers.
(
    df_imdb[df_imdb["yelppolarity_acc"].notna()]
    [["eval_acc", "yelppolarity_acc", "semeval4_acc", "amazonpolarity_acc", "model", "train_data", "step"]]
)

Unnamed: 0,eval_acc,yelppolarity_acc,semeval4_acc,amazonpolarity_acc,model,train_data,step
13,0.92,0.906808,0.796177,0.879455,roberta-basehidden-cut,imdb/imdb-5h/2,91
0,0.933333,0.92184,0.875597,0.910238,roberta-basehidden-cut,imdb/imdb-5h/2,77
13,0.92,0.928684,0.872137,0.910285,roberta-basehidden-cut,imdb/imdb-5h/2,91
20,0.906667,0.925499,0.808535,0.90807,roberta-basehidden-cut,imdb/imdb-5h/2,133
15,0.933333,0.927789,0.857802,0.916748,roberta-basehidden-cut,imdb/imdb-5h/2,100
13,0.92,0.918839,0.843632,0.895077,roberta-base,imdb/imdb-5h/2,91
0,0.933333,0.926552,0.888449,0.91782,roberta-base,imdb/imdb-5h/2,98
13,0.92,0.931896,0.885484,0.91875,roberta-base,imdb/imdb-5h/2,91
0,0.88,0.926499,0.86637,0.91109,roberta-base,imdb/imdb-5h/2,119
0,0.933333,0.916496,0.858296,0.905293,roberta-base,imdb/imdb-5h/2,105


In [14]:
# Mean, standard error and row count of each accuracy column per
# (train_data, model) group, using only rows with a Yelp polarity accuracy.
(
    df_imdb[df_imdb["yelppolarity_acc"].notna()]
    [["eval_acc", "yelppolarity_acc", "semeval4_acc", "amazonpolarity_acc", "model", "train_data"]]
    .groupby(["train_data", "model"])
    .agg(("mean", "sem", "count"))
)

Unnamed: 0_level_0,Unnamed: 1_level_0,eval_acc,eval_acc,eval_acc,yelppolarity_acc,yelppolarity_acc,yelppolarity_acc,semeval4_acc,semeval4_acc,semeval4_acc,amazonpolarity_acc,amazonpolarity_acc,amazonpolarity_acc
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,sem,count,mean,sem,count,mean,sem,count,mean,sem,count
train_data,model,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
imdb/imdb-2k/2,roberta-base,0.942667,0.004989,5,0.931154,0.00082,5,0.798023,0.013108,5,0.906235,0.00424,5
imdb/imdb-2k/2,roberta-basehidden-cut,0.945333,0.005925,5,0.930727,0.002539,5,0.792453,0.010771,5,0.905912,0.006415,5
imdb/imdb-5h/2,roberta-base,0.917333,0.009798,5,0.924056,0.002811,5,0.868446,0.008407,5,0.909606,0.004376,5
imdb/imdb-5h/2,roberta-basehidden-cut,0.922667,0.004989,5,0.922124,0.004007,5,0.84205,0.016593,5,0.904959,0.00654,5
