# Get Data Ready

In [1]:
#!pip install gdown

In [2]:
#https://drive.google.com/file/d/1hTp8qDF4r2tTk7W6SBnfYyRoNwwSdkap/view?usp=sharing
!gdown --id 1hTp8qDF4r2tTk7W6SBnfYyRoNwwSdkap
#https://drive.google.com/file/d/1T-yZGWyPC89SCCbm1m0DrRMGyDlUaonl/view?usp=sharing
!gdown --id 1T-yZGWyPC89SCCbm1m0DrRMGyDlUaonl

Downloading...
From: https://drive.google.com/uc?id=1hTp8qDF4r2tTk7W6SBnfYyRoNwwSdkap
To: /home/ec2-user/SageMaker/danmu_token_main.pkl
100%|███████████████████████████████████████| 5.38M/5.38M [00:00<00:00, 196MB/s]
Downloading...
From: https://drive.google.com/uc?id=1T-yZGWyPC89SCCbm1m0DrRMGyDlUaonl
To: /home/ec2-user/SageMaker/danmu_dist_main.pkl
100%|███████████████████████████████████████| 56.7M/56.7M [00:00<00:00, 244MB/s]


In [3]:
import pickle

token_file = open("danmu_token_main.pkl", "rb")
dist_file = open("danmu_dist_main.pkl", "rb")

tokens = pickle.load(token_file)
dists = pickle.load(dist_file)

print(len(tokens))
print(len(dists))

337392
337392


In [4]:
import numpy as np
labels = [np.argmax(i) for i in dists]

label_distribution = {}
for i in labels:
    if i in label_distribution:
        label_distribution[i] += 1
    else:
        label_distribution[i] = 1

In [5]:
len(label_distribution)

19

In [6]:
# remove label with no data
label_list = list(label_distribution.keys())
labels = [label_list.index(i) for i in labels]

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
df = pd.DataFrame({
    'text': tokens,
    'label': labels
})

train_df, test_df = train_test_split(df, test_size=0.2)
train_df, val_df = train_test_split(train_df, test_size=0.1)

In [8]:
#Debugging
train_df = train_df[train_df['text'] != 'nan']
test_df = test_df[test_df['text'] != 'nan']
val_df = val_df[val_df['text'] != 'nan']


In [9]:
train_df.reset_index(drop=True)
test_df.reset_index(drop=True)
val_df.reset_index(drop=True)

Unnamed: 0,text,label
0,握手会,2
1,瞎折腾,9
2,我用爷打过的,0
3,我dd我dd,0
4,混沌与秩序,2
...,...,...
26986,负二贷,6
26987,就离大谱,13
26988,一副骄傲模样,4
26989,姐妹们抢啊,14


In [8]:
#save to csv

train_df = train_df.dropna()
test_df = test_df.dropna()
val_df = val_df.dropna()

train_df.to_csv('data_csv/train.csv', index = False)
test_df.to_csv('data_csv/test.csv', index = False)
val_df.to_csv('data_csv/val.csv', index = False)

# train_dict = train_df.to_dict()
# test_dict = test_df.to_dict()
# val_dict = val_df.to_dict()
# train_data_json = {
#     'train' : train_dict,
#     'test' : test_dict,
#     'validation': val_dict
# }

# import json

# with open('data_csv/data.json', 'w') as f:
#     json.dump(train_data_json, f)
    

# SageMaker

In [10]:
import sagemaker

session = sagemaker.Session()

session_bucket = None
if session_bucket == None and session is not None:
    session_bucket = session.default_bucket()
    
role = sagemaker.get_execution_role()
session = sagemaker.Session(default_bucket = session_bucket)

print(f'role: {role}')
print(f'bucket: {session_bucket}')
print(f'session region: {session.boto_region_name}')

role: arn:aws:iam::635837196364:role/service-role/AmazonSageMaker-ExecutionRole-20220427T210117
bucket: sagemaker-us-east-1-635837196364
session region: us-east-1


Upload to S3 bucket

In [12]:
import botocore
from datasets.filesystems import S3FileSystem

s3 = S3FileSystem()
s3_prefix = 'dataset/danmu_main'

#save data to S3
train_input_path = f's3://{session.default_bucket()}/{s3_prefix}/train'
#train_dataset.save_to_disk(train_input_path, fs = s3)
session.upload_data(path='data_csv/train.csv', bucket=session_bucket, key_prefix=s3_prefix+'/train')

test_input_path = f's3://{session.default_bucket()}/{s3_prefix}/test'
#test_dataset.save_to_disk(test_input_path, fs = s3)
session.upload_data(path='data_csv/test.csv', bucket=session_bucket, key_prefix=s3_prefix+'/test')

val_input_path = f's3://{session.default_bucket()}/{s3_prefix}/val'
session.upload_data(path='data_csv/val.csv', bucket=session_bucket, key_prefix=s3_prefix+'/val')


's3://sagemaker-us-east-1-635837196364/dataset/danmu_main/val/val.csv'

In [13]:
import sagemaker
from sagemaker.huggingface import HuggingFace

# gets role for executing training job

hyperparameters = {
    'model_name_or_path':'uer/chinese_roberta_L-12_H-768',
    'output_dir':'/opt/ml/model/chinese_roberta',
    # add your remaining hyperparameters 
    # more info here https://github.com/huggingface/transformers/tree/v4.17.0/examples/pytorch/text-classification
    'max_seq_length':128,
    'per_device_train_batch_size' : 64,
    'learning_rate' : 2e-5,
    'num_train_epochs': 5,
    #train
    'do_train': True,
    'do_eval' : True,
    #data
    'train_file': '/opt/ml/input/data/train/train.csv',
    'test_file': '/opt/ml/input/data/train/test.csv',
    'validation_file': '/opt/ml/input/data/val/val.csv',
    #eval
    'evaluation_strategy':"steps", 
    'eval_steps' : 2000,
    'load_best_model_at_end':True,
    'save_steps':2000,
}


In [14]:
# git configuration to download our fine-tuning script
git_config = {'repo': 'https://github.com/huggingface/transformers.git','branch': 'v4.17.0'}

# creates Hugging Face estimator
huggingface_estimator = HuggingFace(
    #entry_point='run_glue_t5.py',
    entry_point='run_glue.py',
    #source_dir='./examples/pytorch/text-classification',
    source_dir = './scripts',
    instance_type='ml.g5.xlarge',
    instance_count=1,
    role=role,
    it_config=git_config,
    transformers_version='4.17.0',
    pytorch_version='1.10.2',
    py_version='py38',
    hyperparameters = hyperparameters
)


In [15]:

# starting the train job
huggingface_estimator.fit({'train': train_input_path, 'test': test_input_path, 'val':val_input_path})

2022-05-12 00:36:26 Starting - Starting the training job...
2022-05-12 00:36:42 Starting - Preparing the instances for trainingProfilerReport-1652315786: InProgress
.........
2022-05-12 00:38:27 Downloading - Downloading input data
2022-05-12 00:38:27 Training - Downloading the training image.....................
2022-05-12 00:41:48 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2022-05-12 00:41:50,649 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2022-05-12 00:41:50,668 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2022-05-12 00:41:50,676 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2022-05-12 00:41:51,231 sagemaker-training-toolkit INFO     Invoking user script[0m
[34mTraining Env:[0m

UnexpectedStatusException: Error for Training job huggingface-pytorch-training-2022-05-12-00-36-24-865: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
ExitCode 1
ErrorMessage "raise ValueError("--do_predict requires a test dataset")
 ValueError: --do_predict requires a test dataset"
Command "/opt/conda/bin/python3.8 run_glue.py --do_eval True --do_train True --eval_steps 2000 --evaluation_strategy steps --learning_rate 2e-05 --load_best_model_at_end True --max_seq_length 128 --model_name_or_path uer/chinese_roberta_L-12_H-768 --num_train_epochs 5 --output_dir /opt/ml/model/chinese_roberta --per_device_train_batch_size 64 --save_steps 2000 --test_file /opt/ml/input/data/train/test.csv --train_file /opt/ml/input/data/train/train.csv --validation_file /opt/ml/input/data/val/val.csv", exit code: 1