# Get Data Ready

In [1]:
#!pip install gdown

In [2]:
# Download the two pickled inputs from Google Drive with the gdown CLI
# (requires gdown to be installed; see the commented-out pip cell above).
# NOTE(review): newer gdown releases may have deprecated the `--id` flag —
# confirm against the installed version.
# Token pickle (recorded output shows it saved as danmu_token_main.pkl):
#https://drive.google.com/file/d/1hTp8qDF4r2tTk7W6SBnfYyRoNwwSdkap/view?usp=sharing
!gdown --id 1hTp8qDF4r2tTk7W6SBnfYyRoNwwSdkap
# Distribution pickle (recorded output shows it saved as danmu_dist_main.pkl):
#https://drive.google.com/file/d/1T-yZGWyPC89SCCbm1m0DrRMGyDlUaonl/view?usp=sharing
!gdown --id 1T-yZGWyPC89SCCbm1m0DrRMGyDlUaonl

Downloading...
From: https://drive.google.com/uc?id=1hTp8qDF4r2tTk7W6SBnfYyRoNwwSdkap
To: /home/ec2-user/SageMaker/danmu_token_main.pkl
100%|███████████████████████████████████████| 5.38M/5.38M [00:00<00:00, 143MB/s]
Downloading...
From: https://drive.google.com/uc?id=1T-yZGWyPC89SCCbm1m0DrRMGyDlUaonl
To: /home/ec2-user/SageMaker/danmu_dist_main.pkl
100%|███████████████████████████████████████| 56.7M/56.7M [00:00<00:00, 252MB/s]


In [3]:
import pickle

# Load the two pickled inputs. Context managers guarantee the file handles are
# closed even if unpickling raises (the handles were previously left open).
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load these files if the Google Drive source is trusted.
with open("danmu_token_main.pkl", "rb") as token_file:
    tokens = pickle.load(token_file)
with open("danmu_dist_main.pkl", "rb") as dist_file:
    dists = pickle.load(dist_file)

# tokens[i] and dists[i] are paired later when the DataFrame is built, so a
# length mismatch means the inputs are out of sync.
assert len(tokens) == len(dists), "token and distribution files differ in length"

print(len(tokens))
print(len(dists))

337392
337392


In [4]:
import numpy as np
# Hard label per sample: the argmax position of each entry in `dists`.
labels = list(map(np.argmax, dists))

# Count how many samples received each hard label (keys appear in first-seen order).
label_distribution = {}
for label in labels:
    label_distribution[label] = label_distribution.get(label, 0) + 1

In [5]:
len(label_distribution)

19

In [6]:
# Remap the labels that actually occur to contiguous ids 0..K-1 so that labels
# with no data do not take up a class slot. Ids follow the first-seen order of
# the keys in `label_distribution`.
label_list = list(label_distribution.keys())
label_to_id = {label: idx for idx, label in enumerate(label_list)}
# Derive the ids from `dists` instead of rewriting `labels` from its own
# previous value: the self-referential form re-mapped already-remapped ids when
# the cell was run a second time. The dict lookup also replaces a linear
# `list.index` scan per sample.
labels = [label_to_id[np.argmax(dist)] for dist in dists]

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Pair every text with its class id.
df = pd.DataFrame({
    'text': tokens,
    'label': labels
})

# Fixed seed so the train/val/test membership (and the CSVs exported from it)
# is identical across kernel restarts; without it every run reshuffles the data.
SPLIT_SEED = 42

# Hold out 20% for test, then take 10% of the remainder for validation.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=SPLIT_SEED)

In [8]:
# Drop rows whose text is the literal string 'nan' from every split.
def _without_nan_text(frame):
    """Return the rows of `frame` whose 'text' value is not the string 'nan'."""
    keep = frame['text'].ne('nan')
    return frame[keep]


train_df = _without_nan_text(train_df)
test_df = _without_nan_text(test_df)
val_df = _without_nan_text(val_df)


In [9]:
# reset_index returns a new frame, so the result has to be assigned back;
# calling it as a bare expression left the shuffled indexes in place.
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

# Show the validation split as the cell output.
val_df

Unnamed: 0,text,label
0,自吸换,13
1,再见了朋友,0
2,哈哈蔡依林,3
3,一波盲,9
4,这中单,2
...,...,...
26987,逍遥散,2
26988,50g冲浪,12
26989,发条茉莉,4
26990,glasgow,6


In [10]:
#save to csv

import os

# Remove rows with missing values before exporting.
train_df = train_df.dropna()
test_df = test_df.dropna()
val_df = val_df.dropna()

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing (a fresh checkout has no data_csv/).
os.makedirs('data_csv', exist_ok=True)

train_df.to_csv('data_csv/train.csv', index = False)
test_df.to_csv('data_csv/test.csv', index = False)
val_df.to_csv('data_csv/val.csv', index = False)

# train_dict = train_df.to_dict()
# test_dict = test_df.to_dict()
# val_dict = val_df.to_dict()
# train_data_json = {
#     'train' : train_dict,
#     'test' : test_dict,
#     'validation': val_dict
# }

# import json

# with open('data_csv/data.json', 'w') as f:
#     json.dump(train_data_json, f)
    

# Preprocessing

In [11]:
import datasets
# Wrap each split in a `datasets.Dataset`; preserve_index=False keeps the
# pandas index out of the resulting dataset.
train, val, test = (
    datasets.Dataset.from_pandas(frame, preserve_index=False)
    for frame in (train_df, val_df, test_df)
)

In [12]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

def preprocess_function(examples):
    """Tokenize the "text" field of `examples` with the notebook-level tokenizer.

    Truncation is enabled; no padding is requested here.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [13]:
#Tokenization

# Apply the tokenizer to every split in batched mode.
tokenized_train, tokenized_val, tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (train, val, test)
)

  0%|          | 0/243 [00:00<?, ?ba/s]

  0%|          | 0/27 [00:00<?, ?ba/s]

  0%|          | 0/68 [00:00<?, ?ba/s]

In [14]:
# For batching and padding
import transformers 

# Collator built from the same tokenizer used for preprocessing; it is passed to
# the Trainer below to assemble batches (padding behaviour is whatever
# DataCollatorWithPadding does by default).
data_collator = transformers.DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
#!conda install pytorch torchvision torchaudio cpuonly -c pytorch

# Model

In [16]:
# model = transformers.AutoModelForSequenceClassification.from_pretrained("uer/roberta-base-finetuned-jd-full-chinese",\
#                                                                         num_labels = len(label_distribution), \
#                                                                        ignore_mismatched_sizes=True)
# One output class per label that actually occurs in the data.
num_classes = len(label_distribution)

# Sequence-classification model initialised from the bert-base-chinese checkpoint.
model = transformers.AutoModelForSequenceClassification.from_pretrained(
    "bert-base-chinese",
    num_labels=num_classes,
)

# Only the output directory is set; every other training argument keeps its default.
training_args = transformers.TrainingArguments(output_dir="chinese_base")

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [17]:
#metrics

import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Score predictions with the notebook-level `metric`.

    `eval_pred` unpacks into (logits, reference labels); the predicted class is
    the argmax over the last axis of the logits.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted, references=references)


In [18]:
# Local (non-SageMaker) trainer wired to the tokenized train/validation splits.
trainer = transformers.Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_train,
    eval_dataset = tokenized_val,
    data_collator=data_collator,
    tokenizer = tokenizer,
    # The accuracy function defined in the metrics cell was never passed in, so
    # evaluation would have reported loss only.
    compute_metrics = compute_metrics,
)

In [19]:
#trainer.train()

# SageMaker

In [1]:
import sagemaker

session = sagemaker.Session()

# Leave as None to use the session's default bucket, or set an explicit bucket name.
session_bucket = None
# `is None` is the correct identity check; the former `session is not None`
# guard was always true because the session is constructed just above.
if session_bucket is None:
    session_bucket = session.default_bucket()

role = sagemaker.get_execution_role()
# Recreate the session so it is bound to the chosen bucket.
session = sagemaker.Session(default_bucket = session_bucket)

print(f'role: {role}')
print(f'bucket: {session_bucket}')
print(f'session region: {session.boto_region_name}')

role: arn:aws:iam::635837196364:role/service-role/AmazonSageMaker-ExecutionRole-20220511T210233
bucket: sagemaker-us-east-1-635837196364
session region: us-east-1


Upload to S3 bucket

In [2]:
import botocore
from datasets.filesystems import S3FileSystem

# Only needed for the alternative `Dataset.save_to_disk(..., fs=s3)` upload path,
# which is not used below.
s3 = S3FileSystem()
s3_prefix = 'dataset/danmu_main'


def _channel_uri(split):
    """Return the S3 prefix under which the CSV for `split` is uploaded."""
    return f's3://{session.default_bucket()}/{s3_prefix}/{split}'


# Each split is uploaded under its own prefix; the *_input_path values are the
# prefixes later handed to the estimator's fit() call.
train_input_path = _channel_uri('train')
session.upload_data(path='data_csv/train.csv', bucket=session_bucket, key_prefix=f'{s3_prefix}/train')

test_input_path = _channel_uri('test')
session.upload_data(path='data_csv/test.csv', bucket=session_bucket, key_prefix=f'{s3_prefix}/test')

val_input_path = _channel_uri('val')
# Last expression of the cell: the returned S3 URI of the validation file is displayed.
session.upload_data(path='data_csv/val.csv', bucket=session_bucket, key_prefix=f'{s3_prefix}/val')


's3://sagemaker-us-east-1-635837196364/dataset/danmu_main/val/val.csv'

In [4]:
import sagemaker
from sagemaker.huggingface import HuggingFace

# gets role for executing training job

# Hyperparameters passed to the training entry point (run_glue.py) via the
# HuggingFace estimator below.
hyperparameters = {
    # Base checkpoint to fine-tune.
    'model_name_or_path':'bert-base-chinese',
    # NOTE(review): the model is written to a sub-directory of /opt/ml/model;
    # confirm the deployed endpoint can locate the model files there.
    'output_dir':'/opt/ml/model/bert_base_chinese',
    # add your remaining hyperparameters 
    # more info here https://github.com/huggingface/transformers/tree/v4.17.0/examples/pytorch/text-classification
    'max_seq_length':128,
    'per_device_train_batch_size' : 24,
    'learning_rate' : 2e-5,
    'num_train_epochs': 5,
    #train
    'do_train': True,
    # Evaluation is currently disabled even though a validation file is supplied.
    #'do_eval' : True,
    #data
    # Presumably the container paths where SageMaker places the 'train' and
    # 'val' channels passed to fit() — TODO confirm.
    'train_file': '/opt/ml/input/data/train/train.csv',
    #'test_file': '/opt/ml/input/data/train/test.csv',
    'validation_file': '/opt/ml/input/data/val/val.csv',
}

In [6]:
# Git configuration for downloading the fine-tuning script from the transformers
# repository. Currently unused: the estimator's git_config argument is commented
# out below and the script is taken from the local ./scripts directory instead.
git_config = {'repo': 'https://github.com/huggingface/transformers.git','branch': 'v4.17.0'}

# Creates the Hugging Face estimator that runs run_glue.py on one ml.g5.xlarge
# instance with the hyperparameters defined above.
huggingface_estimator = HuggingFace(
    #entry_point='run_glue_t5.py',
    entry_point='run_glue.py',
    #source_dir='./examples/pytorch/text-classification',
    source_dir = './scripts',
    instance_type='ml.g5.xlarge',
    instance_count=1,
    # `role` comes from the SageMaker session setup cell.
    role=role,
    #git_config=git_config,
    transformers_version='4.17.0',
    pytorch_version='1.10.2',
    py_version='py38',
    hyperparameters = hyperparameters
)


In [None]:

# Start the training job with three input channels (train / test / val), each
# pointing at the S3 prefix uploaded earlier.
# NOTE(review): the 'test' channel is passed here, but 'test_file' is commented
# out in the hyperparameters, so the script may never read it — confirm.
huggingface_estimator.fit({'train': train_input_path, 'test': test_input_path, 'val':val_input_path})

2022-05-12 02:25:21 Starting - Starting the training job...
2022-05-12 02:25:44 Starting - Preparing the instances for trainingProfilerReport-1652322320: InProgress
......
2022-05-12 02:26:50 Downloading - Downloading input data...
2022-05-12 02:27:10 Training - Downloading the training image...........................
2022-05-12 02:31:46 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2022-05-12 02:31:39,620 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2022-05-12 02:31:39,640 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2022-05-12 02:31:39,646 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2022-05-12 02:31:40,191 sagemaker-training-toolkit INFO     Invoking user script[0m
[34mTraining En

In [9]:
# Re-attach to an existing training job if needed:
import sagemaker
import boto3

# Set to True to reconnect to a previously started training job instead of
# launching a new one.
need_attach = False
session = sagemaker.Session()
if need_attach:
    # Imported here so the re-attach path also works in a fresh kernel where the
    # estimator cells have not been run.
    from sagemaker.huggingface import HuggingFace

    # Attach through the HuggingFace estimator class so the restored object is
    # the same kind of estimator that launched the job.
    output = HuggingFace.attach(
        'huggingface-pytorch-training-2022-05-12-02-25-20-722',
        sagemaker_session = session,
    )
    # The Eval cell deploys from `huggingface_estimator`; the attached estimator
    # was previously only printed, leaving that name undefined after a restart.
    huggingface_estimator = output
    print(output)

<sagemaker.session.Session at 0x7f532b294048>

# Eval

In [None]:
import transformers
from sagemaker.huggingface import HuggingFace
from sagemaker.huggingface.model import HuggingFaceModel

# Choose where the deployed model comes from:
#   True  -> rebuild a model from a model.tar.gz already stored in S3
#   False -> deploy the estimator from the training cell in this notebook
#            (requires `huggingface_estimator` to exist in the kernel)
import_from_s3 = False

if import_from_s3:
    # To restore an old training job
    # NOTE(review): the job name in model_data (…-00-16-12-039) differs from the
    # job referenced in the attach cell (…-02-25-20-722) — confirm which
    # artifact is intended.
    # create Hugging Face Model Class
    huggingface_model = HuggingFaceModel(
        model_data="s3://sagemaker-us-east-1-635837196364/huggingface-pytorch-training-2022-05-12-00-16-12-039/output/model.tar.gz",  # path to your trained SageMaker model
        role=role,                                            # IAM role with permissions to create an endpoint (defined in the SageMaker setup cell)
        transformers_version='4.17.0',                        # Transformers version used
        pytorch_version='1.10.2',                             # PyTorch version used
        py_version='py38',                                # Python version used
        # NOTE(review): HF_MODEL_ID names a different checkpoint than the
        # 'bert-base-chinese' base used for training; confirm the endpoint
        # serves the weights in model_data rather than this Hub model.
        env = {
            'HF_MODEL_ID':'uer/chinese_roberta_L-12_H-768',
            'HF_TASK': 'text-classification',

        },
    )
    # deploy model to SageMaker Inference
    predictor = huggingface_model.deploy(
       initial_instance_count=1,
       instance_type="ml.m5.4xlarge"
    )
else:
    # Deploy the freshly trained (or re-attached) estimator on one ml.m5.4xlarge instance.
    predictor = huggingface_estimator.deploy(1, "ml.m5.4xlarge")

In [None]:
# Request payload for the endpoint. The original `["input": ...]` was a syntax
# error (a list cannot hold key/value pairs) and shadowed the builtin `input`.
# The Hugging Face inference container expects the text under the "inputs" key.
payload = {"inputs": '你好呀'}

predictor.predict(payload)

In [None]:
#predictor.delete_endpoint()