In [177]:
import os

import boto3
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [178]:
def load_data(file_path):
    """Read a CSV file and return its contents as a DataFrame.

    Args:
        file_path: Location of the CSV, passed straight to ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: The parsed table.
    """
    return pd.read_csv(file_path)

def convert_to_array(df, column):
    """Parse a column of bracketed, space-separated numbers into NumPy arrays.

    Every cell of ``df[column]`` (a string such as ``"[0.1 0.2 0.3]"``) is
    replaced by the array obtained from parsing its numbers. The frame is
    modified in place and nothing is returned.

    Args:
        df: DataFrame that owns the column; mutated in place.
        column: Name of the string column to convert.
    """
    def _parse_vector(text):
        # Remove the enclosing square brackets before parsing the numbers.
        return np.fromstring(text.strip("[]"), sep=' ')

    df[column] = df[column].apply(_parse_vector)

def split_columns(df, column, n_components=3):
    """Expand a sequence-valued column into one scalar column per component.

    For each index ``i`` in ``range(n_components)`` a new column named
    ``f"{column}_{i+1}"`` is added that holds element ``i`` of every cell.
    The source column itself is kept; ``df`` is modified in place and
    nothing is returned.

    Args:
        df: DataFrame to modify in place.
        column: Name of the column whose cells are indexable sequences
            (for example the arrays produced by ``convert_to_array``).
        n_components: How many leading elements to extract from each cell.
            Defaults to 3, the value that used to be hard-coded.

    Raises:
        IndexError: Propagated from indexing when a cell holds fewer than
            ``n_components`` elements.
    """
    for i in range(n_components):
        # Column suffixes are 1-based while element indices are 0-based.
        df[f'{column}_{i + 1}'] = df[column].apply(lambda cell: cell[i])

def process_data_head(df):
    """Flatten the head-tracking vector columns of ``df`` in place.

    Each of 'Head Position', 'Head Forward' and 'Head Up' is parsed with
    ``convert_to_array``, expanded into per-component columns with
    ``split_columns``, and then the original column is dropped.

    Args:
        df: DataFrame containing the three head columns; mutated in place.
    """
    head_vector_columns = ('Head Position', 'Head Forward', 'Head Up')
    for name in head_vector_columns:
        convert_to_array(df, name)
        split_columns(df, name)
        # The raw vector column is no longer needed once it has been expanded.
        df.drop(columns=[name], inplace=True)

def process_data_wrist(df):
    """Flatten the wrist-tracking vector columns of ``df`` in place.

    Each of 'Wrist position', 'Wrist orientation' and 'Wrist radius' is
    parsed with ``convert_to_array``, expanded into per-component columns
    with ``split_columns``, and then the original column is dropped.

    Args:
        df: DataFrame containing the three wrist columns; mutated in place.
    """
    wrist_vector_columns = ('Wrist position', 'Wrist orientation', 'Wrist radius')
    for name in wrist_vector_columns:
        convert_to_array(df, name)
        split_columns(df, name)
        # The raw vector column is no longer needed once it has been expanded.
        df.drop(columns=[name], inplace=True)

In [179]:
# One CSV per participant. A file's position in this list is the integer
# class label assigned to every sample it contains.
head_csv_paths = [
    'Head_data/head_data_Yang.csv',
    'Head_data/head_data_Gao.csv',
    'Head_data/Head_data_Li.csv',
    'Head_data/Head_data_u1_1.csv',
    'Head_data/Head_data_u2_1.csv',
    'Head_data/Head_data_u3_1.csv',
    'Head_data/Head_data_u4_1.csv',
    'Head_data/Head_data_u5_1.csv',
]

# Load every file first, then flatten the vector columns, then label.
head_frames = [load_data(csv_path) for csv_path in head_csv_paths]

for frame in head_frames:
    process_data_head(frame)

for class_id, frame in enumerate(head_frames):
    frame['label'] = class_id

# Keep the per-participant names bound so other cells can still use them.
(head_data_Yang, head_data_Gao, head_data_Li, head_data_u1,
 head_data_u2, head_data_u3, head_data_u4, head_data_u5) = head_frames

combined_data_head = pd.concat(head_frames, ignore_index=True)

In [180]:
# Hold out 20% of the combined head samples for testing; the fixed seed
# keeps the split reproducible between runs.
train_head, test_head = train_test_split(
    combined_data_head,
    test_size=0.2,
    random_state=42,
)

In [181]:
def _cast_features_and_label(frame):
    """Return a copy of ``frame`` with float32 features and an int label.

    Every column except the last is cast to float32; the last column (the
    label) is cast to int.
    """
    dtypes = {name: 'float32' for name in frame.columns[:-1]}
    dtypes[frame.columns[-1]] = int
    return frame.astype(dtypes)


# Build new frames with ``astype`` instead of assigning through
# ``.iloc[...] = ...``: the slice assignment writes into the existing columns
# (so, depending on the pandas version, the float32 dtype is not applied) and
# it raises SettingWithCopyWarning on the frames returned by train_test_split.
train_head = _cast_features_and_label(train_head)
test_head = _cast_features_and_label(test_head)

# Make sure the output directory exists before writing the CSVs.
os.makedirs('data_head', exist_ok=True)

# No header and no index: each row is the feature values followed by the label.
train_head.to_csv('data_head/train_head.csv', index=False, header=False)
test_head.to_csv('data_head/test_head.csv', index=False, header=False)

# Upload the data to S3 under the same relative keys.
s3 = boto3.client('s3')
bucket_name = 'pretraindata'
s3.upload_file('data_head/train_head.csv', bucket_name, 'data_head/train_head.csv')
s3.upload_file('data_head/test_head.csv', bucket_name, 'data_head/test_head.csv')

In [191]:
import sagemaker
from sagemaker.amazon.amazon_estimator import RecordSet
from sagemaker.session import Session

# Get the execution role
role = sagemaker.get_execution_role()

# Create the LinearLearner estimator
multiclass_estimator = sagemaker.LinearLearner(
    role=role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    predictor_type="multiclass_classifier",
    num_classes=8,  # make sure this matches the number of classes in the labels
)

# Wrap the data in a RecordSet object
session = Session()

# Read the training data.
# The CSV was written with header=False, so it has no header row. Without
# header=None pandas would consume the first sample as column names and
# silently drop it from the training set.
s3_train_path = 's3://pretraindata/data_head/train_head.csv'
train_data = pd.read_csv(s3_train_path, header=None)

# Every column except the last is a feature; the last column is the label.
# Both are converted to float32 before being handed to the estimator.
train_features = train_data.iloc[:, :-1].values.astype(np.float32)
train_labels = train_data.iloc[:, -1].values.astype(np.float32)

# Rebuild a frame from the converted arrays to check the resulting dtypes.
train_data_processed = pd.DataFrame(train_features)
train_data_processed['label'] = train_labels
print(train_data_processed.dtypes)

0        float32
1        float32
2        float32
3        float32
4        float32
5        float32
6        float32
7        float32
8        float32
label    float32
dtype: object


In [192]:

# Create the RecordSet object for the "train" channel
train_records = multiclass_estimator.record_set(
    train_features,
    labels=train_labels,
    channel="train",
)

# Start the training job
multiclass_estimator.fit(
    [train_records],
)

2024-05-31 07:37:07 Starting - Starting the training job...
2024-05-31 07:37:22 Starting - Preparing the instances for training...
2024-05-31 07:37:47 Downloading - Downloading input data...
2024-05-31 07:38:16 Downloading - Downloading the training image......
2024-05-31 07:39:27 Training - Training image download completed. Training in progress...[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[05/31/2024 07:39:50 INFO 140266414565184] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma': '0.01', 'init_bias': '0.0', 'optim

In [193]:
# 部署模型
multiclass_predictor = multiclass_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge"
)

-------!