# Regresión Lineal con SageMaker

In [30]:
import pandas as pd
import sklearn

In [4]:
import sagemaker
import json
import boto3

# Resolve the SageMaker context shared by the rest of the notebook.
role = sagemaker.get_execution_role()  # later handed to the Estimator
sess = sagemaker.Session()
region = sess.boto_region_name

# Artifacts live in the session's default bucket under a fixed key prefix.
bucket = sess.default_bucket()
prefix = 'module_4/part_1'

# Echo every resolved setting on its own line as a quick sanity check.
print(role, sess, region, bucket, prefix, sep='\n')

arn:aws:iam::467432373215:role/service-role/AmazonSageMaker-ExecutionRole-20221206T164397
<sagemaker.session.Session object at 0x7fa850146990>
eu-west-1
sagemaker-eu-west-1-467432373215
module_4/part_1


#### Preparación de los datos

- https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html
- *Many Amazon SageMaker algorithms support training with data in CSV format. To use data in CSV format for training, in the input data channel specification, specify text/csv as the ContentType. Amazon SageMaker requires that a CSV file does not have a header record and that the target variable is in the first column.*


In [29]:
# `import sklearn` on its own does not load the `datasets` submodule, so on a
# fresh kernel `sklearn.datasets` may not exist yet. Import it explicitly here
# so this cell does not depend on another cell having imported it as a side effect.
import sklearn.datasets

# NOTE: `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2;
# this cell requires scikit-learn < 1.2.
boston = sklearn.datasets.load_boston()

# Features as a DataFrame with the dataset's own column names; the target as a
# Series named 'MEDV'.
x = pd.DataFrame(data=boston['data'], columns=boston['feature_names'])
y = pd.Series(boston['target'], name = 'MEDV')

In [14]:
from sklearn.model_selection import train_test_split

# Step 1: hold out 20% of the rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

# Step 2: take 25% of the remaining rows as the validation set
# (0.25 of the remaining 80% is 20% of the full dataset).
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.25,
    random_state=1,
)

In [16]:
# Feature-matrix shapes on the first line, target shapes on the second,
# each in train / test / validation order.
print(*(features.shape for features in (x_train, x_test, x_val)))
print(*(target.shape for target in (y_train, y_test, y_val)))

(303, 13) (102, 13) (101, 13)
(303,) (102,) (101,)


In [18]:
# Build one frame per split with the target as the first column, followed by
# the features (the CSV training format quoted above wants the label first).
train, validation, test = (
    pd.concat([target, features], axis="columns")
    for target, features in (
        (y_train, x_train),
        (y_val, x_val),
        (y_test, x_test),
    )
)

In [19]:
# Write the train and validation splits as bare CSV files:
# no index column and no header row.
for split_frame, csv_name in ((train, 'train.csv'), (validation, 'validation.csv')):
    split_frame.to_csv(csv_name, header=False, index=False)

In [20]:
# Upload the training CSV under <prefix>/data; the call's return value is the
# cell output.
sess.upload_data(
    path='train.csv',
    bucket=bucket,
    key_prefix=f'{prefix}/data',
)

's3://sagemaker-eu-west-1-467432373215/module_4/part_1/data/train.csv'

In [21]:
# Upload the validation CSV under <prefix>/data; the call's return value is
# the cell output.
sess.upload_data(
    path='validation.csv',
    bucket=bucket,
    key_prefix=f'{prefix}/data',
)

's3://sagemaker-eu-west-1-467432373215/module_4/part_1/data/validation.csv'

#### Entrenamiento del modelo Linear Learner
- https://docs.aws.amazon.com/sagemaker/latest/dg/linear-learner.html

In [22]:
# Look up the linear-learner container image URI for the session's region.
image = sagemaker.image_uris.retrieve(
    framework="linear-learner",
    region=region,
)
print(image)

438346466558.dkr.ecr.eu-west-1.amazonaws.com/linear-learner:1


In [23]:
# S3 URIs of the two CSV files under <prefix>/data.
s3_train_data = f's3://{bucket}/{prefix}/data/train.csv'
s3_validation_data = f's3://{bucket}/{prefix}/data/validation.csv'

# Show both URIs, one per line.
print(s3_train_data, s3_validation_data, sep='\n')


s3://sagemaker-eu-west-1-467432373215/module_4/part_1/data/train.csv
s3://sagemaker-eu-west-1-467432373215/module_4/part_1/data/validation.csv


In [24]:
# Describe each S3 file as a CSV training input.
train_input = sagemaker.TrainingInput(s3_data=s3_train_data, content_type="text/csv")
validation_input = sagemaker.TrainingInput(s3_data=s3_validation_data, content_type="text/csv")

# Channel name -> input; this mapping is what gets passed to fit().
data_channels = dict(train=train_input, validation=validation_input)

In [26]:
# S3 prefix handed to the estimator as its output path.
s3_output_location = f's3://{bucket}/{prefix}/output'

# Generic Estimator around the linear-learner image, configured for a single
# ml.c4.xlarge instance.
linear = sagemaker.estimator.Estimator(
    image_uri=image,
    role=role,
    sagemaker_session=sess,
    output_path=s3_output_location,
    instance_type="ml.c4.xlarge",
    instance_count=1,
)

#### Seleccionamos los hiperparámetros
- https://docs.aws.amazon.com/sagemaker/latest/dg/ll_hyperparameters.html

In [27]:
# Configure the algorithm as a regressor trained with mini-batches of 50 rows.
linear.set_hyperparameters(
    predictor_type="regressor",
    mini_batch_size=50,
)

In [28]:
# Start training on the train/validation channels with log output enabled.
linear.fit(inputs=data_channels, logs=True)

2022-12-12 10:31:08 Starting - Starting the training job...
2022-12-12 10:31:34 Starting - Preparing the instances for trainingProfilerReport-1670841068: InProgress
...............
2022-12-12 10:33:57 Downloading - Downloading input data...
2022-12-12 10:34:33 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[12/12/2022 10:34:31 INFO 139718813923136] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma': '0.01', 'init_bias': '0.0', 'optimizer': 'auto', 'loss':

In [None]:
s3 = boto3.client("s3")

# Download the breast-cancer CSV from the sample-files bucket to the working
# directory, then load it; header=None because the columns are named in a
# later cell.
filename = "wdbc.csv"
s3.download_file(
    Bucket="sagemaker-sample-files",
    Key="datasets/tabular/breast_cancer/wdbc.csv",
    Filename=filename,
)
data = pd.read_csv(filename, header=None)

In [None]:
# Column names extracted from wdbc.names: an id, the diagnosis label, then the
# same ten measurements repeated with the _mean, _se and _worst suffixes
# (all *_mean columns first, then *_se, then *_worst).
data.columns = ["id", "diagnosis"] + [
    f"{measurement}_{suffix}"
    for suffix in ("mean", "se", "worst")
    for measurement in (
        "radius",
        "texture",
        "perimeter",
        "area",
        "smoothness",
        "compactness",
        "concavity",
        "concave points",
        "symmetry",
        "fractal_dimension",
    )
]

# Save the data, now with a header row of the names above.
data.to_csv("data.csv", index=False, sep=",")

# Print the shape of the data.
print(data.shape)

# Show, in order: the top few rows, the numeric summary, and the class counts
# of the categorical `diagnosis` field.
display(
    data.head(),
    data.describe(),
    data["diagnosis"].value_counts(),
)