# Regresión lineal con SageMaker

In [None]:
import pandas as pd
import sklearn

In [None]:
import sagemaker
import json
import boto3

# IAM role and SageMaker session this notebook runs under.
role = sagemaker.get_execution_role()
sess = sagemaker.Session()

# Region and default bucket both come from the session; every S3 object
# written by this notebook lives under `prefix` in that bucket.
region, bucket = sess.boto_region_name, sess.default_bucket()
prefix = 'module_4/part_1'

# Echo the resolved configuration, one value per line.
for setting in (role, sess, region, bucket, prefix):
    print(setting)

#### Preparación de los datos

- https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html
- *Many Amazon SageMaker algorithms support training with data in CSV format. To use data in CSV format for training, in the input data channel specification, specify text/csv as the ContentType. Amazon SageMaker requires that a CSV file does not have a header record and that the target variable is in the first column.*


In [None]:
from sklearn import datasets

# Feature column order of the Boston housing table (target is MEDV).
BOSTON_FEATURE_NAMES = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
]

try:
    # Only available in scikit-learn < 1.2.
    boston = datasets.load_boston()
except (ImportError, AttributeError):
    # `load_boston` was removed in scikit-learn 1.2 (accessing it raises
    # ImportError). Fall back to the original source, following the
    # replacement recipe given in scikit-learn's removal notice.
    from sklearn.utils import Bunch

    raw = pd.read_csv(
        'http://lib.stat.cmu.edu/datasets/boston',
        sep=r'\s+',
        skiprows=22,
        header=None,
    )
    # Each record spans two physical lines: 11 values on the first line,
    # then 2 more features followed by the target on the second.
    first_rows = raw.iloc[::2].reset_index(drop=True)
    second_rows = raw.iloc[1::2].reset_index(drop=True)
    boston = Bunch(
        data=pd.concat([first_rows, second_rows.iloc[:, :2]], axis=1).to_numpy(),
        target=second_rows.iloc[:, 2].to_numpy(),
        feature_names=BOSTON_FEATURE_NAMES,
    )

# Features as a DataFrame, target (median home value) as a named Series.
x = pd.DataFrame(data=boston['data'], columns=boston['feature_names'])
y = pd.Series(boston['target'], name='MEDV')

In [None]:
from sklearn.model_selection import train_test_split

# Two-stage split: hold out 20% for test, then take 25% of the remaining
# 80% for validation (i.e. roughly 60/20/20 overall).
TEST_FRACTION = 0.2
VALIDATION_FRACTION = 0.25
SPLIT_SEED = 1

x_rest, x_test, y_rest, y_test = train_test_split(
    x, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)
x_train, x_val, y_train, y_val = train_test_split(
    x_rest, y_rest, test_size=VALIDATION_FRACTION, random_state=SPLIT_SEED
)

In [None]:
# One line for the feature frames, one for the targets, each in
# train / test / validation order.
for splits in ((x_train, x_test, x_val), (y_train, y_test, y_val)):
    print(*(part.shape for part in splits))

In [None]:
# Put the target in the first column of every split, followed by the features.
train, validation, test = (
    pd.concat([target, features], axis=1)
    for target, features in (
        (y_train, x_train),
        (y_val, x_val),
        (y_test, x_test),
    )
)

In [None]:
# Write the training and validation sets as header-less, index-less CSV files.
for frame, filename in ((train, 'train.csv'), (validation, 'validation.csv')):
    frame.to_csv(filename, index=False, header=False)

In [None]:
# Upload the training CSV under <prefix>/data in the session bucket.
sess.upload_data(
    path='train.csv',
    bucket=bucket,
    key_prefix=f'{prefix}/data',
)

In [None]:
# Upload the validation CSV under <prefix>/data in the session bucket.
sess.upload_data(
    path='validation.csv',
    bucket=bucket,
    key_prefix=f'{prefix}/data',
)

#### Entrenamiento del modelo Linear Learner
- https://docs.aws.amazon.com/sagemaker/latest/dg/linear-learner.html

In [None]:
# Look up the linear-learner container image URI for the session's region.
image = sagemaker.image_uris.retrieve(
    framework="linear-learner",
    region=region,
)
print(image)

In [None]:
# S3 URIs of the two CSV files uploaded under <prefix>/data.
s3_train_data = f's3://{bucket}/{prefix}/data/train.csv'
s3_validation_data = f's3://{bucket}/{prefix}/data/validation.csv'

# Show both URIs, one per line.
print(s3_train_data, s3_validation_data, sep='\n')


In [None]:
# Both channels are CSV files, so they share the same content type.
csv_channel_options = dict(content_type="text/csv")

train_input = sagemaker.TrainingInput(s3_data=s3_train_data, **csv_channel_options)
validation_input = sagemaker.TrainingInput(s3_data=s3_validation_data, **csv_channel_options)

# Channel name -> input definition, as passed to the estimator's fit().
data_channels = dict(train=train_input, validation=validation_input)

In [None]:
# Output location passed to the estimator as `output_path`.
s3_output_location = f's3://{bucket}/{prefix}/output'

# Estimator configuration: one ml.c4.xlarge instance running the
# linear-learner image retrieved earlier.
estimator_settings = {
    'image_uri': image,
    'role': role,
    'instance_count': 1,
    'instance_type': "ml.c4.xlarge",
    'output_path': s3_output_location,
    'sagemaker_session': sess,
}
linear = sagemaker.estimator.Estimator(**estimator_settings)

#### Seleccionamos los hiperparámetros
- https://docs.aws.amazon.com/sagemaker/latest/dg/ll_hyperparameters.html

In [None]:
# Configure the estimator as a regressor with mini-batches of 50.
hyperparameters = {
    "predictor_type": "regressor",
    "mini_batch_size": 50,
}
linear.set_hyperparameters(**hyperparameters)

In [None]:
# Fit on the train/validation channels with logs=True.
linear.fit(data_channels, logs=True)