In [28]:
import numpy as np
import boto3
import sagemaker
import io
import sagemaker.amazon.common as smac
import os
import pandas as pd
sagemaker_session = sagemaker.Session()

In [33]:
dataset_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip'
s3_prefix = 'aws-machine-learning-specialty/algorithms/linear-lerner/regression'
s3_bucket = sagemaker_session.default_bucket()

### Descargamos los Dataset

Descargamos desde UCI el dataset de Bike Sharing

https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip

In [9]:
!wget -P . $dataset_url 
local_filename = dataset_url.split('/')[-1]
!unzip $local_filename  
!rm $local_filename

--2021-01-30 19:19:06--  https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 279992 (273K) [application/x-httpd-php]
Saving to: ‘./Bike-Sharing-Dataset.zip’


2021-01-30 19:19:07 (999 KB/s) - ‘./Bike-Sharing-Dataset.zip’ saved [279992/279992]

Archive:  Bike-Sharing-Dataset.zip
  inflating: Readme.txt              
  inflating: day.csv                 
  inflating: hour.csv                


In [12]:
!cat Readme.txt

Bike Sharing Dataset

Hadi Fanaee-T

Laboratory of Artificial Intelligence and Decision Support (LIAAD), University of Porto
INESC Porto, Campus da FEUP
Rua Dr. Roberto Frias, 378
4200 - 465 Porto, Portugal


Background 

Bike sharing systems are new generation of traditional bike rentals where whole process from membership, rental and return 
back has become automatic. Through these systems, user is able to easily rent a bike from a particular position and return 
back at another position. Currently, there are about over 500 bike-sharing programs around the world which is composed of 
over 500 thousands bicycles. Today, there exists great interest in these systems due to their important role in traffic, 
environmental and health issues. 

Apart from interesting real world applications of bike sharing systems, the characteristics of data being generated by
these systems make them attractive for the research. Opposed to other transport services such as bus or subway, the duration
of tra

In [13]:
!head day.csv

instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600
6,2011-01-06,1,0,1,0,4,1,1,0.204348,0.233209,0.518261,0.0895652,88,1518,1606
7,2011-01-07,1,0,1,0,5,1,2,0.196522,0.208839,0.498696,0.168726,148,1362,1510
8,2011-01-08,1,0,1,0,6,0,2,0.165,0.162254,0.535833,0.266804,68,891,959
9,2011-01-09,1,0,1,0,0,0,1,0.138333,0.116175,0.434167,0.36195,54,768,822


### Convertir en Pandas Dataframe y Preparar

In [15]:
dataset = pd.read_csv('day.csv')
dataset.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


Un poco de feature Engineering

In [16]:
dataset['dteday'] = dataset['dteday'].str.replace("-","")
dataset.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,20110101,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,20110102,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,20110103,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,20110104,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,20110105,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


Randomize y Split

In [17]:
train_data, test_data = np.split(dataset.sample(frac=1, random_state=1729), [int(0.7 * len(dataset))])
print(train_data.shape, test_data.shape)

(511, 16) (220, 16)


Separar las features y Labels

In [26]:
feature_dataset = train_data[['instant', 'dteday', 'season', 'yr', 'mnth', 'holiday', 'weekday',
                           'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'casual', 'registered' ]]
features = np.array(feature_dataset.values).astype('float32')

label_dataset= train_data[['cnt']]
labels = np.array(label_dataset.values).astype('float32')
labels_vec = np.squeeze(np.asarray(labels))      #Remueve los array de 1d y deja los valores.

Prepara Protobuf IO

In [35]:
buffer = io.BytesIO()
smac.write_numpy_to_dense_tensor(buffer, features, labels_vec)
buffer.seek(0)

boto3.resource('s3').Bucket(s3_bucket).Object(os.path.join(s3_prefix, 'train')).upload_fileobj(buffer)
s3_training_data_location = 's3://{}/{}/train'.format(s3_bucket, s3_prefix)
print('training dataset will be uploaded to: {}'.format(s3_training_data_location))

training dataset will be uploaded to: s3://sagemaker-us-east-1-844626608976/aws-machine-learning-specialty/algorithms/linear-lerner/regression/train


In [36]:
output_location = 's3://{}/{}/output'.format(s3_bucket, s3_prefix)
print('model artifacts will be uploaded to: {}'.format(output_location))

model artifacts will be uploaded to: s3://sagemaker-us-east-1-844626608976/aws-machine-learning-specialty/algorithms/linear-lerner/regression/output


### Obtiene training image

In [40]:
linear_container = sagemaker.image_uris.retrieve('linear-learner', boto3.Session().region_name)
linear_container

'382416733822.dkr.ecr.us-east-1.amazonaws.com/linear-learner:1'

### Entrena el modelo

In [55]:
role = sagemaker.get_execution_role()

# Role, Instance type / count, output location, session
linear = sagemaker.estimator.Estimator(linear_container,
                                       role=role, 
                                       instance_count=1, 
                                       instance_type='ml.c4.xlarge',
                                       output_path=output_location,
                                       sagemaker_session=sagemaker_session)


# Provide the number of features identified during data preparation
# Provide the predictor_type 

linear.set_hyperparameters(feature_dim=len(feature_dataset.columns) , #15
                           mini_batch_size=4,
                           predictor_type='regressor')

#entrena el modelo usando la data de training
linear.fit({'train': s3_training_data_location})

2021-01-30 21:07:07 Starting - Starting the training job...
2021-01-30 21:07:30 Starting - Launching requested ML instancesProfilerReport-1612040827: InProgress
......
2021-01-30 21:08:31 Starting - Preparing the instances for training.........
2021-01-30 21:10:08 Downloading - Downloading input data
2021-01-30 21:10:08 Training - Downloading the training image...
2021-01-30 21:10:32 Training - Training image download completed. Training in progress.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[01/30/2021 21:10:33 INFO 140403057502016] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_sca

***(en caso de no tener el objeto Estimator, podemos ir al training job y crearlo desde ahí)***

In [60]:
job_name = 'linear-learner-2021-01-30-21-07-07-520'
linear_from_job = sagemaker.estimator.Estimator.attach(job_name,sagemaker_session=sagemaker_session)
linear, linear_from_job


2021-01-30 21:11:33 Starting - Preparing the instances for training
2021-01-30 21:11:33 Downloading - Downloading input data
2021-01-30 21:11:33 Training - Training image download completed. Training in progress.
2021-01-30 21:11:33 Uploading - Uploading generated training model
2021-01-30 21:11:33 Completed - Training job completed


(<sagemaker.estimator.Estimator at 0x7f79f3c73cf8>,
 <sagemaker.estimator.Estimator at 0x7f79ef349c18>)

### Realizando predicciones

In [61]:
# Deploy the model
linear_predictor = linear_from_job.deploy(initial_instance_count=1,instance_type='ml.c4.xlarge',endpoint_name='bikeshare-sagemaker-regression-v1')

-----------------!

***(en caso de no tener el objeto Predictor, podemos ir al endpoint name y crearlo desde ahí)***

In [132]:
from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import CSVSerializer

linear_predictor_from_endpoint  =  sagemaker.predictor.Predictor(
    'bikeshare-sagemaker-regression-v1',
    sagemaker_session=sagemaker_session,
    serializer= CSVSerializer(),
    deserializer=JSONDeserializer()
)
test_feature_dataset = test_data[['instant', 'dteday', 'season', 'yr', 'mnth', 'holiday', 'weekday',
                           'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'casual', 'registered']]

test_actuals = np.array(test_data['cnt'].astype('float32'))
test_features = np.array(test_feature_dataset.values).astype('float32').tolist()

predictions = []
actuals = []
for tf, actual in zip(test_features, test_actuals):
    #print(",".join(map(str, tf)))
    prediction = linear_predictor_from_endpoint.predict(data=tf, initial_args= {'Accept':'text/json'})
    predictions.append(prediction['predictions'][0]['score'])
    actuals.append(actual)
    

predictions_df = pd.DataFrame({'Prediction': predictions, 'Actual': actuals})

In [133]:
predictions_df['delta'] = predictions_df['Prediction']-predictions_df['Actual']

In [135]:
predictions_df.describe()

Unnamed: 0,Prediction,Actual,delta
count,220.0,220.0,220.0
mean,4454.957031,4455.009091,-0.05206
std,1833.343223,1833.758388,1.290251
min,706.390625,705.0,-2.828125
25%,3192.625,3193.0,-0.96875
50%,4387.257812,4387.5,-0.085938
75%,5677.617188,5675.5,0.828125
max,8394.765625,8395.0,4.296875


In [137]:
# Get accuracy using Cosine Similarity method 
from numpy import dot
from numpy.linalg import norm
tolerance = 1e-10
accuracy = (dot(actuals, predictions)/(norm(actuals)*norm(predictions))) * 100
print('accuracy: ', accuracy)

accuracy:  100.0000031367257


Esto significa que el modelo es muy exacto (obvio no puede ser mayor a 100% pero es muy acertado)

### Borrar el Endpoint y Modelo

In [139]:
linear_predictor_from_endpoint.delete_endpoint()
linear_predictor_from_endpoint.delete_model()