In [1]:
import pandas as pd
import numpy as np
import sagemaker
from sagemaker.estimator import Estimator
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer
from sagemaker.sklearn.model import SKLearnModel
from sagemaker.model_monitor import DataCaptureConfig, ModelQualityMonitor, DatasetFormat, EndpointInput
import matplotlib.pyplot as plt
import joblib
import boto3

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [2]:
# Low-level S3 client; in the visible cells it is only referenced by the commented-out S3 loader.
s3_client = boto3.client('s3')

# Resource-style S3 handle; likewise only referenced by the commented-out S3 loader.
s3 = boto3.resource('s3')

In [3]:
# Disabled alternative data source: read every CSV object in the
# 'dolar-clau-final' bucket and concatenate them into `ts`.
# Kept for reference only; the following cell loads a local "dolar.csv" instead.
#import io

#bucket = s3.Bucket('dolar-clau-final')

# Load from the S3 bucket
#df_array = []

#for file in bucket.objects.filter():
#    if file.key.endswith('.csv'):
#        s3_response = s3_client.get_object(Bucket='dolar-clau-final', Key=file.key)
#        df_array.append(pd.read_csv(io.BytesIO(s3_response['Body'].read()), index_col=0, dayfirst=True, parse_dates=True))
#ts = pd.concat(df_array, axis=0)

#ts = ts.resample('d').last().dropna().sort_values(by=['fecha'])
#ts

In [4]:
# Load the local price history (semicolon-separated, day-first dates in the
# first column), order it by the 'fecha' index and keep only October 2023
# onward: a smaller dataset is used because of SageMaker limitations.
ts = (
    pd.read_csv("dolar.csv", delimiter=";", index_col=0, dayfirst=True, parse_dates=True)
    .sort_values(by=['fecha'])
    .loc["2023-10":]
    .copy()
)
ts

Unnamed: 0_level_0,precio
fecha,Unnamed: 1_level_1
2023-10-01,4053.76
2023-10-02,4053.76
2023-10-03,4141.43
2023-10-04,4187.01
2023-10-05,4252.09
2023-10-06,4359.4
2023-10-07,4386.66
2023-10-08,4386.66
2023-10-09,4386.66
2023-10-10,4386.66


In [5]:
# Build the supervised-learning frame from the price series: the target is the
# current price ("t-0") and the features are the two previous observations
# ("t-1", "t-2").
# The lags are taken from the "precio" Series explicitly. Assigning the whole
# shifted `ts` DataFrame to a single column only works while `ts` has exactly
# one column.
precio = ts["precio"]
df = pd.DataFrame(index=ts.index)
df["t-0"] = precio
df["t-1"] = precio.shift(1)
df["t-2"] = precio.shift(2)
# The first two rows have an incomplete lag history (NaN), so they are dropped.
df = df.dropna()

In [6]:
# Display the lagged frame (columns t-0, t-1, t-2) for inspection.
df

Unnamed: 0_level_0,t-0,t-1,t-2
fecha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-10-03,4141.43,4053.76,4053.76
2023-10-04,4187.01,4141.43,4053.76
2023-10-05,4252.09,4187.01,4141.43
2023-10-06,4359.4,4252.09,4187.01
2023-10-07,4386.66,4359.4,4252.09
2023-10-08,4386.66,4386.66,4359.4
2023-10-09,4386.66,4386.66,4386.66
2023-10-10,4386.66,4386.66,4386.66
2023-10-11,4231.97,4386.66,4386.66
2023-10-12,4212.5,4231.97,4386.66


In [7]:
# Target is the current price; every remaining column is a lag feature.
y = df['t-0']
X = df.drop(columns=['t-0'])

In [8]:
# Chronological 70/30 split of features and target: shuffle=False keeps the rows in time order.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False) # Without shuffling the data

In [9]:
# Same unshuffled 70/30 split applied to the whole frame (target and lag columns together);
# these two frames are the ones exported to CSV.
train, test = train_test_split(df, test_size=0.3, shuffle=False)

# Save as CSV

In [10]:
# Write the training rows to train.csv; index=False leaves the date index out of the file.
train.to_csv("train.csv", index=False)

In [11]:
# Write the held-out rows to test.csv; index=False leaves the date index out of the file.
test.to_csv("test.csv", index=False)

# Sagemaker session

In [12]:
# SageMaker session; later cells use it for the default bucket and pass it to the model monitor.
session = sagemaker.session.Session()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [13]:
# Upload train.csv under the s3://datasets-clau/dolar prefix and keep the value returned by the uploader.
s3_train_dataset_path = sagemaker.s3.S3Uploader.upload("train.csv", f"s3://datasets-clau/dolar")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [14]:
# Upload test.csv under the same s3://datasets-clau/dolar prefix and keep the value returned by the uploader.
s3_test_dataset_path = sagemaker.s3.S3Uploader.upload("test.csv", f"s3://datasets-clau/dolar")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


# Modelo para predecir el precio del dólar

In [15]:
# k-nearest-neighbours regressor (k=10, weights='distance') fitted on the chronological training split.
# NOTE: despite the name `clf`, this estimator is a regressor, not a classifier.
clf = KNeighborsRegressor(n_neighbors=10, weights='distance')
clf.fit(X_train, y_train)

### Probar modelo:

In [16]:
# Hand-made single observation (lag features t-1 and t-2) indexed by its date,
# used to smoke-test the fitted model.
prueba = (
    pd.DataFrame({'fecha': ['2023-10-23'], 't-1': [4230.0], 't-2': [4238.9707]})
    .assign(fecha=lambda frame: pd.to_datetime(frame['fecha'], format='%Y-%m-%d'))
    .set_index('fecha')
    .sort_index()
)
prueba

Unnamed: 0_level_0,t-1,t-2
fecha,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-10-23,4230.0,4238.9707


In [17]:
# Predict the price for the hand-made sample row and display the resulting array.
valor_predict = clf.predict(prueba)
valor_predict

array([4258.02973428])

# Empaquetar el modelo

In [18]:
# Serialize the fitted estimator; entrypoint.py's model_fn loads this same file name.
joblib.dump(clf, 'model.joblib')

['model.joblib']

In [19]:
# Package the serialized model as the gzipped tarball that is uploaded to S3 afterwards.
!tar -czvf model.tar.gz model.joblib

model.joblib


# Upload model to s3

In [20]:
# Default S3 bucket of the SageMaker session; used as the root of the model, baseline and capture paths.
session_bucket = session.default_bucket()

In [21]:
# `region` is not referenced by any other visible cell.
region = 'us-east-1'
# Role identifier passed to ModelQualityMonitor and SKLearnModel.
role = 'LabRole'

In [22]:
# Upload the model tarball under the session bucket and display the resulting S3 URI,
# which is later passed to SKLearnModel as `model_data`.
model_path = sagemaker.s3.S3Uploader.upload(
    "model.tar.gz",
    f"s3://{session_bucket}/precio_dolar_model"
)
model_path

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


's3://sagemaker-us-east-1-373452244141/precio_dolar_model/model.tar.gz'

# Create entrypoint

In [23]:
%%writefile entrypoint.py
import joblib
import os
import numpy as np

def model_fn(model_dir):
    """Load and return the estimator stored as ``model.joblib`` inside ``model_dir``."""
    artifact_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(artifact_path)

def predict_fn(input_object, model):
    """Run ``model.predict`` on the request payload and return its result.

    A one-dimensional input is treated as a single sample and promoted to a
    one-row batch before prediction; any other input is passed through as-is.
    """
    batch = (
        np.expand_dims(input_object, axis=0)
        if np.ndim(input_object) == 1
        else input_object
    )
    return model.predict(batch)

Overwriting entrypoint.py


In [24]:
# Smoke test of model_fn: with an empty model_dir the joined path is just "model.joblib" in the working directory.
!python -c "from entrypoint import model_fn; print(model_fn(''));"

KNeighborsRegressor(n_neighbors=10, weights='distance')


In [25]:
# Smoke test of predict_fn in a separate Python process, using a one-row DataFrame of lag features.
!python -c "from entrypoint import predict_fn; import pandas as pd; \
            import joblib; \
            model = joblib.load('model.joblib'); \
            print(predict_fn(pd.DataFrame.from_dict({'t-1': [4230.0], 't-2': [4238.9707]}), model));"

[4258.02973428]


In [26]:
# Same predict_fn smoke test, but with the sample row indexed by its date like the notebook's `prueba` frame.
!python -c "from entrypoint import predict_fn; import pandas as pd; \
            import joblib; \
            model = joblib.load('model.joblib'); \
            prueba = pd.DataFrame({'fecha':['2023-10-23'], 't-1': [4230.0], 't-2': [4238.9707]}); \
            prueba['fecha'] = pd.to_datetime(prueba['fecha'], format='%Y-%m-%d'); \
            prueba = prueba.set_index('fecha'); \
            prueba = prueba.sort_index(); \
            print(predict_fn(prueba, model));"

[4258.02973428]


# Create Baseline

In [55]:
# Reload the serialized estimator from disk (rebinds `clf`).
clf = joblib.load("model.joblib")

In [56]:
# Read back the held-out split that was exported to test.csv without its date index.
df_val = pd.read_csv("test.csv")

In [57]:
# Validation features: every column except the target "t-0".
# The target is excluded by name rather than by position, so the cell does not
# depend on the column order of test.csv and agrees with how the label is
# selected (df_val['t-0']).
X_val = df_val.drop(columns=['t-0'])
X_val

Unnamed: 0,t-1,t-2
0,4249.0,4249.0
1,4222.09,4249.0
2,4227.39,4222.09
3,4249.71,4227.39
4,4238.85,4249.71
5,4238.85,4238.85
6,4238.85,4238.85


In [58]:
# Ground-truth prices for the validation rows.
y_val = df_val['t-0']
y_val

0    4222.09
1    4227.39
2    4249.71
3    4238.85
4    4238.85
5    4238.85
6    4221.39
Name: t-0, dtype: float64

In [59]:
# Model predictions for the validation features.
y_val_predict = clf.predict(X_val)

In [60]:
# Rebinds `df` (previously the lag-feature frame) to a fresh empty DataFrame for the baseline dataset.
df = pd.DataFrame()

In [61]:
# Assemble the baseline dataset: a constant "probability" column, the model's
# prediction and the true label.
# The frame is built in one constructor call. Assigning the scalar 1.0 to a
# still-empty frame creates a zero-row column, which becomes NaN once the
# row-bearing columns are added (unless the cell happens to be executed twice).
df = pd.DataFrame(
    {"probability": 1.0, "prediction": y_val_predict, "label": y_val}
)

In [63]:
# Display the baseline frame (probability, prediction, label) for inspection.
df

Unnamed: 0,probability,prediction,label
0,1.0,4249.0,4222.09
1,1.0,4258.398289,4227.39
2,1.0,4255.620841,4249.71
3,1.0,4256.6953,4238.85
4,1.0,4257.084598,4238.85
5,1.0,4257.986476,4238.85
6,1.0,4257.986476,4221.39


In [64]:
# Export the baseline frame with a header row and without the index.
df.to_csv("validation_with_prediction.csv", index=False)

In [65]:
# Upload the baseline CSV under the session bucket; the result is passed to suggest_baseline as `baseline_dataset`.
validation_dataset_uri = sagemaker.s3.S3Uploader.upload(
    "validation_with_prediction.csv",
    f"s3://{session_bucket}/dolar/validation_dataset"
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [66]:
# Display the uploaded baseline dataset location.
validation_dataset_uri

's3://sagemaker-us-east-1-373452244141/dolar/validation_dataset/validation_with_prediction.csv'

In [39]:
# Model-quality monitor configured with one ml.m5.xlarge instance, a 20 GB volume
# and a 1800-second maximum runtime.
# NOTE(review): the `churn_` prefix looks inherited from another example; the model
# monitored here predicts a price. Renaming would require touching every cell that uses it.
churn_model_quality_monitor = ModelQualityMonitor(
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    volume_size_in_gb=20,
    max_runtime_in_seconds=1800,
    sagemaker_session=session,
)

In [40]:
# Compute the baseline statistics and constraints from the validation
# predictions and write them under the session bucket.
# The monitored model is a KNeighborsRegressor that outputs a continuous
# price, so the baseline is computed as a regression problem. A classification
# problem type would treat each distinct price as a class label. Regression
# only needs the predicted value and the ground-truth column, so no
# probability attribute is passed.
job = churn_model_quality_monitor.suggest_baseline(
    baseline_dataset=validation_dataset_uri,
    dataset_format=DatasetFormat.csv(header=True),
    output_s3_uri=f"s3://{session_bucket}/dolar/baseline",
    problem_type="Regression",
    inference_attribute="prediction",
    ground_truth_attribute="label",
)
# Block until the baseline processing job finishes, without streaming its logs.
job.wait(logs=False)

INFO:sagemaker:Creating processing-job with name baseline-suggestion-job-2023-10-24-20-55-14-497


..........................................................................!

# Create DataCapture Object

In [41]:
# S3 prefix that the data-capture configuration uses as its destination.
s3_capture_upload_path = f"s3://{session_bucket}/dolar/data_capture"

In [42]:
# Data-capture settings with capture enabled, a sampling percentage of 100 and the S3 prefix defined above;
# passed to model.deploy.
data_capture_config = DataCaptureConfig(
    enable_capture=True, sampling_percentage=100, destination_s3_uri=s3_capture_upload_path
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [43]:
# Show where captured endpoint traffic will be written.
print(s3_capture_upload_path)

s3://sagemaker-us-east-1-373452244141/dolar/data_capture


In [44]:
# SageMaker scikit-learn model built from the uploaded tarball, with entrypoint.py
# (model_fn / predict_fn) as the inference script.
model = SKLearnModel(
    model_data=model_path,
    entry_point='entrypoint.py',
    framework_version='1.2-1',
    role=role
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [45]:
# Deploy the model to one ml.m5.xlarge instance with CSV request/response handling and
# the data-capture configuration attached. The returned predictor is used for test
# requests and for the monitoring schedule's endpoint name.
predictor = model.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge',
    serializer=CSVSerializer(),
    deserializer=CSVDeserializer(),
    data_capture_config=data_capture_config
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


INFO:sagemaker:Creating model with name: sagemaker-scikit-learn-2023-10-24-21-01-33-713
INFO:sagemaker:Creating endpoint-config with name sagemaker-scikit-learn-2023-10-24-21-01-34-433
INFO:sagemaker:Creating endpoint with name sagemaker-scikit-learn-2023-10-24-21-01-34-433


----!

In [46]:
# NOTE(review): a CSVDeserializer was already passed to model.deploy, so this reassignment looks redundant.
predictor.deserializer = sagemaker.deserializers.CSVDeserializer()

In [47]:
# Rebuild the single-row sample (lag features t-1 and t-2, indexed by date)
# that is sent to the deployed endpoint.
prueba = (
    pd.DataFrame({'fecha': ['2023-10-23'], 't-1': [4230.0], 't-2': [4238.9707]})
    .assign(fecha=lambda frame: pd.to_datetime(frame['fecha'], format='%Y-%m-%d'))
    .set_index('fecha')
    .sort_index()
)
prueba

Unnamed: 0_level_0,t-1,t-2
fecha,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-10-23,4230.0,4238.9707


In [48]:
# Send the sample row to the live endpoint; the CSV response comes back as a nested list of strings.
predictor.predict(prueba)

[['4258.029734321028']]

In [67]:
# Describe the endpoint's captured output for the model-quality monitoring job.
# The endpoint answers with a single CSV column holding the predicted price,
# so that column (index "0") is declared as the inference attribute.
# Probability attributes and thresholds only apply to classifiers and are not
# meaningful for this regressor.
endpointInput = EndpointInput(
    endpoint_name=predictor.endpoint_name,
    inference_attribute="0",
    destination="/opt/ml/processing/input_data",
)

In [69]:
from sagemaker.model_monitor import CronExpressionGenerator

# Schedule an hourly model-quality job that merges captured endpoint
# predictions with the ground truth uploaded under dolar/ground_truth/ and
# evaluates them against the baseline constraints.
# The monitored model is a regressor, so the problem type is "Regression".
# The schedule's problem type has to match the one used to produce the
# baseline constraints file.
response = churn_model_quality_monitor.create_monitoring_schedule(
    endpoint_input=endpointInput,
    output_s3_uri=f"s3://{session_bucket}/dolar/results",
    problem_type="Regression",
    ground_truth_input=f"s3://{session_bucket}/dolar/ground_truth/",
    constraints=f"s3://{session_bucket}/dolar/baseline/constraints.json",
    schedule_cron_expression=CronExpressionGenerator.hourly(),
    enable_cloudwatch_metrics=True,
)

INFO:sagemaker.model_monitor.model_monitoring:Creating Monitoring Schedule with name: monitoring-schedule-2023-10-24-21-18-42-197
