In [2]:
import numpy as np
import pandas as pd
import boto3
import sagemaker
import logging

from sagemaker import RandomCutForest
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer
from dataclasses import dataclass

# Configure logging: set the root logger to INFO so this notebook's progress
# messages are shown. Because the root level is what changes, INFO records
# from other libraries (e.g. the `sagemaker` loggers) are emitted as well.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by the model wrapper and main() below.
logger = logging.getLogger(__name__)


# AWS and model configuration
@dataclass
class AWSConfig:
    """Settings consumed by ``RandomCutForestModel`` for training and hosting.

    Every field has a default, so ``AWSConfig()`` gives the configuration that
    ``main()`` runs with. The S3 training-data location is built as
    ``s3://{bucket}/{prefix}`` and the model output path as
    ``s3://{bucket}/{prefix}/output``.
    """
    # S3 bucket used for both the training data location and the output path.
    bucket: str = 'nequi-data'
    # Key prefix inside the bucket under which data and output are stored.
    prefix: str = 'sandbox_co/jupadill/sagemaker/rcf/contugas'
    # Instance type used for the training job and for the deployed endpoint.
    instance_type: str = 'ml.c5.xlarge'
    # Instance count used for the training job and as the endpoint's
    # initial instance count.
    instance_count: int = 1
    # Passed to the RandomCutForest estimator as `num_samples_per_tree`.
    num_samples_per_tree: int = 512
    # Passed to the RandomCutForest estimator as `num_trees`.
    num_trees: int = 200


class ModelError(Exception):
    """Error raised by ``RandomCutForestModel``.

    Signals that the SageMaker session could not be initialised or that the
    endpoint was used before it was deployed.
    """


# Utility to shingle time-series data
def shingle(data: np.ndarray, shingle_size: int) -> np.ndarray:
    """Flatten overlapping windows of consecutive rows into one row each.

    Row ``i`` of the result is ``data[i:i + shingle_size]`` flattened in C
    order: all features of row ``i``, then all features of row ``i + 1``, and
    so on.

    Only ``num_samples - shingle_size`` windows are produced; the last
    possible window (the one ending on the final row of ``data``) is left
    out. ``prepare_data`` relies on that row count when it pairs the result
    with ``subset.iloc[shingle_size:]``.

    Args:
        data: 2-D array-like of shape ``(num_samples, num_features)``.
        shingle_size: Number of consecutive rows per window; must be >= 1.

    Returns:
        A newly allocated ``float64`` array of shape
        ``(num_samples - shingle_size, shingle_size * num_features)``.

    Raises:
        ValueError: If ``data`` is not 2-D, if ``shingle_size`` is smaller
            than 1, or if ``num_samples <= shingle_size``.
    """
    data = np.asarray(data)
    if data.ndim != 2:
        raise ValueError("Data must be a 2-D array of shape (num_samples, num_features)")
    if shingle_size < 1:
        raise ValueError("Shingle size must be a positive integer")
    num_samples, num_features = data.shape
    if num_samples <= shingle_size:
        raise ValueError("Data length must be greater than shingle size")

    num_windows = num_samples - shingle_size
    # sliding_window_view returns a read-only view of shape
    # (num_samples - shingle_size + 1, num_features, shingle_size); drop the
    # trailing window and move the time axis in front of the feature axis so
    # each window flattens row by row.
    windows = np.lib.stride_tricks.sliding_window_view(data, shingle_size, axis=0)
    windows = windows[:num_windows].transpose(0, 2, 1)

    # Copy into a fresh float64 buffer so the result never aliases `data`
    # and is writable, as with an array created by np.zeros.
    shingled = np.empty((num_windows, shingle_size, num_features))
    shingled[...] = windows
    return shingled.reshape(num_windows, shingle_size * num_features)


# RCF model wrapper
class RandomCutForestModel:
    """Wrapper around the SageMaker ``RandomCutForest`` estimator.

    Intended call order: ``train`` -> ``deploy`` -> ``predict`` -> ``cleanup``.

    Attributes:
        config: The ``AWSConfig`` supplying S3 locations, instance settings
            and RCF hyperparameters.
        session: ``sagemaker.Session`` created at construction time (it is
            not passed to the estimator by this class).
        execution_role: Role returned by ``sagemaker.get_execution_role()``.
        model: The ``RandomCutForest`` estimator, or ``None`` until
            ``initialize_model`` / ``train`` has run.
        endpoint: The predictor returned by ``deploy``, or ``None`` when no
            endpoint is currently deployed.
    """

    def __init__(self, aws_config: AWSConfig):
        """Create the SageMaker session and resolve the execution role.

        Args:
            aws_config: Configuration used for training and hosting.

        Raises:
            ModelError: If the session or the execution role cannot be
                obtained; the original exception is chained as the cause.
        """
        self.config = aws_config
        try:
            self.session = sagemaker.Session()
            self.execution_role = sagemaker.get_execution_role()
        except Exception as e:
            logger.error(f"Failed to init SageMaker session: {e}")
            # Carry the message on the ModelError itself so callers that
            # only log str(exc) still see what went wrong.
            raise ModelError(f"Failed to init SageMaker session: {e}") from e
        self.model = None
        self.endpoint = None

    def initialize_model(self):
        """Build the ``RandomCutForest`` estimator from ``self.config``."""
        self.model = RandomCutForest(
            role=self.execution_role,
            instance_count=self.config.instance_count,
            instance_type=self.config.instance_type,
            num_samples_per_tree=self.config.num_samples_per_tree,
            num_trees=self.config.num_trees,
            data_location=f"s3://{self.config.bucket}/{self.config.prefix}",
            output_path=f"s3://{self.config.bucket}/{self.config.prefix}/output"
        )

    def train(self, data: np.ndarray):
        """Fit the estimator on ``data``, creating the estimator if needed.

        Args:
            data: Training matrix handed to ``record_set`` before fitting.
        """
        if self.model is None:
            self.initialize_model()
        logger.info("Training model...")
        self.model.fit(self.model.record_set(data))

    def deploy(self):
        """Deploy the estimator to an endpoint with CSV in / JSON out.

        The endpoint uses the same instance type and count as training.

        Raises:
            ModelError: If no estimator exists yet (``train`` was not called).
        """
        if self.model is None:
            # Without this guard the call below fails with an AttributeError
            # on None, which hides the actual mistake.
            raise ModelError("Model not trained")
        logger.info("Deploying endpoint...")
        self.endpoint = self.model.deploy(
            initial_instance_count=self.config.instance_count,
            instance_type=self.config.instance_type
        )
        self.endpoint.serializer = CSVSerializer()
        self.endpoint.deserializer = JSONDeserializer()

    def predict(self, data: np.ndarray, batch_size: int = 100) -> list:
        """Score ``data`` against the deployed endpoint in batches.

        Each batch is sent as newline-separated CSV text, and the ``score``
        of every entry in the response's ``scores`` list is collected.

        Args:
            data: Rows to score; each row becomes one CSV line.
            batch_size: Maximum number of rows per request; must be >= 1.

        Returns:
            One score per input row, in input order.

        Raises:
            ModelError: If no endpoint is deployed.
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if not self.endpoint:
            raise ModelError("Endpoint not deployed")
        if batch_size < 1:
            # A negative step would make range() empty and silently return
            # no scores at all.
            raise ValueError("batch_size must be a positive integer")
        scores = []
        for i in range(0, len(data), batch_size):
            batch = data[i:i+batch_size]
            payload = "\n".join(",".join(map(str, row)) for row in batch)
            resp = self.endpoint.predict(payload)
            scores.extend([item['score'] for item in resp['scores']])
        return scores

    def cleanup(self):
        """Delete the deployed endpoint, if any, and forget it.

        Calling this when nothing is deployed, or a second time, is a no-op.
        Only ``delete_endpoint()`` is called; nothing else created by
        ``deploy`` is removed here.
        """
        if self.endpoint:
            logger.info("Deleting endpoint...")
            try:
                self.endpoint.delete_endpoint()
            finally:
                # Drop the reference even if deletion raised, so later
                # predict()/cleanup() calls do not target a stale endpoint.
                self.endpoint = None


# Prepare shingled data for any cluster
def prepare_data(
    df: pd.DataFrame,
    shingle_size: int,
    cluster: int,
    feature_cols: "list[str] | None" = None,
) -> tuple[pd.DataFrame, np.ndarray]:
    """Select one cluster's rows and build its shingled feature matrix.

    Rows are used in the order they appear in ``df``; no sorting or further
    grouping is applied.
    NOTE(review): if a cluster mixes several independent series, windows will
    span the boundaries between them - confirm against the input data.

    Args:
        df: Frame with a ``Cluster`` column and the feature columns.
        shingle_size: Number of consecutive rows per window.
        cluster: Value of ``Cluster`` to select.
        feature_cols: Names of the feature columns. When ``None`` (default)
            the columns at positions 2, 3 and 4 are used.

    Returns:
        ``(aligned, shingled)`` where ``shingled`` is the output of
        ``shingle`` and ``aligned`` is a copy of the cluster's rows from
        position ``shingle_size`` onward, so both have the same length.
        Row ``j`` of ``shingled`` is built from cluster rows
        ``j .. j + shingle_size - 1`` and is paired with cluster row
        ``j + shingle_size``.
        NOTE(review): each window is therefore paired with the row right
        after it, not with the window's own last row - confirm this is the
        intended alignment.

    Raises:
        ValueError: If the cluster has no rows, or (from ``shingle``) if it
            has no more than ``shingle_size`` rows.
    """
    # Boolean-mask selection already yields a new frame; the explicit copy
    # is taken once, on the slice that is returned to the caller.
    subset = df[df['Cluster'] == cluster]
    if subset.empty:
        raise ValueError(f"No rows found for cluster {cluster}")

    if feature_cols is None:
        features = subset.iloc[:, 2:5].to_numpy()
    else:
        features = subset[list(feature_cols)].to_numpy()

    shingled = shingle(features, shingle_size)
    aligned = subset.iloc[shingle_size:].copy()
    return aligned, shingled


# Main processing
def main(
    input_path: str = "./contugas_clean_extra.csv",
    output_path: str = "anomaly_scores_all_clusters.csv",
    shingle_size: int = 24,
):
    """Train, deploy and score one Random Cut Forest model per cluster.

    For every distinct value of the ``Cluster`` column (in sorted order) the
    cluster's rows are shingled, a model is trained on them, an endpoint is
    deployed, the same shingled rows are scored, and the endpoint is deleted.
    The per-cluster frames, each with a ``score`` column added, are
    concatenated and written to ``output_path`` without the index.

    Args:
        input_path: CSV file to read; must contain a ``Cluster`` column.
        output_path: CSV file to write the scored rows to.
        shingle_size: Window length passed to ``prepare_data``.

    Raises:
        ValueError: If the input contains no clusters to process.
    """
    data = pd.read_csv(input_path)
    aws_config = AWSConfig()
    results = []

    for cluster_id in sorted(data['Cluster'].unique()):
        logger.info(f"Processing Cluster {cluster_id}")
        aligned_df, shingled_data = prepare_data(data, shingle_size=shingle_size, cluster=cluster_id)

        model = RandomCutForestModel(aws_config)
        try:
            model.train(shingled_data)
            model.deploy()
            scores = model.predict(shingled_data)
        finally:
            # Always tear the endpoint down, even when deploy/predict fails,
            # so a failed run does not leave a running endpoint behind.
            # cleanup() does nothing if no endpoint was created.
            model.cleanup()

        aligned_df['score'] = scores
        results.append(aligned_df)

    if not results:
        raise ValueError(f"No clusters found in {input_path}")

    # Combine all clusters
    all_scores_df = pd.concat(results)
    all_scores_df.to_csv(output_path, index=False)
    logger.info(f"Exported results to {output_path}")


# Run the whole pipeline when this code executes as the main program. The
# logger name in the output below is "__main__", so the guard is true here.
if __name__ == "__main__":
    main()

INFO:__main__:Processing Cluster 0
INFO:__main__:Training model...
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: randomcutforest-2025-05-20-12-00-51-037


2025-05-20 12:00:56 Starting - Starting the training job...
2025-05-20 12:01:11 Starting - Preparing the instances for training...
2025-05-20 12:01:56 Downloading - Downloading the training image.........
2025-05-20 12:03:22 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  if num_device is 1 and 'dist' not in kvstore:[0m
  if cons['type'] is 'ineq':[0m
  if len(self.X_min) is not 0:[0m
[34m[05/20/2025 12:03:29 INFO 140669341112128] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-conf.json: {'num_samples_per_tree': 256, 'num_trees': 100, 'force_dense': 'true', 'eval_metrics': ['accuracy', 'precision_recall_fscore'], 'epochs': 1, 'mini_batch_size': 1000, '_log_level': 'info', '_kvstore': 'dist_async', '_num_kv_servers': 'auto', '_num_gpus': 'auto', '_tuning_objective_metric': '', '_ftp_port': 89

INFO:__main__:Deploying endpoint...
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating model with name: randomcutforest-2025-05-20-12-04-38-592


Training seconds: 155
Billable seconds: 155


INFO:sagemaker:Creating endpoint-config with name randomcutforest-2025-05-20-12-04-38-592
INFO:sagemaker:Creating endpoint with name randomcutforest-2025-05-20-12-04-38-592


---------!

INFO:__main__:Deleting endpoint...
INFO:sagemaker:Deleting endpoint configuration with name: randomcutforest-2025-05-20-12-04-38-592
INFO:sagemaker:Deleting endpoint with name: randomcutforest-2025-05-20-12-04-38-592
INFO:__main__:Processing Cluster 1
INFO:__main__:Training model...
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: randomcutforest-2025-05-20-12-12-01-333


2025-05-20 12:12:05 Starting - Starting the training job...
2025-05-20 12:12:19 Starting - Preparing the instances for training...
2025-05-20 12:12:54 Downloading - Downloading the training image.........
2025-05-20 12:14:20 Training - Training image download completed. Training in progress.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  if num_device is 1 and 'dist' not in kvstore:[0m
  if cons['type'] is 'ineq':[0m
  if len(self.X_min) is not 0:[0m
[34m[05/20/2025 12:14:27 INFO 140268098164544] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-conf.json: {'num_samples_per_tree': 256, 'num_trees': 100, 'force_dense': 'true', 'eval_metrics': ['accuracy', 'precision_recall_fscore'], 'epochs': 1, 'mini_batch_size': 1000, '_log_level': 'info', '_kvstore': 'dist_async', '_num_kv_servers': 'auto', '_num_gpus': 'auto', '_tuning_objective_metric': '', '_ftp_port': 899

INFO:__main__:Deploying endpoint...
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating model with name: randomcutforest-2025-05-20-12-15-18-288


Training seconds: 140
Billable seconds: 140


INFO:sagemaker:Creating endpoint-config with name randomcutforest-2025-05-20-12-15-18-288
INFO:sagemaker:Creating endpoint with name randomcutforest-2025-05-20-12-15-18-288


---------!

INFO:__main__:Deleting endpoint...
INFO:sagemaker:Deleting endpoint configuration with name: randomcutforest-2025-05-20-12-15-18-288
INFO:sagemaker:Deleting endpoint with name: randomcutforest-2025-05-20-12-15-18-288
INFO:__main__:Processing Cluster 2
INFO:__main__:Training model...
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: randomcutforest-2025-05-20-12-23-34-373


2025-05-20 12:23:36 Starting - Starting the training job...
2025-05-20 12:23:51 Starting - Preparing the instances for training...
2025-05-20 12:24:32 Downloading - Downloading the training image............
2025-05-20 12:26:34 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  if num_device is 1 and 'dist' not in kvstore:[0m
  if cons['type'] is 'ineq':[0m
  if len(self.X_min) is not 0:[0m
[34m[05/20/2025 12:26:40 INFO 139661893678912] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-conf.json: {'num_samples_per_tree': 256, 'num_trees': 100, 'force_dense': 'true', 'eval_metrics': ['accuracy', 'precision_recall_fscore'], 'epochs': 1, 'mini_batch_size': 1000, '_log_level': 'info', '_kvstore': 'dist_async', '_num_kv_servers': 'auto', '_num_gpus': 'auto', '_tuning_objective_metric': '', '_ftp_port':

INFO:__main__:Deploying endpoint...
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating model with name: randomcutforest-2025-05-20-12-27-51-838


Training seconds: 181
Billable seconds: 181


INFO:sagemaker:Creating endpoint-config with name randomcutforest-2025-05-20-12-27-51-838
INFO:sagemaker:Creating endpoint with name randomcutforest-2025-05-20-12-27-51-838


-----------!

INFO:__main__:Deleting endpoint...
INFO:sagemaker:Deleting endpoint configuration with name: randomcutforest-2025-05-20-12-27-51-838
INFO:sagemaker:Deleting endpoint with name: randomcutforest-2025-05-20-12-27-51-838
INFO:__main__:Processing Cluster 3
INFO:__main__:Training model...
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: randomcutforest-2025-05-20-12-39-02-398


2025-05-20 12:39:05 Starting - Starting the training job...
2025-05-20 12:39:20 Starting - Preparing the instances for training...
2025-05-20 12:39:59 Downloading - Downloading the training image............
2025-05-20 12:41:51 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  if num_device is 1 and 'dist' not in kvstore:[0m
  if cons['type'] is 'ineq':[0m
  if len(self.X_min) is not 0:[0m
[34m[05/20/2025 12:42:01 INFO 140014273103680] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-conf.json: {'num_samples_per_tree': 256, 'num_trees': 100, 'force_dense': 'true', 'eval_metrics': ['accuracy', 'precision_recall_fscore'], 'epochs': 1, 'mini_batch_size': 1000, '_log_level': 'info', '_kvstore': 'dist_async', '_num_kv_servers': 'auto', '_num_gpus': 'auto', '_tuning_objective_metric': '', '_ftp_port':

INFO:__main__:Deploying endpoint...
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating model with name: randomcutforest-2025-05-20-12-42-49-601


Training seconds: 165
Billable seconds: 165


INFO:sagemaker:Creating endpoint-config with name randomcutforest-2025-05-20-12-42-49-601
INFO:sagemaker:Creating endpoint with name randomcutforest-2025-05-20-12-42-49-601


----------!

INFO:__main__:Deleting endpoint...
INFO:sagemaker:Deleting endpoint configuration with name: randomcutforest-2025-05-20-12-42-49-601
INFO:sagemaker:Deleting endpoint with name: randomcutforest-2025-05-20-12-42-49-601
INFO:__main__:Processing Cluster 4
INFO:__main__:Training model...
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: randomcutforest-2025-05-20-12-49-17-256


2025-05-20 12:49:17 Starting - Starting the training job...
2025-05-20 12:49:39 Starting - Preparing the instances for training...
2025-05-20 12:50:21 Downloading - Downloading the training image.........
2025-05-20 12:51:53 Training - Training image download completed. Training in progress.
2025-05-20 12:51:53 Uploading - Uploading generated training model[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  if num_device is 1 and 'dist' not in kvstore:[0m
  if cons['type'] is 'ineq':[0m
  if len(self.X_min) is not 0:[0m
[34m[05/20/2025 12:51:45 INFO 139632900433728] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-conf.json: {'num_samples_per_tree': 256, 'num_trees': 100, 'force_dense': 'true', 'eval_metrics': ['accuracy', 'precision_recall_fscore'], 'epochs': 1, 'mini_batch_size': 1000, '_log_level': 'info', '_kvstore': 'dist_async', '_num_kv_servers': 'auto', '_

INFO:__main__:Deploying endpoint...
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating model with name: randomcutforest-2025-05-20-12-52-34-329


Training seconds: 126
Billable seconds: 126


INFO:sagemaker:Creating endpoint-config with name randomcutforest-2025-05-20-12-52-34-329
INFO:sagemaker:Creating endpoint with name randomcutforest-2025-05-20-12-52-34-329


----------!

INFO:__main__:Deleting endpoint...
INFO:sagemaker:Deleting endpoint configuration with name: randomcutforest-2025-05-20-12-52-34-329
INFO:sagemaker:Deleting endpoint with name: randomcutforest-2025-05-20-12-52-34-329
INFO:__main__:Processing Cluster 5
INFO:__main__:Training model...
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: randomcutforest-2025-05-20-12-58-35-494


2025-05-20 12:58:40 Starting - Starting the training job...
2025-05-20 12:58:55 Starting - Preparing the instances for training...
2025-05-20 12:59:17 Downloading - Downloading input data...
2025-05-20 12:59:42 Downloading - Downloading the training image......
2025-05-20 13:01:08 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  if num_device is 1 and 'dist' not in kvstore:[0m
  if cons['type'] is 'ineq':[0m
  if len(self.X_min) is not 0:[0m
[34m[05/20/2025 13:01:13 INFO 139635203667776] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-conf.json: {'num_samples_per_tree': 256, 'num_trees': 100, 'force_dense': 'true', 'eval_metrics': ['accuracy', 'precision_recall_fscore'], 'epochs': 1, 'mini_batch_size': 1000, '_log_level': 'info', '_kvstore': 'dist_async', '_num_kv_servers': 'auto', '_num_gpus'

INFO:__main__:Deploying endpoint...
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating model with name: randomcutforest-2025-05-20-13-02-23-270


Training seconds: 149
Billable seconds: 149


INFO:sagemaker:Creating endpoint-config with name randomcutforest-2025-05-20-13-02-23-270
INFO:sagemaker:Creating endpoint with name randomcutforest-2025-05-20-13-02-23-270


----------!

INFO:__main__:Deleting endpoint...
INFO:sagemaker:Deleting endpoint configuration with name: randomcutforest-2025-05-20-13-02-23-270
INFO:sagemaker:Deleting endpoint with name: randomcutforest-2025-05-20-13-02-23-270
INFO:__main__:Processing Cluster 6
INFO:__main__:Training model...
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: randomcutforest-2025-05-20-13-10-09-202


2025-05-20 13:10:09 Starting - Starting the training job...
2025-05-20 13:10:34 Starting - Preparing the instances for training...
2025-05-20 13:10:55 Downloading - Downloading input data...
2025-05-20 13:11:20 Downloading - Downloading the training image......
2025-05-20 13:12:33 Training - Training image download completed. Training in progress.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  if num_device is 1 and 'dist' not in kvstore:[0m
  if cons['type'] is 'ineq':[0m
  if len(self.X_min) is not 0:[0m
[34m[05/20/2025 13:12:39 INFO 140043815479104] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-conf.json: {'num_samples_per_tree': 256, 'num_trees': 100, 'force_dense': 'true', 'eval_metrics': ['accuracy', 'precision_recall_fscore'], 'epochs': 1, 'mini_batch_size': 1000, '_log_level': 'info', '_kvstore': 'dist_async', '_num_kv_servers': 'auto', '_num_gpus':

INFO:__main__:Deploying endpoint...
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating model with name: randomcutforest-2025-05-20-13-13-26-713


Training seconds: 131
Billable seconds: 131


INFO:sagemaker:Creating endpoint-config with name randomcutforest-2025-05-20-13-13-26-713
INFO:sagemaker:Creating endpoint with name randomcutforest-2025-05-20-13-13-26-713


----------!

INFO:__main__:Deleting endpoint...
INFO:sagemaker:Deleting endpoint configuration with name: randomcutforest-2025-05-20-13-13-26-713
INFO:sagemaker:Deleting endpoint with name: randomcutforest-2025-05-20-13-13-26-713
INFO:__main__:Exported results to anomaly_scores_all_clusters.csv


In [6]:
# Load the labelled scores (with the `level` column) from
# anonymous CSV written by the level-assignment cell of this notebook.
prueba = pd.read_csv('anomaly_scores_with_levels.csv')

In [7]:
# Display the loaded frame; the rendering below shows only the first and
# last rows of it.
prueba

Unnamed: 0,cliente,fecha,presion,temperatura,volumen,Cluster,outlier_multivar,score,level
0,6,2019-01-15 00:00:00,0.518345,-0.028506,-1.334240,0,False,0.740588,Normal
1,6,2019-01-15 01:00:00,0.645290,0.084221,-1.791673,0,False,0.749864,Normal
2,6,2019-01-15 02:00:00,0.704079,-0.718527,-1.791673,0,False,0.746900,Normal
3,6,2019-01-15 03:00:00,0.650755,-0.782854,-1.791673,0,False,0.767017,Normal
4,6,2019-01-15 04:00:00,0.539415,0.069907,-1.286390,0,False,0.767443,Normal
...,...,...,...,...,...,...,...,...,...
870067,12,2023-12-31 19:00:00,-1.095151,0.569951,0.813947,6,False,3.523397,Medium
870068,12,2023-12-31 20:00:00,-1.216134,1.424159,0.934506,6,False,3.492198,Low
870069,12,2023-12-31 21:00:00,-1.326603,0.522323,0.675552,6,False,3.497812,Medium
870070,12,2023-12-31 22:00:00,-1.922934,0.601643,0.795676,6,False,3.487027,Low


In [5]:
import pandas as pd
import numpy as np

# Step 1: load the per-row anomaly scores exported by the scoring run.
df = pd.read_csv("anomaly_scores_all_clusters.csv")

# Step 2: per-cluster mean and standard deviation of the score, broadcast
# back onto every row so they can be compared element-wise.
score_by_cluster = df.groupby('Cluster')['score']
means = score_by_cluster.transform('mean')
stds = score_by_cluster.transform('std')

# Band edges at one, two and three standard deviations above the mean.
one_sigma = means + stds
two_sigma = means + 2*stds
three_sigma = means + 3*stds

# Each band is open on the left and closed on the right.
score = df['score']
labels = ['Normal', 'Low', 'Medium', 'High']
conds = [
    score <= one_sigma,
    (score > one_sigma) & (score <= two_sigma),
    (score > two_sigma) & (score <= three_sigma),
    score > three_sigma,
]
# Rows that satisfy none of the conditions fall back to 'Normal'.
df['level'] = np.select(conds, labels, default='Normal')

# Step 3: write the labelled frame out, omitting the index.
df.to_csv("anomaly_scores_with_levels.csv", index=False)