# What we will learn

1. S3 buckets with boto3
2. IAM roles and users
3. Complete AWS SageMaker infrastructure: training and endpoint deployment

In [2]:
import sagemaker
from sklearn.model_selection import train_test_split
import boto3
import pandas as pd

In [3]:
# Low-level SageMaker client (boto3) plus the SageMaker SDK session used below.
sm_boto3 = boto3.client(service_name="sagemaker")
session = sagemaker.Session()

# Region comes from the boto session wrapped by the SageMaker session.
region = session.boto_session.region_name

# S3 bucket that receives the train/test CSV files.
bucket = "mobbucketsagemaker2404"
print(f"Using bucket {bucket}")

[2;36m[04/28/25 17:21:50][0m[2;36m [0m[1;94mINFO    [0m Found credentials in shared    [2mcredentials.py[0m[2m:[0m[2m1352[0m
[2;36m                    [0m         credentials file:              [2m                   [0m
[2;36m                    [0m         ~[95m/.aws/[0m[95mcredentials[0m             [2m                   [0m
Using bucket mobbucketsagemaker2404


In [4]:
# Show the AWS region taken from the SageMaker session's boto session.
print(region)

us-east-1


In [5]:
# Load the mobile price training data from a CSV given by relative path.
df = pd.read_csv("mob_price_classification_train.csv")
# Preview the first rows to check that the columns parsed as expected.
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0,1


In [6]:
# (rows, columns) of the loaded frame.
df.shape

(2000, 21)

In [7]:
# Per-column count of missing values.
df.isna().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

In [8]:
# Number of rows per price_range class, to check how balanced the label is.
df["price_range"].value_counts()

price_range
1    500
2    500
3    500
0    500
Name: count, dtype: int64

In [9]:
# Column names in file order, kept as a plain Python list.
features = df.columns.tolist()
features

['battery_power',
 'blue',
 'clock_speed',
 'dual_sim',
 'fc',
 'four_g',
 'int_memory',
 'm_dep',
 'mobile_wt',
 'n_cores',
 'pc',
 'px_height',
 'px_width',
 'ram',
 'sc_h',
 'sc_w',
 'talk_time',
 'three_g',
 'touch_screen',
 'wifi',
 'price_range']

In [10]:
# Model inputs are every column except the last one in `features`;
# the target is the price_range column.
X = df.loc[:, features[:-1]]
y = df.loc[:, "price_range"]

In [11]:
# Hold out 15% of the rows for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 0)

In [12]:
# Re-attach the label to each feature split so that every exported CSV
# carries the features followed by `price_range` as the last column.
# `assign` returns a brand-new frame, so nothing is written in place and
# X_train / X_test cannot be modified by this cell. The earlier
# `pd.DataFrame(X_train)` + column assignment could share data with the
# source frame on some pandas versions (TODO confirm the installed version).
trainX = X_train.assign(price_range = y_train)

testX = X_test.assign(price_range = y_test)

In [13]:
# Write both splits to local CSV files without the index column.
for frame, csv_name in ((trainX, "train-V-1.csv"), (testX, "test-V-1.csv")):
    frame.to_csv(csv_name, index=False)

In [14]:
# Display the S3 bucket name that the upload step will use.
bucket

'mobbucketsagemaker2404'

In [15]:
## Send the data to S3
## SageMaker reads its training data from S3

In [16]:
# Key prefix under which both CSV files are stored in the bucket.
sk_prefix = "sagemaker/mobile_price_classification/sklearncontainer"

# Upload train first, then test; each call's return value (the S3 URI
# printed below) is kept for the training step.
trainpath, testpath = [
    session.upload_data(path=local_csv, bucket=bucket, key_prefix=sk_prefix)
    for local_csv in ("train-V-1.csv", "test-V-1.csv")
]

print(trainpath, testpath, sep="\n")

s3://mobbucketsagemaker2404/sagemaker/mobile_price_classification/sklearncontainer/train-V-1.csv
s3://mobbucketsagemaker2404/sagemaker/mobile_price_classification/sklearncontainer/test-V-1.csv


#### Script used by AWS SageMaker to train the model

In [17]:
%%writefile script.py
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO
import argparse
import os
import numpy as np
import pandas as pd

def model_fn(model_dir):
    """Load the persisted model from ``model_dir``.

    Args:
        model_dir: Directory that contains the ``model.joblib`` file.

    Returns:
        The object deserialized from ``model.joblib``. The previous version
        loaded the model but never returned it, so callers received ``None``.
    """
    return joblib.load(os.path.join(model_dir, "model.joblib"))

if __name__ == "__main__":
    # Training entry point: parse arguments, read the train/test CSVs, fit a
    # random forest, persist it as model.joblib and print test metrics.
    print("[Info] Extracting arguments")
    cli = argparse.ArgumentParser()

    # Integer hyperparameters of the random forest.
    for flag, fallback in (("--n_estimators", 100), ("--random_state", 0)):
        cli.add_argument(flag, type=int, default=fallback)

    # String options: directories default to the SM_* environment variables,
    # file names default to the CSVs produced for this project.
    for flag, fallback in (
        ("--model-dir", os.environ.get("SM_MODEL_DIR")),
        ("--train", os.environ.get("SM_CHANNEL_TRAIN")),
        ("--test", os.environ.get("SM_CHANNEL_TEST")),
        ("--train-file", "train-V-1.csv"),
        ("--test-file", "test-V-1.csv"),
    ):
        cli.add_argument(flag, type=str, default=fallback)

    # Unknown command-line arguments are tolerated and discarded.
    args = cli.parse_known_args()[0]

    print("Sklearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data", end="\n\n")

    # Train is read before test, each from <channel dir>/<file name>.
    train_df, test_df = [
        pd.read_csv(os.path.join(folder, file_name))
        for folder, file_name in (
            (args.train, args.train_file),
            (args.test, args.test_file),
        )
    ]

    # The last column of the training file is the label; the rest are features.
    columns = train_df.columns.tolist()
    features, label = columns[:-1], columns[-1]

    print("Building training and testing datasets", end="\n\n")

    X_train, y_train = train_df[features], train_df[label]
    X_test, y_test = test_df[features], test_df[label]

    print("Column order: ")
    print(features, end="\n\n")

    print("Label column is: ", label, end="\n\n")

    print("Data shape: ", end="\n\n")
    print("-------------Shape of training data (85%) -----------")
    print(X_train.shape)
    print(y_train.shape, end="\n\n")

    print("Training RandomForest Model .....", end="\n\n")
    forest_params = {
        "n_estimators": args.n_estimators,
        "random_state": args.random_state,
        "verbose": 2,
        "n_jobs": 1,
    }
    model = RandomForestClassifier(**forest_params)

    model.fit(X_train, y_train)
    print()

    # Persist the fitted model before evaluating it.
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print("Model saved at ", model_path)

    # Evaluate on the held-out test file.
    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_rep = classification_report(y_test, y_pred_test)

    print()
    print("------Metics Results from testing data----------", end="\n\n")
    print("Total rows are ", len(X_test))
    print("[Testing] Model accuracy is: ", test_acc)
    print("[Testing] testing report: ")
    print(test_rep)

Overwriting script.py


### AWS SageMaker entry point to execute the training script

In [18]:
from sagemaker.sklearn.estimator import SKLearn

# Version of the SageMaker scikit-learn container used for training and serving.
FRAMEWORK_VERSION = "0.23-1"

# Upper bound on the training job's run time, in seconds.
MAX_RUN_SECONDS = 3600

# Estimator that runs script.py on a single ml.m5.large instance.
sklearn_estimator = SKLearn(
    entry_point = "script.py",
    role = "arn:aws:iam::221082216147:role/sagameker_access",
    instance_count = 1,
    instance_type = "ml.m5.large",
    framework_version = FRAMEWORK_VERSION,
    base_job_name = "RF-suctom-sklearn",
    hyperparameters = {
        "n_estimators": 100,
        "random_state": 0
    },
    # The SDK keyword is `use_spot_instances` (plural). The previous spelling
    # `use_spot_instance` is not an estimator parameter, so managed spot
    # training was never actually requested.
    use_spot_instances = True,
    max_run = MAX_RUN_SECONDS,
    # Spot training also needs `max_wait` (total seconds including time spent
    # waiting for spot capacity); it must be at least `max_run`.
    max_wait = 2 * MAX_RUN_SECONDS
)

In [19]:
# Start the training job with the uploaded CSVs as the "train" and "test"
# input channels, and block until the job has finished.
sklearn_estimator.fit(
    inputs={"train": trainpath, "test": testpath},
    wait=True,
)

[2;36m[04/28/25 17:21:52][0m[2;36m [0m[1;94mINFO    [0m SageMaker Python SDK will  [2mtelemetry_logging.py[0m[2m:[0m[2m91[0m
[2;36m                    [0m         collect telemetry to help  [2m                       [0m
[2;36m                    [0m         us better understand our   [2m                       [0m
[2;36m                    [0m         user's needs, diagnose     [2m                       [0m
[2;36m                    [0m         issues, and deliver        [2m                       [0m
[2;36m                    [0m         additional features.       [2m                       [0m
[2;36m                    [0m         To opt out of telemetry,   [2m                       [0m
[2;36m                    [0m         please disable via         [2m                       [0m
[2;36m                    [0m         TelemetryOptOut parameter  [2m                       [0m
[2;36m                    [0m         in SDK defaults config.    [2m

### Get the model artifact from S3

In [20]:
# Wait for the most recent training job without streaming its logs.
latest_job = sklearn_estimator.latest_training_job
latest_job.wait(logs="None")

# Look the job up by name and pull the model artifact location out of the response.
job_description = sm_boto3.describe_training_job(TrainingJobName=latest_job.name)
artifacts = job_description["ModelArtifacts"]["S3ModelArtifacts"]


2025-04-28 15:24:05 Starting - Preparing the instances for training
2025-04-28 15:24:05 Downloading - Downloading the training image
2025-04-28 15:24:05 Training - Training image download completed. Training in progress.
2025-04-28 15:24:05 Uploading - Uploading generated training model
2025-04-28 15:24:05 Completed - Training job completed


In [21]:
# Display the model artifact location read from the describe_training_job response.
artifacts

's3://sagemaker-us-east-1-221082216147/RF-suctom-sklearn-2025-04-28-15-21-52-364/output/model.tar.gz'

### Deploy the model to an endpoint

In [22]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

# Model name carries a UTC timestamp so repeated runs do not collide.
model_name = strftime("Custom-sklearn-model-%Y-%m-%d-%H-%M-%S", gmtime())

# Wrap the trained artifact; script.py is passed again as the entry point.
model = SKLearnModel(
    model_data=artifacts,
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
    role="arn:aws:iam::221082216147:role/sagameker_access",
    name=model_name,
)

In [23]:
# Display the SKLearnModel object created above.
model

<sagemaker.sklearn.model.SKLearnModel at 0x170d56f86a0>

In [24]:
### Endpoint deployment
# The endpoint name carries a UTC timestamp, like the model name.
endpoint_name = strftime("Custom-sklearn-model-%Y-%m-%d-%H-%M-%S", gmtime())
print(f"EndpointName = {endpoint_name}")

# Deploy the model on one ml.m4.xlarge instance and keep the returned predictor.
predictor = model.deploy(
    endpoint_name=endpoint_name,
    instance_type="ml.m4.xlarge",
    initial_instance_count=1,
)

EndpointName = Custom-sklearn-model-2025-04-28-15-24-25
[2;36m[04/28/25 17:24:27][0m[2;36m [0m[1;94mINFO    [0m Creating model with name:          [2msession.py[0m[2m:[0m[2m4094[0m
[2;36m                    [0m         Custom-sklearn-model-[1;36m2025[0m-[1;36m04[0m-[1;36m28[0m-[1;36m15[0m [2m               [0m
[2;36m                    [0m         -[1;36m24[0m-[1;36m25[0m                             [2m               [0m
[2;36m[04/28/25 17:24:28][0m[2;36m [0m[1;94mINFO    [0m Creating endpoint-config with name [2msession.py[0m[2m:[0m[2m6019[0m
[2;36m                    [0m         Custom-sklearn-model-[1;36m2025[0m-[1;36m04[0m-[1;36m28[0m-[1;36m15[0m [2m               [0m
[2;36m                    [0m         -[1;36m24[0m-[1;36m25[0m                             [2m               [0m
[2;36m[04/28/25 17:24:29][0m[2;36m [0m[1;94mINFO    [0m Creating endpoint with name        [2msession.py[0m[2m:[0m[2m4841[0m
[2;3

In [33]:
# Display the predictor object returned by model.deploy().
predictor

<sagemaker.sklearn.model.SKLearnPredictor at 0x170d56af610>

In [None]:
# Delete the endpoint that was deployed under `endpoint_name`.
# NOTE(review): only the endpoint is removed here. The deploy log also shows a
# model and an endpoint-config being created — confirm whether those should be
# deleted as well.
sm_boto3.delete_endpoint(EndpointName = endpoint_name)