# Mobile price classification using sklearn custom script in sagemaker


In [None]:
import sagemaker
from sklearn.model_selection import train_test_split
import boto3 # library used to connect with s3 bucket
import pandas as pd

sm_boto3 = boto3.client("sagemaker")
sess = sagemaker.Session()
region = sess.boto_session.region_name
bucket = "mobbucketsagemakerxyz"
print("Using bucket: "+ bucket)


Using bucket: mobbucketsagemakerxyz


In [7]:
df = pd.read_csv("train.csv")

In [8]:
df.head(10)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1
5,1859,0,0.5,1,3,0,22,0.7,164,1,...,1004,1654,1067,17,1,10,1,0,0,1
6,1821,0,1.7,0,4,1,10,0.8,139,8,...,381,1018,3220,13,8,18,1,0,1,3
7,1954,0,0.5,1,0,0,24,0.8,187,4,...,512,1149,700,16,3,5,1,1,1,0
8,1445,1,0.5,0,0,0,53,0.7,174,7,...,386,836,1099,17,1,20,1,0,0,0
9,509,1,0.6,1,2,1,9,0.1,93,5,...,1137,1224,513,19,10,12,1,0,0,0


In [11]:
df.shape

(2000, 21)

In [13]:
# ['Low_Risk','High_Risk'],[0,1] 
df['price_range'].value_counts()

price_range
1    500
2    500
3    500
0    500
Name: count, dtype: int64

In [14]:
# ['Low_Risk','High_Risk'],[0,1] 
df['price_range'].value_counts(normalize=True)

price_range
1    0.25
2    0.25
3    0.25
0    0.25
Name: proportion, dtype: float64

In [15]:
df.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

In [16]:
df.shape

(2000, 21)

In [18]:
df.isnull().mean()*200

battery_power    0.0
blue             0.0
clock_speed      0.0
dual_sim         0.0
fc               0.0
four_g           0.0
int_memory       0.0
m_dep            0.0
mobile_wt        0.0
n_cores          0.0
pc               0.0
px_height        0.0
px_width         0.0
ram              0.0
sc_h             0.0
sc_w             0.0
talk_time        0.0
three_g          0.0
touch_screen     0.0
wifi             0.0
price_range      0.0
dtype: float64

In [22]:
features = list(df.columns)
features

['battery_power',
 'blue',
 'clock_speed',
 'dual_sim',
 'fc',
 'four_g',
 'int_memory',
 'm_dep',
 'mobile_wt',
 'n_cores',
 'pc',
 'px_height',
 'px_width',
 'ram',
 'sc_h',
 'sc_w',
 'talk_time',
 'three_g',
 'touch_screen',
 'wifi',
 'price_range']

In [23]:
label = features.pop(-1) # remove the last feature
label

'price_range'

In [26]:
x = df[features]
y = df[label]

In [28]:
x.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0


In [29]:
y.head()

0    1
1    2
2    2
3    2
4    1
Name: price_range, dtype: int64

In [30]:
x.shape

(2000, 20)

In [31]:
y.value_counts()

price_range
1    500
2    500
3    500
0    500
Name: count, dtype: int64

In [32]:
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.15,random_state=8)

In [33]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_train.shape)

(1700, 20)
(300, 20)
(1700,)
(1700,)


In [34]:
trainX = pd.DataFrame(X_train)
trainX[label] = y_train

testX = pd.DataFrame(X_test)
testX[label] = y_test

In [35]:
print(trainX.shape)
print(testX.shape)

(1700, 21)
(300, 21)


In [37]:
trainX.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
385,1880,1,1.8,0,4,1,18,0.7,138,3,...,71,699,3333,17,15,7,1,0,0,3
1180,1479,1,1.8,1,0,1,24,0.1,146,6,...,327,645,3762,12,10,5,1,0,0,3
1407,1617,0,1.5,0,1,1,63,0.7,111,2,...,274,1079,2754,19,10,8,1,1,1,2
624,1919,0,1.5,0,5,1,48,0.8,150,4,...,304,1191,1391,19,13,12,1,0,1,1
1602,1494,0,1.6,1,4,1,24,0.2,101,1,...,979,1190,3614,15,12,9,1,0,0,3


In [38]:
trainX.isnull().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

In [39]:
testX.isnull().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

In [40]:
trainX.to_csv("train-V-1.csv",index=False)
testX.to_csv("test-V-1.csv",index=False)

In [52]:
# send data to S3. SageMaker will take training data from s3
sk_prefix = "sagemaker/mobile_price_classification/sklearncontainer"
trainpath = sess.upload_data(
    path="train-V-1.csv", bucket=bucket, key_prefix=sk_prefix
)

testpath = sess.upload_data(
    path="test-V-1.csv", bucket=bucket, key_prefix=sk_prefix
)
print(trainpath)
print(testpath)

s3://mobbucketsagemakerxyz/sagemaker/mobile_price_classification/sklearncontainer/train-V-1.csv
s3://mobbucketsagemakerxyz/sagemaker/mobile_price_classification/sklearncontainer/test-V-1.csv


In [53]:
print(trainpath)
print(testpath)

s3://mobbucketsagemakerxyz/sagemaker/mobile_price_classification/sklearncontainer/train-V-1.csv
s3://mobbucketsagemakerxyz/sagemaker/mobile_price_classification/sklearncontainer/test-V-1.csv


In [54]:
%%writefile script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO 
import argparse
import joblib
import os
import numpy as np
import pandas as pd
    
def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf
    
if __name__ == "__main__":

    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=0)

    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train-V-1.csv")
    parser.add_argument("--test-file", type=str, default="test-V-1.csv")

    args, _ = parser.parse_known_args()
    
    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data")
    print()
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))
    
    features = list(train_df.columns)
    label = features.pop(-1)
    
    print("Building training and testing datasets")
    print()
    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print('Column order: ')
    print(features)
    print()
    
    print("Label column is: ",label)
    print()
    
    print("Data Shape: ")
    print()
    print("---- SHAPE OF TRAINING DATA (85%) ----")
    print(X_train.shape)
    print(y_train.shape)
    print()
    print("---- SHAPE OF TESTING DATA (15%) ----")
    print(X_test.shape)
    print(y_test.shape)
    print()
    
  
    print("Training RandomForest Model.....")
    print()
    model =  RandomForestClassifier(n_estimators=args.n_estimators, random_state=args.random_state, verbose = 3,n_jobs=-1)
    model.fit(X_train, y_train)
    print()
    

    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model,model_path)
    print("Model persisted at " + model_path)
    print()

    
    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test,y_pred_test)
    test_rep = classification_report(y_test,y_pred_test)

    print()
    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print()
    print("Total Rows are: ", X_test.shape[0])
    print('[TESTING] Model Accuracy is: ', test_acc)
    print('[TESTING] Testing Report: ')
    print(test_rep)

Overwriting script.py


In [57]:
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"

sklearn_estimator = SKLearn(
    entry_point="script.py",
    role="arn:aws:iam::851614451587:role/SageMakerExecutionRole",
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="RF-custom-sklearn",
    hyperparameters={
        "n_estimators": 100,
        "random_state": 0,
    },
    use_spot_instances = True,
    max_wait = 7200,
    max_run = 3600
)

In [58]:
# launch training job, with asynchronous call
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)
# sklearn_estimator.fit({"train": datapath}, wait=True)

INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: RF-custom-sklearn-2025-05-26-18-35-51-642


2025-05-26 18:36:02 Starting - Starting the training job...
2025-05-26 18:36:37 Downloading - Downloading input data...
2025-05-26 18:37:02 Downloading - Downloading the training image...
2025-05-26 18:37:53 Training - Training image download completed. Training in progress.
2025-05-26 18:37:53 Uploading - Uploading generated training model2025-05-26 18:37:47,140 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2025-05-26 18:37:47,144 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-05-26 18:37:47,189 sagemaker_sklearn_container.training INFO     Invoking user training script.
2025-05-26 18:37:47,340 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-05-26 18:37:47,352 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-05-26 18:37:47,364 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-05-26 18:37:47

In [59]:
sklearn_estimator.latest_training_job.wait(logs="None")
artifact = sm_boto3.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name
)["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + artifact)


2025-05-26 18:38:06 Starting - Preparing the instances for training
2025-05-26 18:38:06 Downloading - Downloading the training image
2025-05-26 18:38:06 Training - Training image download completed. Training in progress.
2025-05-26 18:38:06 Uploading - Uploading generated training model
2025-05-26 18:38:06 Completed - Training job completed
Model artifact persisted at s3://sagemaker-us-east-1-851614451587/RF-custom-sklearn-2025-05-26-18-35-51-642/output/model.tar.gz


In [60]:
artifact

's3://sagemaker-us-east-1-851614451587/RF-custom-sklearn-2025-05-26-18-35-51-642/output/model.tar.gz'

In [65]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

model_name = "Custom-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model = SKLearnModel(
    name =  model_name,
    model_data=artifact,
    role="arn:aws:iam::851614451587:role/SageMakerExecutionRole",
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
)

In [66]:
model_name

'Custom-sklearn-model-2025-05-26-19-18-14'

In [67]:
##Endpoints deployment
endpoint_name = "Custom-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("EndpointName={}".format(endpoint_name))

predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    endpoint_name=endpoint_name,
)

EndpointName=Custom-sklearn-model-2025-05-26-19-18-19


INFO:sagemaker:Creating model with name: Custom-sklearn-model-2025-05-26-19-18-14
INFO:sagemaker:Creating endpoint-config with name Custom-sklearn-model-2025-05-26-19-18-19
INFO:sagemaker:Creating endpoint with name Custom-sklearn-model-2025-05-26-19-18-19


------------!

In [68]:
endpoint_name

'Custom-sklearn-model-2025-05-26-19-18-19'

In [69]:
testX[features][0:2].values.tolist()

[[649.0,
  1.0,
  1.4,
  0.0,
  4.0,
  1.0,
  19.0,
  0.8,
  190.0,
  7.0,
  12.0,
  344.0,
  1551.0,
  739.0,
  18.0,
  1.0,
  2.0,
  1.0,
  1.0,
  0.0],
 [1925.0,
  0.0,
  3.0,
  1.0,
  0.0,
  1.0,
  16.0,
  0.8,
  175.0,
  8.0,
  11.0,
  983.0,
  1087.0,
  2173.0,
  7.0,
  3.0,
  19.0,
  1.0,
  1.0,
  0.0]]

In [70]:
print(predictor.predict(testX[features][0:2].values.tolist()))

[0 2]


In [71]:

sm_boto3.delete_endpoint(EndpointName=endpoint_name)

{'ResponseMetadata': {'RequestId': '60a481f6-7f10-4f40-9a24-e23983e4060c',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '60a481f6-7f10-4f40-9a24-e23983e4060c',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Mon, 26 May 2025 19:27:15 GMT',
   'content-length': '0'},
  'RetryAttempts': 0}}