In [None]:
import sagemaker
import boto3
from sklearn.model_selection import train_test_split
import pandas as pd
from dotenv import load_dotenv
import os

# Load environment variables from the .env file so no secret is hardcoded here.
load_dotenv()

# Credentials and the execution role are read from the environment.
# os.getenv returns None for a missing variable; boto3 then falls back to its
# default credential lookup for the key pair.
SAGEMAKER_ACCESS_KEY = os.getenv('SAGEMAKER_ACCESS_KEY')
SAGEMAKER_SECRET_KEY = os.getenv('SAGEMAKER_SECRET_KEY')
SAGEMAKER_ROLE = os.getenv('SAGEMAKER_ROLE')
region_name = 'us-east-1'

# Create a boto3 session with the provided credentials.
# boto3 keyword arguments are lowercase: the parameter is `region_name`.
boto3_session = boto3.Session(
    aws_access_key_id=SAGEMAKER_ACCESS_KEY,
    aws_secret_access_key=SAGEMAKER_SECRET_KEY,
    region_name=region_name
)

# Build the low-level SageMaker client from the same session so it uses the
# same credentials and region as everything else in this notebook.
sm_boto3 = boto3_session.client('sagemaker', region_name=region_name)

# High-level SageMaker session used for uploads and training jobs.
session = sagemaker.Session(boto_session=boto3_session)

# S3 bucket that receives the training data.
BUCKET = 'sagemakermlops'

# Print the region and bucket name
print(f"Region: {region_name}")
print(f"Bucket: {BUCKET}")


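Create a `.env` file where `load_dotenv()` can find it (for example, next to this notebook) with the following entries. Do not commit this file, since it contains AWS credentials: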

```bash
SAGEMAKER_ACCESS_KEY=YourInfoHere
SAGEMAKER_SECRET_KEY=YourInfoHere
SAGEMAKER_ROLE=YourInfoHere
```

In [None]:
# Read the raw training set from the local data folder into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data/train.csv')

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# (rows, columns) of the raw data.
df.shape

In [None]:
# Summary of the frame: column dtypes and non-null counts.
df.info()

In [None]:
# Relative frequency of each class in the `price_range` column.
# NOTE(review): the previous comment here listed 'Low_Risk'/'High_Risk' labels mapped
# to [0, 1], which does not obviously describe this column — confirm the actual
# class encoding of `price_range`.
df['price_range'].value_counts(normalize=True)

In [None]:
# Column names; the last one is later split off as the label.
df.columns

In [None]:
# Percentage of missing values in each column.
df.isna().mean().mul(100)

In [None]:
# Start from every column name as a plain Python list.
features = df.columns.tolist()

In [None]:
# The target is the last column of the dataset; every column before it is a feature.
# Both names are derived from `df` instead of popping from `features`, so re-running
# this cell cannot strip another column or silently change the label.
label = df.columns[-1]
features = list(df.columns[:-1])
label

In [None]:
# Separate the feature matrix from the target column.
x, y = df[features], df[label]

In [None]:
# Preview the feature columns.
x.head()

In [None]:
# Preview the target values.
y.head()

In [None]:
# (rows, feature columns) of the feature matrix.
x.shape

In [None]:
# Number of rows per target class.
y.value_counts()

In [None]:
# Hold out 15% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.15,
    random_state=42,
)

In [None]:
# Shapes of the four split pieces, one per line.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

In [None]:
# Re-attach the label as the last column so each CSV carries features + target.
# An explicit copy is taken first: wrapping a frame in pd.DataFrame(...) does not
# copy it, so adding the label column could otherwise modify X_train / X_test
# or trigger a SettingWithCopyWarning.
trainX = X_train.copy()
trainX[label] = y_train

testX = X_test.copy()
testX[label] = y_test

In [None]:
# Shapes of the combined train and test frames.
trainX.shape, testX.shape

In [None]:
# Preview the training frame that will be written to CSV.
trainX.head()

In [None]:
# Count of missing values per column in the training frame.
trainX.isna().sum()

In [None]:
# Count of missing values per column in the test frame.
testX.isna().sum()

In [None]:
# Write both frames to local CSV files without the index column.
for _frame, _csv_path in ((trainX, 'data/train-V-1.csv'), (testX, 'data/test-V-1.csv')):
    _frame.to_csv(_csv_path, index=False)

In [None]:
# Send the train/test CSVs to S3 so the SageMaker training job can read them.
# upload_data returns a value per file, kept here to pass to the estimator as input channels.
sk_prefix = 'sagemaker/mobile_price_classification/sklearncontainer'

# The bucket constant defined in the setup cell is `BUCKET` (upper case);
# the lower-case name `bucket` is never defined in this notebook.
trainpath = session.upload_data(
    path='data/train-V-1.csv',
    bucket=BUCKET,
    key_prefix=sk_prefix
)

testpath = session.upload_data(
    path='data/test-V-1.csv',
    bucket=BUCKET,
    key_prefix=sk_prefix
)
print(trainpath)
print(testpath)

In [None]:
%%writefile script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import argparse
import os
import numpy as np
import pandas as pd 

def model_fn(model_dir):
    """Load the persisted classifier from ``model_dir``.

    Args:
        model_dir: Directory that contains ``model.joblib``, the file name the
            training block of this script writes the fitted model under.

    Returns:
        The object deserialized by ``joblib.load``.
    """
    # joblib.load unpickles the file, so only point this at trusted artifacts.
    artifact_path = os.path.join(model_dir, 'model.joblib')
    return joblib.load(artifact_path)

if __name__ == '__main__':
    # Training entry point: parse arguments, read the train/test CSVs, fit a
    # RandomForestClassifier, persist it, and print test metrics.

    # Imported here because the version banner below reads sklearn.__version__.
    # The script's top-level `from sklearn... import ...` lines do not bind the
    # name `sklearn` itself, so without this import the script raises NameError.
    import sklearn

    print('[INFO] Extracting arguments')
    parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client are passed as command-line arguments
    parser.add_argument('--n_estimators', type=int, default=100)
    parser.add_argument('--random_state', type=int, default=42)

    # Data, model, and output directories. Defaults are read from the SM_*
    # environment variables (expected to be set by the SageMaker training container).
    parser.add_argument('--model-dir', type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument('--train', type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument('--test', type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument('--train-file', type=str, default="train-V-1.csv")
    parser.add_argument('--test-file', type=str, default="test-V-1.csv")

    # parse_known_args ignores any extra arguments instead of failing on them.
    args, _ = parser.parse_known_args()

    print(f'Sklearn Version: {sklearn.__version__}')
    print(f'Joblib Version: {joblib.__version__}')

    print('[INFO] Reading data')
    print()
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    # The label is the last column of the training CSV; all others are features.
    features = train_df.columns.tolist()
    label = features.pop(-1)

    print('Building training and testing datasets')
    print()
    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print('Column order:')
    print(features)
    print()

    print('Data Shape:')
    print()
    print('---- SHAPE OF TRAINING DATA ----')
    print(X_train.shape)
    print(y_train.shape)
    print()
    print('---- SHAPE OF TESTING DATA ----')
    print(X_test.shape)
    print(y_test.shape)
    print()

    print('Training RandomForest Model....')
    print()
    model = RandomForestClassifier(n_estimators=args.n_estimators, random_state=args.random_state)
    model.fit(X_train, y_train)
    print()

    # Persist under the file name that model_fn loads.
    model_path = os.path.join(args.model_dir, 'model.joblib')
    joblib.dump(model, model_path)
    print(f'Model persisted at {model_path}')
    print()

    # Evaluate on the held-out test file.
    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_rep = classification_report(y_test, y_pred_test)

    print()
    print('---- METRICS RESULTS FOR TESTING DATA ----')
    print()
    print(f'Total Rows are: {X_test.shape[0]}')
    print(f'[TESTING] Model Accuracy is: {test_acc}')
    print(test_rep)


In [None]:
from sagemaker.sklearn.estimator import SKLearn

# Version of the SageMaker scikit-learn framework container to train with.
FRAMEWORK_VERSION = '0.23-1'

# Estimator that runs script.py as a managed training job.
sklearn_estimator = SKLearn(
    entry_point='script.py',
    role=SAGEMAKER_ROLE,
    instance_count=1,
    instance_type='ml.m5.large',
    framework_version=FRAMEWORK_VERSION,
    base_job_name='RF-custom-sklearn',
    # Forwarded to script.py as --n_estimators / --random_state.
    hyperparameters={
        'n_estimators': 100,
        'random_state': 0
    },
    # The parameter is `use_spot_instances` (plural). The misspelled keyword is not
    # a recognised parameter, so spot training was never requested even though
    # max_wait was set.
    use_spot_instances=True,
    # Both limits are in seconds; max_wait covers waiting for spot capacity plus
    # the run itself, so it must not be smaller than max_run.
    max_wait=7200,
    max_run=3600,
    sagemaker_session=session
)

In [None]:
# Launch the training job and block until it finishes (wait=True), feeding the
# uploaded CSVs in as the "train" and "test" channels.
sklearn_estimator.fit(
    {"train": trainpath, "test": testpath},
    wait=True,
)


In [None]:
# Make sure the most recent training job has finished before querying it.
sklearn_estimator.latest_training_job.wait(logs="None")

# Look the job up by name and pull the S3 location of the model artifact
# out of the describe_training_job response.
training_job_name = sklearn_estimator.latest_training_job.name
job_description = sm_boto3.describe_training_job(TrainingJobName=training_job_name)
artifact = job_description['ModelArtifacts']['S3ModelArtifacts']

