In [None]:
import boto3
import pandas as pd  # later cells refer to pandas as `pd`
import sagemaker
from sklearn.model_selection import train_test_split

# Low-level boto3 client for the "sagemaker" service.
sm_boto3 = boto3.client("sagemaker")

# SageMaker SDK session; later cells use it to upload the datasets to S3.
sess = sagemaker.Session()

# AWS region taken from the session's underlying boto session.
region = sess.boto_session.region_name

# Name of the S3 bucket the datasets are uploaded to.
# NOTE(review): hard-coded; presumably the bucket must already exist in this account — confirm.
bucket = "mobbucketsagemaker"
print("Using bucket " + bucket)

In [None]:
# Show the region name read from the SageMaker session's boto session.
print(region)

In [None]:
# Load the training CSV (path is relative to the notebook's working directory)
# and preview the first rows.
df = pd.read_csv("mob_price_classification_train.csv")
df.head()

In [None]:
# Count how many rows fall under each distinct value of the price_range column.
df['price_range'].value_counts()

In [None]:
# Split the column names into model inputs and the target.
# The last column is taken as the label, using the same rule script.py applies
# (`label = features.pop(-1)`), so the notebook and the training script agree.
# NOTE(review): presumably the last column is `price_range` — confirm against the CSV.
features = list(df.columns)
label = features.pop(-1)  # removes the target from `features`, so it is not used as an input
features

In [None]:
# Select the columns listed in `features` as the inputs and the `label` column as the target.
# NOTE(review): both `features` and `label` must have been defined by an earlier cell.
x = df[features]
y = df[label]

In [None]:
# Hold out 15% of the rows for testing, matching the "85%" / "15%" banners that
# script.py prints. A fixed seed makes the split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=0)

In [None]:
# Build the frames that are written to CSV for the training job.
# script.py treats the LAST column of each CSV as the label, so the target series
# is appended after the feature columns (concat aligns on the row index and uses
# the series' name as the column name).
trainX = pd.concat([X_train, y_train], axis=1)
testX = pd.concat([X_test, y_test], axis=1)

# Preview only the first rows rather than the whole frame.
trainX.head()

In [None]:
# Write the train and test frames to local CSV files, omitting the index column.
# NOTE(review): `trainX` and `testX` must both have been defined by an earlier cell.
trainX.to_csv("train-V-1.csv", index = False)
testX.to_csv("test-V-1.csv", index = False)

In [None]:
# Upload the local train/test CSV files to the S3 bucket under `sk_prefix`;
# the SageMaker training job takes its input data from S3.
# NOTE(review): upload_data presumably returns the S3 location of each uploaded file — confirm.
sk_prefix = "sagemaker/mobile-price-classification/sklearncontainer"
trainpath = sess.upload_data(path="train-V-1.csv", bucket = bucket, key_prefix = sk_prefix)
testpath = sess.upload_data(path="test-V-1.csv", bucket = bucket, key_prefix = sk_prefix)

# Script used by AWS SageMaker to train models

In [None]:
%%writefile script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO
import argparse
import os
import numpy as np
import pandas as pd

def model_fn(model_dir):
    """Load the trained model from ``model_dir``.

    The SageMaker scikit-learn serving container calls this hook and uses its
    return value as the model, so the loaded object must be returned.

    Args:
        model_dir: Directory containing ``model.joblib`` (the file name the
            training block of this script writes).

    Returns:
        The object deserialized from ``model.joblib``.
    """
    # NOTE: joblib.load unpickles the file; only load artifacts you produced yourself.
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf

def parse_args(argv=None):
    """Parse the training job's command-line arguments.

    Args:
        argv: Argument list to parse. ``None`` (the default) parses
            ``sys.argv[1:]``, which is what the training run uses.

    Returns:
        argparse.Namespace with ``n_estimators``, ``random_state``,
        ``model_dir``, ``train``, ``test``, ``train_file`` and ``test_file``.
        Unrecognized arguments are ignored rather than rejected.
    """
    parser = argparse.ArgumentParser()

    # Hyperparameters
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=0)

    # Data, model, and output directories (defaults come from the SM_* environment variables)
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train-V-1.csv")
    # Must name the held-out file: defaulting to the training file would make the
    # "testing" metrics below measure performance on data the model was fit on.
    parser.add_argument("--test-file", type=str, default="test-V-1.csv")

    args, _ = parser.parse_known_args(argv)
    return args


if __name__ == "__main__":
    # Training entry point: read the CSVs, fit a random forest, save it, report test metrics.
    print("[Info] Extracting arguments")
    args = parse_args()

    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data")
    print()
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    # The last CSV column is treated as the label; every other column is a feature.
    features = list(train_df.columns)
    label = features.pop(-1)

    print("Building the train and test datasets")
    print()
    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print('Column order: ')
    print(features)
    print()

    print('Label column is: ', label)
    print()

    # NOTE: the percentages in the two banners below are hard-coded text; they are
    # only accurate when the CSVs were produced with an 85/15 split.
    print("Data Shape: ")
    print()
    print("---- SHAPE OF TRAINING DATA (85%) ----")
    print(X_train.shape)
    print(y_train.shape)
    print()

    print("---- SHAPE OF TESTING DATA (15%) ----")
    print(X_test.shape)
    print(y_test.shape)
    print()

    print("Training Random Forest Model . . . . ")
    print()
    model = RandomForestClassifier(
        n_estimators=args.n_estimators,
        random_state=args.random_state,
        verbose=2,
        n_jobs=1,
    )
    model.fit(X_train, y_train)
    print()

    # Persist under the file name that model_fn loads.
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)

    print("Model saved at " + model_path)

    # Evaluate on the held-out test set.
    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_rep = classification_report(y_test, y_pred_test)

    print()
    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print()
    print("Total rows are: ", X_test.shape[0])
    print('[TESTING] Model accuracy is: ', test_acc)
    print('[TESTING] Testing report: ')
    print(test_rep)

: 