In [31]:
# S3 bucket that holds the dataset, and the key prefix under which this
# notebook stores its training data and model output.
bucket, prefix = 'datairis', 'git'

# Imports (the IAM execution role is defined after the imports)
import boto3
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import sagemaker
from sagemaker import get_execution_role
from sagemaker.predictor import csv_serializer

# Resolve the IAM execution role through the SageMaker SDK helper; it is
# handed to the Estimator when the training job is configured.
role = get_execution_role()

In [32]:
# Load the Iris CSV from the project bucket. The URI is built from `bucket`
# so the data location is configured in exactly one place.
dataset = pd.read_csv('s3://{}/iris.csv'.format(bucket))

In [33]:
# Preview the first three rows of the loaded data.
dataset.head(3)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa


In [34]:
from sklearn.preprocessing import LabelEncoder

# Replace the class names in `variety` with integer codes; `le` keeps the
# fitted name <-> code mapping.
le = LabelEncoder()
le.fit(dataset['variety'])
dataset['variety'] = le.transform(dataset['variety'])

#### FOR XGBOOST MODELS (AND OTHER SAGEMAKER BUILT-IN ALGORITHMS TRAINED ON CSV), THE OUTPUT (DEPENDENT VARIABLE) SHOULD BE THE FIRST COLUMN

In [35]:
# Move the label column (`variety`) to the front, keeping the feature
# columns in their existing order.
feature_columns = [column for column in dataset.columns if column != 'variety']
dataset = dataset[['variety'] + feature_columns]

In [36]:
# Preview the first three rows to confirm `variety` is now the first column.
dataset.head(3)

Unnamed: 0,variety,sepal.length,sepal.width,petal.length,petal.width
0,0,5.1,3.5,1.4,0.2
1,0,4.9,3.0,1.4,0.2
2,0,4.7,3.2,1.3,0.2


In [37]:
# Shuffle once with a fixed seed so the split is reproducible, then cut at
# 70% and 90% to obtain train (70%), validation (20%) and test (10%) sets.
shuffled = dataset.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(dataset))
validation_end = int(0.9 * len(dataset))

# Positional slices give the same partitions as np.split(...) but do not go
# through DataFrame.swapaxes, which newer pandas releases deprecate.
train_data = shuffled.iloc[:train_end]
validation_data = shuffled.iloc[train_end:validation_end]
test_data = shuffled.iloc[validation_end:]

# Write label-first CSV files without a header row or index column.
train_data.to_csv('train.csv', header=False, index=False)
validation_data.to_csv('validation.csv', header=False, index=False)

#### Uploading the data to S3

In [38]:
# Upload the local CSV files to s3://<bucket>/<prefix>/{train,validation}/.
# upload_file() returns None, so its result is not assigned to anything.
# S3 keys always use '/', so they are formatted explicitly rather than built
# with os.path.join, whose separator depends on the operating system.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
s3_bucket.Object('{}/train/train.csv'.format(prefix)).upload_file('train.csv')
s3_bucket.Object('{}/validation/validation.csv'.format(prefix)).upload_file('validation.csv')

#### DECLARING THE TRAINING INPUTS IN CSV FORMAT. MODELS PREFERABLY USE LIBSVM OR CSV FORMAT.

In [39]:


# Describe the uploaded CSV data for the training job. Both URIs end with '/'
# so each channel only matches keys inside its own folder; a prefix without
# the trailing slash would also match sibling keys such as '<prefix>/train_old/...'.
train_s3_uri = 's3://{}/{}/train/'.format(bucket, prefix)
validation_s3_uri = 's3://{}/{}/validation/'.format(bucket, prefix)

s3_input_train = sagemaker.s3_input(s3_data=train_s3_uri, content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data=validation_s3_uri, content_type='csv')



#### GETTING THE LATEST XGBOOST CONTAINER IMAGE FOR YOUR REGION

In [40]:
# Region name -> ECR URI of the XGBoost training image used in that region.
containers = {'us-west-2': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',
              'us-east-1': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
              'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',
              'eu-west-1': '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest'}

sess = sagemaker.Session()

# Look the region up once and fail with an explicit message when it has no
# entry in the map (or is not configured at all), instead of a bare KeyError.
region = boto3.Session().region_name
if region not in containers:
    raise KeyError('No XGBoost container image configured for region {!r}; '
                   'known regions: {}'.format(region, sorted(containers)))

# Estimator for a single-instance training job; model artifacts are written
# under s3://<bucket>/<prefix>/output.
xgb_iris = sagemaker.estimator.Estimator(containers[region],
                                         role,
                                         train_instance_count=1,
                                         train_instance_type='ml.m4.xlarge',
                                         output_path='s3://{}/{}/output'.format(bucket, prefix),
                                         sagemaker_session=sess)

#### HYPERPARAMETERS OBTAINED FROM SAGEMAKER AUTOPILOT 

#### NOTE:
You can tune these hyperparameters yourself (for example with a grid search such as scikit-learn's GridSearchCV) or use SageMaker Autopilot.

In [41]:
# Hyperparameters for a 3-class softmax classifier.
# The L2 regularisation term is named `lambda` in XGBoost. Because `lambda` is
# a reserved word in Python it cannot be written as a plain keyword argument,
# so it is passed through dict unpacking (the misspelt `lamda` is not an
# XGBoost hyperparameter).
xgb_iris.set_hyperparameters(
    alpha=1.448983,
    colsample_bytree=0.6897649,
    eta=0.246274,
    gamma=0.546408,
    max_depth=18,
    min_child_weight=0.00282088,
    num_class=3,
    num_round=8,
    objective='multi:softmax',
    subsample=0.538571908,
    **{'lambda': 0.0003157054}
)

In [42]:
# Start the training job, passing the CSV inputs as the 'train' and
# 'validation' channels.
xgb_iris.fit({'train':s3_input_train,'validation':s3_input_validation})

2020-04-24 17:32:41 Starting - Starting the training job...
2020-04-24 17:32:42 Starting - Launching requested ML instances......
2020-04-24 17:33:42 Starting - Preparing the instances for training...
2020-04-24 17:34:28 Downloading - Downloading input data...
2020-04-24 17:35:09 Training - Training image download completed. Training in progress.
2020-04-24 17:35:09 Uploading - Uploading generated training model[34mArguments: train[0m
[34m[2020-04-24:17:35:04:INFO] Running standalone xgboost training.[0m
[34m[2020-04-24:17:35:04:INFO] File size need to be processed in the node: 0.0mb. Available memory size in the node: 8505.86mb[0m
[34m[2020-04-24:17:35:04:INFO] Determined delimiter of CSV input is ','[0m
[34m[17:35:04] S3DistributionType set as FullyReplicated[0m
[34m[17:35:04] 105x4 matrix with 420 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2020-04-24:17:35:04:INFO] Determined delimiter of CSV input is ','[0m
[34m[17:35:0

In [43]:
# Deploy the trained model to a hosted endpoint backed by one ml.m4.xlarge
# instance; the returned predictor is what the inference cells call.
# NOTE(review): no endpoint cleanup is visible in this section - confirm it is
# deleted elsewhere once it is no longer needed.
xgb_predictor_iris = xgb_iris.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

-------------!

In [45]:
# Feature matrix of the held-out test split (label column removed).
test_data_array = test_data.drop(['variety'], axis=1).values
xgb_predictor_iris.content_type = 'text/csv'  # request payload is sent as CSV
xgb_predictor_iris.serializer = csv_serializer  # turn arrays/lists into CSV text
# The endpoint replies with the predicted class labels as comma-separated text.
predictions = xgb_predictor_iris.predict(test_data_array).decode('utf-8')
# Parse the complete response. Slicing off the first character would corrupt
# the first prediction (e.g. '2.0' -> '.0', which parses as class 0).
predictions_array = np.fromstring(predictions, sep=',')
print(predictions_array.shape)

(15,)


In [47]:
# Confusion matrix: observed class (rows) against predicted class (columns).
cm = pd.crosstab(index=test_data['variety'], columns=np.round(predictions_array), rownames=['Observed'], colnames=['Predicted'])
# Overall accuracy = share of test rows whose predicted class equals the
# observed class. Comparing element-wise covers all three classes and does not
# depend on the shape of `cm` (the binary tn/fn/tp/fp formula only looks at the
# top-left 2x2 corner and fails if a class is never predicted).
p = (test_data['variety'].values == np.round(predictions_array)).mean() * 100
print("\n{0:<20}{1:<4.1f}%\n".format("Overall Accuracy Rate: ", p))



Overall Accuracy Rate: 100.0%



#### PREDICTING FOR A NEW OBSERVATION

In [59]:
# Classify one new flower given as
# [sepal.length, sepal.width, petal.length, petal.width].
# Relies on the CSV content type and serializer configured on the predictor
# in an earlier cell.
xgb_predictor_iris.predict([5.1,3.5,1.4,0.2])


b'0.0'