In [1]:
import os
import numpy as np
import pandas as pd
import boto3
import sagemaker
import matplotlib.pyplot as plt
from sagemaker import LinearLearner

In [2]:
# session and role
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
s3 = boto3.client('s3')

# create an S3 bucket
bucket = sagemaker_session.default_bucket()
#create a folder for output
output_folder = "output"
s3.put_object(Bucket=bucket, Key=output_folder)

{'ResponseMetadata': {'RequestId': '4711Y94N476PY2G8',
  'HostId': 'ZMBREtgC4Fi1cGFvXWK8itjmZkOtTzWa7KSDoKkJ0vuRfnMoOTncWoGnWMMSSewc4N2/PJ8nS5w=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'ZMBREtgC4Fi1cGFvXWK8itjmZkOtTzWa7KSDoKkJ0vuRfnMoOTncWoGnWMMSSewc4N2/PJ8nS5w=',
   'x-amz-request-id': '4711Y94N476PY2G8',
   'date': 'Sun, 23 Jan 2022 16:14:19 GMT',
   'etag': '"d41d8cd98f00b204e9800998ecf8427e"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"d41d8cd98f00b204e9800998ecf8427e"'}

In [16]:
output_path = 's3://{}/{}'.format(bucket, output_folder)
output_path

's3://sagemaker-us-east-1-456758360141/output'

In [35]:
key = 'cleaned_data/cleaned_data.csv'
data_location = 's3://{}/{}'.format(bucket,key)
df = pd.read_csv(data_location)

In [18]:
X = df.drop(['offer_successful'], axis=1)
y = df['offer_successful']

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
#Scaling the data for some algorithms
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X_train_scaled = scale.fit_transform(X_train)
X_test_scaled = scale.transform(X_test)

In [21]:
y_train_np = y_train.to_numpy()
y_train_np

array([0, 0, 0, ..., 0, 0, 0])

In [22]:
np.unique(y_train_np)

array([0, 1])

In [23]:
y_test_np = y_test.to_numpy()
y_test_np

array([0, 0, 0, ..., 1, 0, 1])

In [24]:
X_train_scaled = X_train_scaled.astype('float32')
X_test_scaled = X_test_scaled.astype('float32')
y_train_np = y_train_np.astype('float32')
y_test_np = y_test_np.astype('float32')

In [25]:
linear = LinearLearner(role = role,
                      train_instance_count = 1,
                      train_instance_type='ml.c4.xlarge',
                      predictor_type='binary_classifier',
                      num_classes=2,
                      output_path=output_path,
                      sagemaker_session=sagemaker_session,
                      epochs = 5)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [26]:
train_data = linear.record_set(X_train_scaled, labels=y_train_np)

In [27]:
%%time
linear.fit(train_data, wait=True)

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


2022-01-23 16:19:09 Starting - Starting the training job...
2022-01-23 16:19:33 Starting - Launching requested ML instancesProfilerReport-1642954749: InProgress
......
2022-01-23 16:20:33 Starting - Preparing the instances for training............
2022-01-23 16:22:36 Downloading - Downloading input data...
2022-01-23 16:23:09 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[01/23/2022 16:23:15 INFO 140540794976064] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'in

In [28]:
%%time
# deploy your model to create a predictor
predictor = linear.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


----------!CPU times: user 201 ms, sys: 4.87 ms, total: 206 ms
Wall time: 5min 1s


In [29]:
prediction = predictor.predict(X_test_scaled)

In [30]:
y_prediction = np.array([x.label['predicted_label'].float32_tensor.values[0] for x in prediction])
y_prediction

array([0., 0., 0., ..., 1., 0., 1.])

In [31]:
# test that your model generates the correct number of labels
assert len(y_prediction)==len(y_test_np), 'Unexpected number of predictions.'
print('Test passed!')

Test passed!


In [32]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train_scaled, y_train)
lr_pred = lr.predict(X_test_scaled)

In [33]:
from sklearn.metrics import accuracy_score, confusion_matrix
accuracy_score(y_test, lr_pred)

1.0

In [34]:
confusion_matrix(y_test, lr_pred)

array([[21066,     0],
       [    0,  6470]])

This shows the data was carefully handled and trained. The benchmark model evaluated with a metrics of 100%. Its time to evaluate the deployed endpoint using lambda function.