In [2]:
# import libraries
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Image
from IPython.display import display
from time import gmtime, strftime
from sagemaker.predictor import csv_serializer


In [3]:
# Define IAM role
role = get_execution_role()
prefix = 'sagemaker/DEMO-xgboost-dm'
my_region = boto3.session.Session().region_name # set the region of the instance

In [4]:
# this line automatically looks for the XGBoost image URI and builds an XGBoost container.
xgboost_container = sagemaker.image_uris.retrieve("xgboost", my_region, "latest")

print("Success - the MySageMakerInstance is in the " + my_region + " region. You will use the " + xgboost_container + " container for your SageMaker endpoint.")


Success - the MySageMakerInstance is in the us-east-1 region. You will use the 811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest container for your SageMaker endpoint.


In [5]:
# Run code
# d. Create the S3 bucket to store your data. Copy and paste the following code into the next code cell and choose Run.

# Note: Make sure to replace the bucket_name your-s3-bucket-name with a unique S3 bucket name. If you don't receive a success message after running the code, change the bucket name and try again.
bucket_name = 'mpgbucket123' # <--- CHANGE THIS VARIABLE TO A UNIQUE NAME FOR YOUR BUCKET
s3 = boto3.resource('s3')
try:
    if  my_region == 'us-east-1':
      s3.create_bucket(Bucket=bucket_name)
    else: 
      s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={ 'LocationConstraint': my_region })
    print('S3 bucket created successfully')
except Exception as e:
    print('S3 error: ',e)

S3 bucket created successfully


In [6]:
bucket='mpgbucket123'
subfolder ='input'
from sagemaker import get_execution_role
role=get_execution_role()
conn=boto3.client('s3')
contents=conn.list_objects(Bucket=bucket,Prefix=subfolder)['Contents']
for f in contents:
    print(f['Key'])

input/
input/mpg.csv


In [7]:
try:
  urllib.request.urlretrieve ("https://shivamsksbucketmltest.s3.amazonaws.com/input/mpg.csv", "mpg.csv")
  print('Success: downloaded mpg.csv.')
except Exception as e:
  print('Data load error: ',e)

try:
  model_data = pd.read_csv('./mpg.csv',index_col=0)
  print('Success: Data loaded into dataframe.')
except Exception as e:
    print('Data load error: ',e)

Success: downloaded mpg.csv.
Success: Data loaded into dataframe.


In [8]:
model_data["horsepower"]=model_data["horsepower"].replace("?",0)

total=0
count=0
for i in model_data["horsepower"]:
    if i!=0:
        count+=1
        total+=int(i)
model_data["horsepower"]=model_data["horsepower"].replace(0,total/count)
model_data["horsepower"]

0      130
1      165
2      150
3      150
4      140
      ... 
393     86
394     52
395     84
396     79
397     82
Name: horsepower, Length: 398, dtype: object

In [13]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(model_data, test_size = 0.2, shuffle = True)
print('Training dataset size is ', len(train))
print('Training dataset size is ', len(test))


Training dataset size is  318
Training dataset size is  80


In [14]:
features = ['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin']

In [15]:
train_x = train[features]
train_y = train['mpg']
test_x = train[features]
test_y = train['mpg']

In [16]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(train_x, train_y)

In [17]:
prediction = reg.predict(test_x)

In [19]:
from sklearn.metrics import mean_squared_error
mean_squared_error(test_y, prediction)
print(np.array(test_y))
print(prediction)

[24.  21.  14.  22.  18.5 43.1 26.5 13.  38.  19.2 17.  14.  34.  32.8
 16.  26.  27.  26.  24.3 20.2 29.  33.  12.  30.5 30.  29.  22.  19.2
 28.  29.  25.  17.5 14.  20.  21.  18.  34.5 30.  34.3 27.2 29.  19.
 28.  19.4 19.  41.5 35.1 11.  23.  27.2 23.9 35.7 25.  28.  15.  36.
 20.  30.  20.6 31.5 23.2 15.  15.  20.  15.5 21.  20.3 15.5 30.  18.2
 11.  14.  31.  26.  13.  32.  31.  21.  21.  25.  17.6 25.  31.  18.6
 22.  24.  14.  18.  16.  23.  15.  20.  20.  37.  29.8 28.  34.1 25.5
 27.  31.  16.  28.  22.  17.7 28.  17.5 32.9 15.  23.9 22.  34.2 17.
 26.  31.6 26.  32.  18.  14.  36.  15.  17.  20.  19.  34.7 13.  30.
 19.2 25.1 24.  21.5 36.  24.  18.  27.  40.8 25.5 33.  39.1 31.8 33.7
 13.  34.  40.9 19.  18.  18.5 17.5 14.  14.  31.  13.  36.1 28.1 21.
 18.  26.  14.  26.  26.  16.5 34.1 21.1 23.  25.8 14.  43.4 27.  25.4
 33.5 23.8 17.  13.  13.  29.5 32.2 22.  16.  14.  31.5 28.  23.  38.
 21.5 14.  26.8 16.5 18.  33.  15.  16.  26.  19.  25.  11.  27.  32.7
 23.6 16.5 1