# Configurações

In [2]:
import numpy as np
import pandas as pd

# Lê o dataset

In [4]:
data = pd.read_csv('./datasets/bank-additional/bank-additional-full.csv', sep=';')
pd.set_option('display.max_columns', 500)     # Make sure we can see all of the columns
pd.set_option('display.max_rows', 50)         # Keep the output on one page
data[:10] # Show the first 10 lines

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
5,45,services,married,basic.9y,unknown,no,no,telephone,may,mon,198,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
6,59,admin.,married,professional.course,no,no,no,telephone,may,mon,139,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
7,41,blue-collar,married,unknown,unknown,no,no,telephone,may,mon,217,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
8,24,technician,single,professional.course,no,yes,no,telephone,may,mon,380,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
9,25,services,single,high.school,no,yes,no,telephone,may,mon,50,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


# Splita o dataset em treino e teste

In [5]:
train_data, test_data, _ = np.split(data.sample(frac=1, random_state=123), 
                                    [int(0.95 * len(data)), int(len(data))]) 

print(train_data.shape, test_data.shape)

# Save to CSV files
train_data.to_csv('automl-train.csv', index=False, header=True, sep=',') # Need to keep column names
test_data.to_csv('automl-test.csv', index=False, header=True, sep=',')

(39128, 21) (2060, 21)


# Upload a parte de treino para o Amazon S3

In [6]:
import sagemaker

prefix = 'sagemaker/DEMO-automl-dm/input'
sess   = sagemaker.Session()

uri = sess.upload_data(path="automl-train.csv", key_prefix=prefix)
print(uri)

s3://sagemaker-us-east-2-325011675573/sagemaker/DEMO-automl-dm/input/automl-train.csv


# AutoPilot
(Agora faça o Auto Pilot com o assistente gráfico, e depois teste o serving para fazer a predição)


# Predizendo aleatório (com o test data set)

In [39]:
import boto3,sys, random

sm_rt = boto3.Session().client('runtime.sagemaker')

In [37]:
with open('automl-test.csv') as f:
    lines = f.readlines()
    print("Records: ", len(lines))

Records:  2061


In [45]:
escolhido = random.randint(1, len(lines))
l = lines[escolhido]

print("Escolhido registro: ", escolhido)
print("Conteúdo: {0}".format(l))
l = l.split(',')  # Split CSV line into features
label = l[-1]     # Store 'yes'/'no' label
l = l[:-1]        # Remove label
l = ','.join(l)   # Rebuild CSV line without label

Escolhido registro:  1688
Conteúdo: 53,blue-collar,married,unknown,unknown,yes,no,cellular,jul,fri,386,2,999,0,nonexistent,1.4,93.91799999999999,-42.7,4.962,5228.1,no



In [49]:
response = sm_rt.invoke_endpoint(EndpointName=ep_name, ContentType='text/csv', Accept='text/csv', Body=l)

response = response['Body'].read().decode("utf-8")
print("Predição: {0}".format(response))

Predição: no



# Predizendo em lote (com o test data set)

In [53]:
ep_name = "ep-automl01"

In [54]:
import boto3,sys

sm_rt = boto3.Session().client('runtime.sagemaker')

In [55]:
tp = tn = fp = fn = count = 0

with open('automl-test.csv') as f:
    lines = f.readlines()
    print("Records: ", len(lines))
    for l in lines[1:]:   # Skip header
        l = l.split(',')  # Split CSV line into features
        label = l[-1]     # Store 'yes'/'no' label
        l = l[:-1]        # Remove label
        l = ','.join(l)   # Rebuild CSV line without label
                
        response = sm_rt.invoke_endpoint(EndpointName=ep_name, ContentType='text/csv', Accept='text/csv', Body=l)

        response = response['Body'].read().decode("utf-8")
        #print ("label %s response %s" %(label,response))

        if 'yes' in label:
            # Sample is positive
            if 'yes' in response:
                # True positive
                tp=tp+1
            else:
                # False negative
                fn=fn+1
        else:
            # Sample is negative
            if 'no' in response:
                # True negative
                tn=tn+1
            else:
                # False positive
                fp=fp+1
        count = count+1
        if (count % 100 == 0):   
            sys.stdout.write(str(count)+' ')
            
print ("Feito")

Records:  2061
100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 Feito


In [57]:
print ("%d %d" % (tn, fp))
print ("%d %d" % (fn, tp))

accuracy  = (tp+tn)/(tp+tn+fp+fn)
precision = tp/(tp+fp)
recall    = tn/(tn+fn)
f1        = (2*precision*recall)/(precision+recall)

print ("""
Acurácia: %.4f 
Precisão: %.4f
Recal...: %.4f
F1......: %.4f""" % (accuracy, precision, recall, f1))

1760 62
118 120

Acurácia: 0.9126 
Precisão: 0.6593
Recal...: 0.9372
F1......: 0.7741


In [58]:
from sagemaker import get_execution_role

role = get_execution_role()
print(role)

arn:aws:iam::325011675573:role/service-role/AmazonSageMaker-ExecutionRole-20200325T162508
