# An Example of Calling Batch Transform Job on the model SageMaker estimator made

In [24]:
%store -r

In [25]:
import boto3

from sagemaker.transformer import Transformer
import pandas as pd
import sagemaker

from time import strftime, gmtime

sess = sagemaker.Session()

role = sagemaker.get_execution_role()
client = boto3.client('sagemaker')

timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
transform_job_name = 'churn-batch-transform-' + timestamp_prefix

output_prefix = 'output'
output_path = 's3://{}/{}/{}'.format(bucket, prefix, output_prefix)

# Make Transformer object
transformer = Transformer(base_transform_job_name = transform_job_name,
                          model_name = model_name,
                          instance_count=1,
                          instance_type='ml.m4.xlarge',
                          output_path= output_path
                         )



In [26]:
import os

s3_input_test_url = s3_input_test.config['DataSource']['S3DataSource']['S3Uri']
s3_input_test_url = os.path.join(s3_input_test_url, 'batch_transform_test.csv')
print(s3_input_test_url)

s3://sagemaker-us-east-2-057716757052/sagemaker/customer-churn/rawtest/batch_transform_test.csv


In [27]:
! aws s3 ls {s3_input_test_url} --recursive

2020-07-15 14:39:18      29043 sagemaker/customer-churn/rawtest/batch_transform_test.csv


In [28]:

transformer.transform(s3_input_test_url,
                      content_type='text/csv'
                     )



In [29]:
transformer.wait()

..........................[34mProcessing /opt/ml/code[0m
[34mBuilding wheels for collected packages: preprocessing
  Building wheel for preprocessing (setup.py): started[0m
[35mProcessing /opt/ml/code[0m
[35mBuilding wheels for collected packages: preprocessing
  Building wheel for preprocessing (setup.py): started[0m
[32mArguments: serve[0m
[32m[2020-07-16 00:21:50 +0000] [1] [INFO] Starting gunicorn 19.7.1[0m
[32m[2020-07-16 00:21:50 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[32m[2020-07-16 00:21:50 +0000] [1] [INFO] Using worker: gevent[0m
[32m[2020-07-16 00:21:50 +0000] [38] [INFO] Booting worker with pid: 38[0m
[32m[2020-07-16 00:21:50 +0000] [39] [INFO] Booting worker with pid: 39[0m
[32m[2020-07-16 00:21:50 +0000] [40] [INFO] Booting worker with pid: 40[0m
[32m[2020-07-16 00:21:51 +0000] [41] [INFO] Booting worker with pid: 41[0m
[32m[2020-07-16:00:21:51:INFO] Model loaded successfully for worker : 38[0m
[32m[2020-07-16:00:21:51:INFO] 

In [30]:
! aws s3 ls {output_path} --recursive

2020-07-15 07:10:13       1961 sagemaker/customer-churn/output-transform-test/batch_transform_test.csv.out
2020-07-16 00:22:02       1961 sagemaker/customer-churn/output/batch_transform_test.csv.out
2020-07-15 06:40:43      34149 sagemaker/customer-churn/output/xgboost-2020-07-15-06-37-54-125/output/model.tar.gz
2020-07-15 08:27:16      34148 sagemaker/customer-churn/output/xgboost-2020-07-15-08-24-03-185/output/model.tar.gz
2020-07-15 09:30:57      34150 sagemaker/customer-churn/output/xgboost-2020-07-15-09-28-18-062/output/model.tar.gz
2020-07-15 14:43:05      34148 sagemaker/customer-churn/output/xgboost-2020-07-15-14-39-57-529/output/model.tar.gz
2020-07-16 00:16:00      34148 sagemaker/customer-churn/output/xgboost-2020-07-16-00-12-32-654/output/model.tar.gz


In [31]:
import os
os.makedirs('output', exist_ok=True)

! aws s3 cp {output_path}/batch_transform_test.csv.out 'output/batch_transform_test_output.csv'

download: s3://sagemaker-us-east-2-057716757052/sagemaker/customer-churn/output/batch_transform_test.csv.out to output/batch_transform_test_output.csv


In [32]:

df = pd.read_csv("output/batch_transform_test_output.csv", names=["Churn"])

# print(df.info)
# df

df.Churn.value_counts()

False    291
True      43
Name: Churn, dtype: int64

## 실제 데이타와 예측 결과 비교

In [33]:
df_true = pd.read_csv("churn_data/batch_transform_test_true.csv", header=None)
df_true_label = df_true.iloc[:,-1]

In [34]:
# Replace "False." --> False, "True." --> True
def clean_raw_data(churn_raw):
    df_churn = churn_raw.copy()
#     df_churn = df_churn.replace({'Churn?': {'True.' : 'True', 'False.' : 'False'}})
    df_churn = df_churn.replace({'True.' : True, 'False.' : False})
    
    return df_churn

df_true = clean_raw_data(df_true_label)
df_true.value_counts()

False    286
True      48
Name: 20, dtype: int64

In [35]:
data = { 'y_actual' : df.Churn.to_list(),
         'y_predict' : df_true.to_list()
       }
df_churn = pd.DataFrame(data, columns=['y_actual','y_predict' ])
df_churn.head(3)

Unnamed: 0,y_actual,y_predict
0,False,False
1,False,False
2,False,False


In [36]:
confusion_matrix = pd.crosstab(df_churn['y_actual'], 
                               df_churn['y_predict'], 
                               rownames=['Actual'],
                               colnames=['Predict'])
confusion_matrix

Predict,False,True
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
False,282,9
True,4,39


In [37]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
print("f1 metric: ", f1_score(df_churn['y_actual'],df_churn['y_predict'] ))
print("precision : ", precision_score(df_churn['y_actual'],df_churn['y_predict'] ))
print("recall_score : ", recall_score(df_churn['y_actual'],df_churn['y_predict'] ))
print("accuracy: ", accuracy_score(df_churn['y_actual'],df_churn['y_predict'] ))

f1 metric:  0.8571428571428572
precision :  0.8125
recall_score :  0.9069767441860465
accuracy:  0.9610778443113772
