### Batch Transform on input data with Deployed Model

#### Pulling in best model

In [6]:
import sagemaker
import boto3
from sagemaker import get_execution_role


region = boto3.Session().region_name
session = sagemaker.Session()
le = get_execution_role()

# This is the client we will use to interact with SageMaker AutoPilot
sm = boto3.Session().client(service_name='sagemaker',region_name=region)

# auto_ml_job_name = tuning-job-1-02126272d1024e56bc-178-233ae13b-aws-trial
best_candidate = sm.describe_auto_ml_job(AutoMLJobName='Funders-USA-SEC-data-MVP-1')['BestCandidate']
display(best_candidate)

{'CandidateName': 'tuning-job-1-02126272d1024e56bc-178-233ae13b',
 'FinalAutoMLJobObjectiveMetric': {'MetricName': 'validation:accuracy',
  'Value': 0.5882300138473511},
 'ObjectiveStatus': 'Succeeded',
 'CandidateSteps': [{'CandidateStepType': 'AWS::SageMaker::ProcessingJob',
   'CandidateStepArn': 'arn:aws:sagemaker:us-east-1:570124035543:processing-job/db-1-536cceb09e614d6caa492e76de253c0340dbe419674d48dbb465b7a4b1',
   'CandidateStepName': 'db-1-536cceb09e614d6caa492e76de253c0340dbe419674d48dbb465b7a4b1'},
  {'CandidateStepType': 'AWS::SageMaker::TrainingJob',
   'CandidateStepArn': 'arn:aws:sagemaker:us-east-1:570124035543:training-job/funders-us-dpp0-1-58ce671cb58f43028d8f860f32960e945b9a210e4b8c4',
   'CandidateStepName': 'Funders-US-dpp0-1-58ce671cb58f43028d8f860f32960e945b9a210e4b8c4'},
  {'CandidateStepType': 'AWS::SageMaker::TransformJob',
   'CandidateStepArn': 'arn:aws:sagemaker:us-east-1:570124035543:transform-job/funders-us-dpp0-csv-1-2d7dfb3d81a74ce5bd25436f3683c67639a3

In [168]:
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer
import pandas as pd
import numpy as np


predictor = Predictor(
    endpoint_name='Funders-USA-SEC-data-MVP-1-ep-med1',
    sagemaker_session=session,
    serializer=CSVSerializer(),
    deserializer=CSVDeserializer())

# Load in ground truth dataset
ds_g_truth_path = '/root/Funders-USA-ML-Backend/datasets/datasets_SEC/SEC-MVP-1-Dataset.csv'
df_gt = pd.read_csv(ds_g_truth_path, header=0)
display(df_gt.head(2))
display(len(df_gt))

# Load in dataset with no target column
path = '/root/Funders-USA-ML-Backend/datasets/datasets_SEC/SEC-MVP-1-Dataset-no-target-col.csv'
df_no_tar_col = pd.read_csv(path)
display(df_no_tar_col.head(2))


    


Unnamed: 0,Funding_Success,OFFERINGAMOUNT,MAXIMUMOFFERINGAMOUNT,CURRENTEMPLOYEES,TOTALASSETMOSTRECENTFISCALYEAR,TOTALASSETPRIORFISCALYEAR,CASHEQUIMOSTRECENTFISCALYEAR,CASHEQUIPRIORFISCALYEAR,ACTRECEIVEDRECENTFISCALYEAR,ACTRECEIVEDPRIORFISCALYEAR,SHORTTERMDEBTMRECENTFISCALYEAR,SHORTTERMDEBTPRIORFISCALYEAR,LONGTERMDEBTRECENTFISCALYEAR,LONGTERMDEBTPRIORFISCALYEAR,NETINCOMEMOSTRECENTFISCALYEAR,NETINCOMEPRIORFISCALYEAR,Form_C_Number
0,1,275000.0,500000.0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,1,400000.0,1000000.0,3,230909.0,441532.0,193376.0,432140.0,-1097.0,0.0,589120.0,569251.0,230909.0,441532.0,-298573.0,-361237.0,1


1702

Unnamed: 0,OFFERINGAMOUNT,MAXIMUMOFFERINGAMOUNT,CURRENTEMPLOYEES,TOTALASSETMOSTRECENTFISCALYEAR,TOTALASSETPRIORFISCALYEAR,CASHEQUIMOSTRECENTFISCALYEAR,CASHEQUIPRIORFISCALYEAR,ACTRECEIVEDRECENTFISCALYEAR,ACTRECEIVEDPRIORFISCALYEAR,SHORTTERMDEBTMRECENTFISCALYEAR,SHORTTERMDEBTPRIORFISCALYEAR,LONGTERMDEBTRECENTFISCALYEAR,LONGTERMDEBTPRIORFISCALYEAR,NETINCOMEMOSTRECENTFISCALYEAR,NETINCOMEPRIORFISCALYEAR,Form_C_Number
0,275000.0,500000.0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,400000.0,1000000.0,3,230909.0,441532.0,193376.0,432140.0,-1097.0,0.0,589120.0,569251.0,230909.0,441532.0,-298573.0,-361237.0,1


In [84]:
# Obtain predictions from SageMaker endpoint
prediction = predictor.predict(df_no_tar_col.to_csv(sep=',', header=None, index=False))


In [85]:
prediction_array = np.array(prediction)
display(prediction_array)
display(prediction_array[:,0])
only_preds = prediction_array[:,0]

array([['0', "['1', '0']", '[0.4528607130050659, 0.5471392869949341]'],
       ['1', "['1', '0']", '[0.6817927956581116, 0.3182072043418884]'],
       ['1', "['1', '0']", '[0.5831929743289948, 0.41680702567100525]'],
       ...,
       ['1', "['1', '0']", '[0.6199167370796204, 0.38008326292037964]'],
       ['1', "['1', '0']", '[0.5245495438575745, 0.47545045614242554]'],
       ['1', "['1', '0']", '[0.6412034630775452, 0.35879653692245483]']],
      dtype='<U41')

array(['0', '1', '1', ..., '1', '1', '1'], dtype='<U41')

In [169]:
# # Load prediction in pandas and compare to ground truth
prediction_df = pd.DataFrame(only_preds)
display(prediction_df.head(2))

df_gt.insert(1,'Prediction', prediction_df)
df_gt = df_gt.astype({'Prediction': int})

    


Unnamed: 0,0
0,0
1,1


In [173]:
xdf_gt = df_gt.copy()
display(xdf_gt.head(5))
df_yes_no = xdf_gt['Funding_Success'] == xdf_gt['Prediction']
display(df_yes_no.head(2))
xdf_gt.insert(2,'Correct', df_yes_no)
xdf_gt.loc[xdf_gt.Correct == False, 'Correct'] = 'No'
xdf_gt.loc[xdf_gt.Correct == True, 'Correct'] = 'Yes'
display(xdf_gt.head(5))




Unnamed: 0,Funding_Success,Prediction,OFFERINGAMOUNT,MAXIMUMOFFERINGAMOUNT,CURRENTEMPLOYEES,TOTALASSETMOSTRECENTFISCALYEAR,TOTALASSETPRIORFISCALYEAR,CASHEQUIMOSTRECENTFISCALYEAR,CASHEQUIPRIORFISCALYEAR,ACTRECEIVEDRECENTFISCALYEAR,ACTRECEIVEDPRIORFISCALYEAR,SHORTTERMDEBTMRECENTFISCALYEAR,SHORTTERMDEBTPRIORFISCALYEAR,LONGTERMDEBTRECENTFISCALYEAR,LONGTERMDEBTPRIORFISCALYEAR,NETINCOMEMOSTRECENTFISCALYEAR,NETINCOMEPRIORFISCALYEAR,Form_C_Number
0,1,0,275000.0,500000.0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,1,1,400000.0,1000000.0,3,230909.0,441532.0,193376.0,432140.0,-1097.0,0.0,589120.0,569251.0,230909.0,441532.0,-298573.0,-361237.0,1
2,1,1,50000.0,1000000.0,10,5015954.0,0.0,1250.0,0.0,0.0,0.0,267070.0,0.0,0.0,0.0,-230547.0,0.0,1
3,1,0,250000.0,1000000.0,3,10170.0,7321.0,10170.0,7321.0,0.0,0.0,0.0,0.0,0.0,0.0,-36772.0,-2250.0,1
4,1,0,500000.0,1000000.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


0    False
1     True
dtype: bool

Unnamed: 0,Funding_Success,Prediction,Correct,OFFERINGAMOUNT,MAXIMUMOFFERINGAMOUNT,CURRENTEMPLOYEES,TOTALASSETMOSTRECENTFISCALYEAR,TOTALASSETPRIORFISCALYEAR,CASHEQUIMOSTRECENTFISCALYEAR,CASHEQUIPRIORFISCALYEAR,ACTRECEIVEDRECENTFISCALYEAR,ACTRECEIVEDPRIORFISCALYEAR,SHORTTERMDEBTMRECENTFISCALYEAR,SHORTTERMDEBTPRIORFISCALYEAR,LONGTERMDEBTRECENTFISCALYEAR,LONGTERMDEBTPRIORFISCALYEAR,NETINCOMEMOSTRECENTFISCALYEAR,NETINCOMEPRIORFISCALYEAR,Form_C_Number
0,1,0,No,275000.0,500000.0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,1,1,Yes,400000.0,1000000.0,3,230909.0,441532.0,193376.0,432140.0,-1097.0,0.0,589120.0,569251.0,230909.0,441532.0,-298573.0,-361237.0,1
2,1,1,Yes,50000.0,1000000.0,10,5015954.0,0.0,1250.0,0.0,0.0,0.0,267070.0,0.0,0.0,0.0,-230547.0,0.0,1
3,1,0,No,250000.0,1000000.0,3,10170.0,7321.0,10170.0,7321.0,0.0,0.0,0.0,0.0,0.0,0.0,-36772.0,-2250.0,1
4,1,0,No,500000.0,1000000.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [174]:
display(xdf_gt.head(50))

Unnamed: 0,Funding_Success,Prediction,Correct,OFFERINGAMOUNT,MAXIMUMOFFERINGAMOUNT,CURRENTEMPLOYEES,TOTALASSETMOSTRECENTFISCALYEAR,TOTALASSETPRIORFISCALYEAR,CASHEQUIMOSTRECENTFISCALYEAR,CASHEQUIPRIORFISCALYEAR,ACTRECEIVEDRECENTFISCALYEAR,ACTRECEIVEDPRIORFISCALYEAR,SHORTTERMDEBTMRECENTFISCALYEAR,SHORTTERMDEBTPRIORFISCALYEAR,LONGTERMDEBTRECENTFISCALYEAR,LONGTERMDEBTPRIORFISCALYEAR,NETINCOMEMOSTRECENTFISCALYEAR,NETINCOMEPRIORFISCALYEAR,Form_C_Number
0,1,0,No,275000.0,500000.0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,1,1,Yes,400000.0,1000000.0,3,230909.0,441532.0,193376.0,432140.0,-1097.0,0.0,589120.0,569251.0,230909.0,441532.0,-298573.0,-361237.0,1
2,1,1,Yes,50000.0,1000000.0,10,5015954.0,0.0,1250.0,0.0,0.0,0.0,267070.0,0.0,0.0,0.0,-230547.0,0.0,1
3,1,0,No,250000.0,1000000.0,3,10170.0,7321.0,10170.0,7321.0,0.0,0.0,0.0,0.0,0.0,0.0,-36772.0,-2250.0,1
4,1,0,No,500000.0,1000000.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
5,1,1,Yes,99840.0,901888.0,5,1153234.0,694289.0,58131.0,1000.0,0.0,0.0,79397.0,108455.0,0.0,0.0,-1922699.0,-1079379.0,3
6,0,0,Yes,100000.0,1000000.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
7,0,0,Yes,100000.0,1000000.0,0,54008.0,54129.0,2156.0,4414.0,5180.0,0.0,0.0,0.0,87504.0,74269.0,43556.0,87834.0,1
8,0,0,Yes,80000.0,100000.0,2,35873.0,0.0,27303.0,0.0,0.0,0.0,10202.0,0.0,63799.0,0.0,-48127.0,0.0,1
9,1,1,Yes,20000.0,100000.0,2,12461.0,35873.0,3877.0,27303.0,0.0,0.0,0.0,0.0,92150.0,63799.0,-102832.0,-48127.0,2


In [114]:
accuracy = (df_gt.reset_index()['Funding_Success'] == df_gt['Prediction']).sum() / len(df_gt)
print('Accuracy: {}'.format(accuracy))

Accuracy: 0.0
