# Model Building

## Import the necessary packages

In [1]:
import pandas as pd
import numpy as np 
import os 



### Packages to interact with AWS services

In [2]:
from sagemaker.session import Session, get_execution_role

# IAM role that SageMaker jobs launched from this notebook will run under.
role = get_execution_role()

# Session handle and the session's default S3 bucket; later cells build
# their S3 URIs from `bucket`.
sagemaker_session = Session()
bucket = sagemaker_session.default_bucket()

In [14]:
import boto3

# Print the object summary of everything stored in the session bucket.
# NOTE(review): this walks the whole bucket; consider
# `objects.filter(Prefix=...)` if only one folder is of interest.
s3 = boto3.resource('s3')
bucket_contents = s3.Bucket(bucket).objects.all()
for obj in bucket_contents:
    print(obj)

s3.ObjectSummary(bucket_name='sagemaker-eu-central-1-126514844793', key='sagemaker-scikit-learn-2020-04-11-20-42-43-911/source/sourcedir.tar.gz')
s3.ObjectSummary(bucket_name='sagemaker-eu-central-1-126514844793', key='sagemaker-scikit-learn-2020-04-11-20-45-59-086/source/sourcedir.tar.gz')
s3.ObjectSummary(bucket_name='sagemaker-eu-central-1-126514844793', key='sagemaker-scikit-learn-2020-04-11-20-48-12-454/source/sourcedir.tar.gz')
s3.ObjectSummary(bucket_name='sagemaker-eu-central-1-126514844793', key='sagemaker-scikit-learn-2020-04-11-20-48-36-910/source/sourcedir.tar.gz')
s3.ObjectSummary(bucket_name='sagemaker-eu-central-1-126514844793', key='sagemaker-scikit-learn-2020-04-11-20-49-08-681/source/sourcedir.tar.gz')
s3.ObjectSummary(bucket_name='sagemaker-eu-central-1-126514844793', key='sagemaker-scikit-learn-2020-04-11-21-01-34-363/source/sourcedir.tar.gz')
s3.ObjectSummary(bucket_name='sagemaker-eu-central-1-126514844793', key='sagemaker-scikit-learn-2020-04-11-21-02-14-545/debu

## Model Building

## Naive Bayes 

In [5]:
from sagemaker.sklearn import SKLearn

In [6]:
# Scikit-learn estimator that runs `train_sklearn_nb.py` (shipped from the
# local `source_train` directory) on a single ml.m4.xlarge training instance.
# The execution role is the `role` already resolved in the session-setup
# cell, so get_execution_role() is not called a second time.
model = SKLearn(entry_point='train_sklearn_nb.py',
                source_dir='source_train',
                role=role,
                train_instance_count=1,
                train_instance_type='ml.m4.xlarge',
                )

In [7]:
# S3 location of the vectorized training data inside the session bucket.
key = 'udacityCapstone/data/vectorized_traindata.csv'
train_path = f's3://{bucket}/{key}'

# Channel name -> S3 URI mapping handed to the estimator's fit() call.
input_channels = {
    "train": train_path,
}

In [9]:
# Launch the training job on the "train" channel defined above.
model.fit(inputs=input_channels)

2020-04-12 14:22:06 Starting - Starting the training job...
2020-04-12 14:22:09 Starting - Launching requested ML instances...
2020-04-12 14:23:02 Starting - Preparing the instances for training...
2020-04-12 14:23:36 Downloading - Downloading input data...
2020-04-12 14:24:06 Training - Downloading the training image...
2020-04-12 14:24:25 Training - Training image download completed. Training in progress.[34m2020-04-12 14:24:26,082 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2020-04-12 14:24:26,085 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-04-12 14:24:26,097 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2020-04-12 14:24:26,335 sagemaker-containers INFO     Module train_sklearn_nb does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m2020-04-12 14:24:26,336 sagemaker-containers INFO     Generating setup.cfg[0m
[34m2020-04-12 14:24

In [10]:
# Deploy the trained model behind a single-instance ml.c4.xlarge endpoint.
# NOTE(review): no later cell visible here uses `predictor` or tears the
# endpoint down - the predictions below come from a batch transform job.
# Consider calling `predictor.delete_endpoint()` once it is no longer needed.
predictor = model.deploy(
    initial_instance_count=1,
    instance_type='ml.c4.xlarge',
)

-----------!

In [11]:
## Run Batch transform for predictions

In [12]:
# Build the test-set URI from the session bucket, the same way `train_path`
# is built, instead of hard-coding a bucket name that embeds the AWS
# account ID and region.
test_key = 'udacityCapstone/data/vectorized_testdata.csv'
test_path = f's3://{bucket}/{test_key}'

In [13]:
# Batch transformer for the trained model: one ml.m4.xlarge instance,
# CSV responses, output records assembled line by line.
nb_transformer = model.transformer(
    instance_count=1,
    instance_type='ml.m4.xlarge',
    assemble_with='Line',
    accept='text/csv',
)

# Start the transform job over the test CSV, one record per line.
#   input_filter='$[1:]'   -> send every field except the first to the model
#   join_source='Input'    -> join each prediction onto its input record
#   output_filter='$[-1]'  -> keep only the last field of the joined record
nb_transformer.transform(
    test_path,
    content_type='text/csv',
    split_type='Line',
    input_filter='$[1:]',
    join_source='Input',
    output_filter='$[-1]',
)

# Block until the transform job has finished.
nb_transformer.wait()

Using already existing model: sagemaker-scikit-learn-2020-04-12-14-22-06-627


...................[34mProcessing /opt/ml/code[0m
[34mBuilding wheels for collected packages: train-sklearn-nb
  Building wheel for train-sklearn-nb (setup.py): started
  Building wheel for train-sklearn-nb (setup.py): finished with status 'done'
  Created wheel for train-sklearn-nb: filename=train_sklearn_nb-1.0.0-py2.py3-none-any.whl size=7465 sha256=0b04b95dd70e46d84bd65bca0e949b3751e15023a62cbf1140d69b5050653a2e
  Stored in directory: /tmp/pip-ephem-wheel-cache-w5_r_tft/wheels/35/24/16/37574d11bf9bde50616c67372a334f94fa8356bc7164af8ca3[0m
[34mSuccessfully built train-sklearn-nb[0m
[34mInstalling collected packages: train-sklearn-nb[0m
[34mSuccessfully installed train-sklearn-nb-1.0.0[0m
  import imp[0m
[34m[2020-04-12 14:34:57 +0000] [38] [INFO] Starting gunicorn 19.9.0[0m
[34m[2020-04-12 14:34:57 +0000] [38] [INFO] Listening at: unix:/tmp/gunicorn.sock (38)[0m
[34m[2020-04-12 14:34:57 +0000] [38] [INFO] Using worker: gevent[0m
[34m[2020-04-12 14:34:57 +0000] [41]

In [15]:
# Load the batch-transform predictions (one value per line, no header row).
# The SageMaker SDK normally stores `output_path` as a full `s3://...` URI,
# so only prepend the scheme when it is missing; unconditionally prefixing
# it would yield a doubled `s3://s3://...` location.
output_uri = nb_transformer.output_path
if not output_uri.startswith('s3://'):
    output_uri = f's3://{output_uri}'
preds = pd.read_csv(f"{output_uri}/vectorized_testdata.csv.out", header=None)

In [16]:
# Read only column 0 of the test CSV as the ground-truth labels; this is the
# field that `input_filter='$[1:]'` withheld from the model during the batch
# transform. Reuse `test_path` so the labels always come from the same file
# that was scored, rather than repeating the hard-coded URI.
true_labels = pd.read_csv(test_path, usecols=[0], header=None)

In [17]:
# Peek at the first five ground-truth labels.
true_labels.head(n=5)

Unnamed: 0,0
0,0.0
1,1.0
2,0.0
3,1.0
4,1.0


In [18]:
# Peek at the first five predictions for a side-by-side sanity check.
preds.head(n=5)

Unnamed: 0,0
0,0.0
1,1.0
2,1.0
3,0.0
4,1.0


In [27]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [28]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=true_labels, y_pred=preds)

0.5739750445632799

In [29]:
# This model did not perform as well as I had hoped (accuracy is only about 0.57).

In [31]:
# Recall of the predictions against the true labels.
recall_score(y_true=true_labels, y_pred=preds)

0.5450284090909091