In [1]:
# Basic packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import boto3

import os

In [2]:
!pip install sagemaker==2.38.0

# Sagemaker
import sagemaker



In [3]:
X_train = pd.read_csv('../output/xtrain.csv', header = None)
X_test =  pd.read_csv('../output/xtest.csv', header = None)

y_train = pd.read_csv('../output/ytrain.csv')
y_test =  pd.read_csv('../output/ytest.csv')

In [4]:
X_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,44,1171,23,763,44,1415,454,1,23,2,...,0,0,0,0,0,0,0,0,0,0
1,166,166,13,166,166,22,433,414,5,1,...,0,0,0,0,0,0,0,0,0,0
2,21,42,33,16,12,120,1422,1578,401,195,...,0,0,0,0,0,0,0,0,0,0
3,1569,600,1,663,1562,142,320,172,28,6,...,0,0,0,0,0,0,0,0,0,0
4,11,238,44,330,633,1680,978,61,1,316,...,0,0,0,0,0,0,0,0,0,0


## 2. Upload the data to S3

In [5]:
data_dir = '../output/'

In [6]:
aws_data = pd.concat([y_train, X_train], axis=1)

aws_data.to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)

In [7]:
aws_data.head()

Unnamed: 0,0,0.1,1,2,3,4,5,6,7,8,...,490,491,492,493,494,495,496,497,498,499
0,0.0,7,93,819,1,269,201,1856,180,88,...,0,0,0,0,0,0,0,0,0,0
1,0.0,1368,122,1275,2,2577,758,186,391,93,...,0,0,0,0,0,0,0,0,0,0
2,0.0,994,894,637,849,2152,69,51,40,821,...,0,0,0,0,0,0,0,0,0,0
3,1.0,7,11,51,99,1,48,26,4,3,...,0,0,0,0,0,0,0,0,0,0
4,0.0,1512,1049,316,27,4,178,308,688,11,...,0,0,0,0,0,0,0,0,0,0


### 2.2. Uploading training data

In [8]:
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
role = sagemaker.get_execution_role()

In [9]:
data_dir = "../output"
prefix = 'positiveness-lyrics'

# upload all data to S3
data = sagemaker_session.upload_data(path=data_dir, bucket=bucket, key_prefix=prefix)
print(data)

s3://sagemaker-us-east-2-890904620905/positiveness-lyrics


In [10]:
# confirm that data is in S3 bucket
empty_check = []
for obj in boto3.resource('s3').Bucket(bucket).objects.all():
    empty_check.append(obj.key)
    print(obj.key)

assert len(empty_check) !=0, 'S3 bucket is empty.'
print('Test passed!')

positiveness-lyrics/train.csv
positiveness-lyrics/word_dict.pkl
positiveness-lyrics/xtest.csv
positiveness-lyrics/xtrain.csv
positiveness-lyrics/ytest.csv
positiveness-lyrics/ytrain.csv
sagemaker-scikit-learn-2021-04-26-21-54-29-931/source/sourcedir.tar.gz
sagemaker-scikit-learn-2021-04-26-21-55-14-070/source/sourcedir.tar.gz
sagemaker-scikit-learn-2021-04-26-22-14-28-777/source/sourcedir.tar.gz
Test passed!


### Train Base Model - naiveBayes

In [11]:
# your import and estimator code, here
from sagemaker.sklearn.estimator import SKLearn

output_path = 's3://{}/{}'.format(bucket, prefix)

# instantiate our custom SKLearn estimator
estimator = SKLearn(entry_point='train_base_model.py',
                    source_dir='../utilis',
                    role=role,
                    train_instance_count=1,
                    train_instance_type='ml.c4.xlarge',
                    output_path=output_path,
                    py_version='py3',
                    framework_version='0.23-1',
                    sagemaker_session=sagemaker_session,
                   
                   )

train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [None]:
%%time

# Train your estimator on S3 training data
estimator.fit({'train_base_model': data})

2021-04-26 22:21:06 Starting - Starting the training job...
2021-04-26 22:21:07 Starting - Launching requested ML instancesProfilerReport-1619475665: InProgress
......
2021-04-26 22:22:33 Starting - Preparing the instances for training...