In [1]:
import csv
import os

import boto3
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Fetch the sample dataset from S3 into the local datasets/ folder.
# download_file does not create missing parent directories, so make sure the
# target folder exists before downloading.
# NOTE: this cell needs AWS credentials to be configured; the recorded run
# failed with NoCredentialsError.
os.makedirs('datasets', exist_ok=True)

s3 = boto3.resource('s3')
s3.Bucket('docclassification').download_file('datasets/data_hw_sample.csv', 'datasets/data_hw_sample.csv')

NoCredentialsError: Unable to locate credentials

In [None]:
DATASET_NAME = "datasets/data_hw_sample.csv"
CSV_COLUMNS = ['topic', 'text']

# The first line of the file is skipped and the two columns are named
# explicitly; whitespace following each delimiter is ignored.
df = pd.read_csv(DATASET_NAME, names=CSV_COLUMNS, skiprows=1, skipinitialspace=True)

In [None]:
# Preview the first rows to check that the CSV parsed into the topic/text columns.
df.head()

In [None]:
# Text-vectorisation and model settings shared by the cells below.
vocab_size = 5000        # tokenizer num_words and embedding input size
embedding_dim = 64       # width of each embedding vector
max_length = 200         # length every sequence is padded/truncated to
trunc_type = "post"      # truncation side passed to pad_sequences
padding_type = "post"    # padding side passed to pad_sequences
oov_tok = "<OOV>"        # token the tokenizer uses for out-of-vocabulary words
training_portion = 0.8   # fraction of rows used for training

In [None]:
def clean_data(inputString):
    """Report whether the first space-delimited token of the input has a digit.

    Args:
        inputString: Value to inspect, normally a topic string.

    Returns:
        bool: True when the text before the first space contains at least one
        digit character. False otherwise, including for the empty string and
        for non-string input such as None or NaN.
    """
    # str.split with a separator never raises ValueError, so the previous
    # except clause could not fire; non-string values raised AttributeError
    # instead. Treat them explicitly as "no digit".
    if not isinstance(inputString, str):
        return False
    first_token = inputString.split(' ', 1)[0]
    return any(char.isdigit() for char in first_token)

In [None]:
def preprocess(imdb_df):
    """Drop rows whose topic or text is unusable for training.

    A row is removed when its topic or text is missing or empty, or when
    clean_data() reports that the first token of the topic contains a digit.

    Args:
        imdb_df: DataFrame with 'topic' and 'text' columns.

    Returns:
        The filtered DataFrame. The original index is kept (not reset).
    """
    # read_csv loads empty fields as NaN, which the empty-string checks below
    # would let through (and astype(str) would turn into the literal "nan").
    imdb_df = imdb_df.dropna(subset=['topic', 'text'])
    imdb_df = imdb_df[imdb_df.topic != ""]

    # astype(bool) keeps the mask boolean even when the frame is empty.
    topic_has_digit = imdb_df['topic'].astype(str).map(clean_data).astype(bool)
    imdb_df = imdb_df[~topic_has_digit]

    imdb_df = imdb_df[imdb_df['text'].astype(str) != '']
    return imdb_df

In [None]:
# Apply the row filters defined in preprocess() and summarise what remains.
data = preprocess(df)
data.describe()

In [None]:
# Build plain Python lists of labels and documents.
# Iterating the columns directly avoids constructing a row Series through
# data.iloc[i] for every record.
# Spaces and hyphens are stripped so that each topic is a single token.
topics = [str(topic).replace(" ", "").replace("-", "") for topic in data['topic']]
text = [str(document) for document in data['text']]

In [None]:
# Leading training_portion of the rows is used for training, the remainder
# for validation. Row order is preserved; nothing is shuffled here.
train_size = int(len(text) * training_portion)

train_text, validation_text = text[:train_size], text[train_size:]
train_topics, validation_topics = topics[:train_size], topics[train_size:]

# Print the split sizes, one per line, as a sanity check.
print(
    train_size,
    len(train_text),
    len(train_topics),
    len(validation_text),
    len(validation_topics),
    sep='\n',
)

In [None]:
# Fit the vocabulary on the training documents only.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(train_text)
word_index = tokenizer.word_index

# Convert documents to integer sequences and bring them all to max_length.
train_sequences = tokenizer.texts_to_sequences(train_text)
train_padded = pad_sequences(
    train_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# Spot-check a few samples: raw sequence length followed by padded length.
for sample_idx in (0, 1, 10):
    print(len(train_sequences[sample_idx]))
    print(len(train_padded[sample_idx]))

In [None]:
# Encode the validation documents with the tokenizer fitted on the training set.
validation_sequences = tokenizer.texts_to_sequences(validation_text)
validation_padded = pad_sequences(
    validation_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# Number of validation sequences, then the shape of the padded array.
print(len(validation_sequences), validation_padded.shape, sep='\n')

In [None]:
# Map topic names to integer ids with a second tokenizer fitted on every topic.
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(topics)

train_label_ids = label_tokenizer.texts_to_sequences(train_topics)
validation_label_ids = label_tokenizer.texts_to_sequences(validation_topics)

training_topics_seq = np.array(train_label_ids)
validation_topics_seq = np.array(validation_label_ids)

In [None]:
# Size the output layer from the labels instead of hard-coding it.
# Keras Tokenizer ids start at 1 (0 is reserved), so the softmax needs one
# more unit than there are distinct topics.
num_classes = len(label_tokenizer.word_index) + 1

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim,
                              input_length=max_length),
    # 128 convolution filters, each spanning a window of 5 consecutive words.
    tf.keras.layers.Conv1D(128, 5, activation='relu'),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])
model.summary()



In [None]:
# Labels are integer ids, hence the sparse variant of categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train on the padded training set and evaluate on the validation set each epoch.
num_epochs = 5
history = model.fit(
    train_padded,
    training_topics_seq,
    epochs=num_epochs,
    validation_data=(validation_padded, validation_topics_seq),
    verbose=2,
)

In [None]:
# pad_sequences returns a NumPy array, which has neither .iloc nor .to_csv.
# Wrap it in a DataFrame so the positional slicing here and the CSV export
# in the following cell work.
# NOTE(review): only the validation sequences are split here and no topic
# labels are included - confirm this is the intended input for the training job.
validation_padded_df = pd.DataFrame(validation_padded)

# 70% train / 15% validation / remaining rows test, in row order.
trainsize = int(len(validation_padded_df) * 0.7)
validsize = int(len(validation_padded_df) * 0.15)

df_train = validation_padded_df.iloc[:trainsize, :]
df_valid = validation_padded_df.iloc[trainsize:(trainsize + validsize), :]
df_test = validation_padded_df.iloc[(trainsize + validsize):, :]

In [None]:
# Write each split as a bare CSV: no index column and no header row.
for split_frame, csv_path in (
    (df_train, 'datasets/data-train.csv'),
    (df_valid, 'datasets/data-valid.csv'),
    (df_test, 'datasets/data-test.csv'),
):
    split_frame.to_csv(csv_path, index=False, header=False)

In [None]:
# Upload the three split files; each S3 key mirrors its local relative path.
bucket = s3.Bucket('docclassification')
for artifact_key in (
    'datasets/data-train.csv',
    'datasets/data-valid.csv',
    'datasets/data-test.csv',
):
    bucket.upload_file(artifact_key, artifact_key)

In [None]:
from sagemaker import get_execution_role

# S3 location where the custom training code is uploaded (tar.gz format).
custom_code_upload_location = 's3://docclassification/customcode/tensorflow_doc'

# S3 location where the results of model training are saved.
model_artifacts_location = 's3://docclassification/artifacts'

# IAM execution role that gives SageMaker access to resources in the AWS account.
role = get_execution_role()

In [None]:
from sagemaker.tensorflow import TensorFlow

# Configure the training job: sequential.py is the entry-point script, run on a
# single ml.c4.xlarge instance, with artifacts and code stored at the S3
# locations defined earlier.
# NOTE(review): train_instance_count / train_instance_type / training_steps /
# evaluation_steps look like SageMaker Python SDK v1 argument names - confirm
# they match the installed SDK version.
iris_estimator = TensorFlow(entry_point='sequential.py',
                            role=role,
                            output_path=model_artifacts_location,
                            code_location=custom_code_upload_location,
                            train_instance_count=1,
                            train_instance_type= 'ml.c4.xlarge',
                            training_steps=1000,
                            evaluation_steps=100)

In [None]:
# S3 prefix under which the split CSV files were uploaded.
train_data_location = 's3://docclassification/datasets/'

In [None]:
# Start training with the estimator, passing the S3 prefix as its input location.
iris_estimator.fit(train_data_location)

In [None]:
# Deploy the trained estimator on one ml.m4.xlarge instance; the returned
# predictor object is kept in doc_predictor.
doc_predictor = iris_estimator.deploy(initial_instance_count=1,
                                       instance_type='ml.m4.xlarge' )