In [1]:
import csv
import tensorflow as tf
import numpy as np
import pandas as pd
import boto3
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences 

In [2]:
s3 = boto3.resource('s3')
s3.Bucket('docclassification').download_file('datasets/data_hw.csv', 'datasets/data_hw.csv')

In [3]:
DATASET_NAME = "datasets/data_hw.csv"
CSV_COLUMNS=['topic', 'text']
df = pd.read_csv(
      DATASET_NAME,
      names=CSV_COLUMNS,
      skipinitialspace=True, 
      skiprows=1)

In [4]:
df.head()

Unnamed: 0,topic,text
0,DELETION OF INTEREST,e04a09c87692 d6b72e591b91 5d066f0246f1 ed41171...
1,RETURNED CHECK,a3b334c6eefd be95012ebf2b 41d67080e078 ff1c26e...
2,BILL,586242498a88 9ccf259ca087 54709b24b45f 6bf9c0c...
3,BILL,cd50e861f48b 6ca2dd348663 d38820625542 f077614...
4,BILL,9db5536263d8 1c303d15eb65 3f89b4673455 b73e657...


In [5]:
vocab_size = 5000
embedding_dim = 64
max_length = 200
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'
training_portion = .8

In [6]:
def clean_data(inputString):
    try:
        selVal = inputString.split(' ', 1)[0] 
        anyVal = any(char.isdigit() for char in selVal) 
        return anyVal
    except ValueError:
        return False

In [7]:
def preprocess(imdb_df):
    imdb_df = imdb_df[imdb_df.topic!=""]
    imdb_df = imdb_df[imdb_df['topic'].astype(str).apply(lambda x: clean_data(x)== False)]
    imdb_df = imdb_df[imdb_df['text'].astype(str).apply(lambda x: x != '')]
    return imdb_df

In [8]:
data = preprocess(df)
data.describe()

Unnamed: 0,topic,text
count,62204,62159
unique,14,60176
top,BILL,bf064c332aa1 079935e500e5 1a4dd36c6de0 7efa289...
freq,18968,11


In [9]:
topics = []
text = [] 
for i in range(len(data)):
    topics.append(str(data.iloc[i]['topic']).replace(" ", "").replace("-", ""))
    text.append(str(data.iloc[i]['text']))    

In [10]:
train_size = int(len(text) * training_portion)

train_text = text[0: train_size]
train_topics = topics[0: train_size]

validation_text = text[train_size:]
validation_topics = topics[train_size:]

print(train_size)
print(len(train_text))
print(len(train_topics))
print(len(validation_text))
print(len(validation_topics))

49763
49763
49763
12441
12441


In [11]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_text)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(train_text)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

print(len(train_sequences[0]))
print(len(train_padded[0]))

print(len(train_sequences[1]))
print(len(train_padded[1]))

print(len(train_sequences[10]))
print(len(train_padded[10])) 

465
200
403
200
597
200


In [12]:
validation_sequences = tokenizer.texts_to_sequences(validation_text)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

print(len(validation_sequences))
print(validation_padded.shape)

12441
(12441, 200)


In [13]:
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(topics)

training_topics_seq = np.array(label_tokenizer.texts_to_sequences(train_topics))
validation_topics_seq = np.array(label_tokenizer.texts_to_sequences(validation_topics))

In [14]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim,
                             input_length=max_length),
    # specify the number of convolutions that you want to learn, their size, and their activation function.
    # words will be grouped into the size of the filter in this case 5
    tf.keras.layers.Conv1D(128, 5, activation='relu'),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(15, activation='softmax')
])
model.summary()



Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 64)           320000    
_________________________________________________________________
conv1d (Conv1D)              (None, 196, 128)          41088     
_________________________________________________________________
global_average_pooling1d (Gl (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 15)                975       
Total params: 370,

In [15]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [16]:
num_epochs = 5
history = model.fit(train_padded, training_topics_seq, epochs=num_epochs, validation_data=(validation_padded, validation_topics_seq), verbose=2)

Train on 49763 samples, validate on 12441 samples
Epoch 1/5
49763/49763 - 40s - loss: 0.8066 - acc: 0.7484 - val_loss: 0.5651 - val_acc: 0.8329
Epoch 2/5
49763/49763 - 40s - loss: 0.4894 - acc: 0.8485 - val_loss: 0.5187 - val_acc: 0.8421
Epoch 3/5
49763/49763 - 41s - loss: 0.4243 - acc: 0.8659 - val_loss: 0.5340 - val_acc: 0.8374
Epoch 4/5
49763/49763 - 40s - loss: 0.3832 - acc: 0.8782 - val_loss: 0.5148 - val_acc: 0.8454
Epoch 5/5
49763/49763 - 40s - loss: 0.3488 - acc: 0.8880 - val_loss: 0.5334 - val_acc: 0.8427


In [27]:
loss, accuracy = model.evaluate(train_padded, training_topics_seq, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 90.382415
