In [13]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import nlp
import random
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pandas as pd

In [14]:
data = pd.read_csv('./data/crime_data_main.csv')

In [15]:
X_train, X_test, y_train, y_test = train_test_split(data['Preprocessed'], data['class'], test_size=0.20, random_state=1, stratify=data['class'])

In [16]:
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

In [17]:
X_train[0]

'rt mayasolov woman complain clean hous amp man alway take trash'

In [18]:
tokenizer.texts_to_sequences([X_train[0]])

[[3, 1, 97, 580, 567, 218, 19, 32, 100, 65, 16]]

In [19]:
maxlen=50
def get_sequences(tokenizer, X_train):
    sequences = tokenizer.texts_to_sequences(X_train)
    padded = pad_sequences(sequences, truncating = 'post', padding='post', maxlen=maxlen)
    return padded

In [20]:
padded_train_sequences = get_sequences(tokenizer, X_train)

In [21]:
padded_train_sequences[0]

array([   3, 9272, 4645,   23,  611, 1145,    9,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0], dtype=int32)

In [25]:
def create_model():
    model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(10000,16,input_length=maxlen),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20)),
    tf.keras.layers.Dense(6, activation='softmax')
    ])
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )
    return model

In [26]:
model = create_model()
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 50, 16)            160000    
                                                                 
 bidirectional_4 (Bidirectio  (None, 50, 40)           5920      
 nal)                                                            
                                                                 
 bidirectional_5 (Bidirectio  (None, 40)               9760      
 nal)                                                            
                                                                 
 dense_2 (Dense)             (None, 6)                 246       
                                                                 
Total params: 175,926
Trainable params: 175,926
Non-trainable params: 0
_________________________________________________________________


In [23]:
import os

checkpoint_path = "training_1/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a callback that saves the model's weights
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                    save_weights_only=True,
                                                    verbose=1)


In [24]:
val_seq = get_sequences(tokenizer, X_train)
h = model.fit(
     padded_train_sequences, y_train,
     validation_data=(val_seq, y_train),
     epochs=5,
     callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=2), cp_callback]
)

Epoch 1/5
Epoch 1: saving model to training_1/cp.ckpt
Epoch 2/5
Epoch 2: saving model to training_1/cp.ckpt
Epoch 3/5
Epoch 3: saving model to training_1/cp.ckpt
Epoch 4/5
Epoch 4: saving model to training_1/cp.ckpt
Epoch 5/5
Epoch 5: saving model to training_1/cp.ckpt


In [27]:
model_rebuild = create_model()

In [28]:
loss, acc = model_rebuild.evaluate(get_sequences(tokenizer, X_test), y_test)
print("Untrained model, accuracy: {:5.2f}%".format(100*acc))

Untrained model, accuracy: 53.80%


In [29]:
model.load_weights(checkpoint_path)
loss,acc = model.evaluate(get_sequences(tokenizer, X_test), y_test)
print("Restored model, accuracy: {:5.2f}%".format(100*acc))

Restored model, accuracy: 87.71%
