In [45]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import nlp
import random
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pandas as pd

In [46]:
data = pd.read_csv('./data/crime_data_main.csv')

In [47]:
X_train, X_test, y_train, y_test = train_test_split(data['Preprocessed'], data['class'], test_size=0.20, random_state=1, stratify=data['class'])

In [48]:
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

In [49]:
X_train[0]

'rt mayasolov woman complain clean hous amp man alway take trash'

In [50]:
tokenizer.texts_to_sequences([X_train[0]])  

[[3, 1, 97, 580, 567, 218, 19, 32, 100, 65, 16]]

In [51]:
import pickle
with open('./models/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [52]:
maxlen=50
def get_sequences(tokenizer, X_train):
    sequences = tokenizer.texts_to_sequences(X_train)
    padded = pad_sequences(sequences, truncating = 'post', padding='post', maxlen=maxlen)
    return padded

In [53]:
padded_train_sequences = get_sequences(tokenizer, X_train)

In [54]:
padded_train_sequences[0]

array([   3, 9272, 4645,   23,  611, 1145,    9,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0], dtype=int32)

In [55]:
def create_model():
    model = tf.keras.models.Sequential([
        tf.keras.layers.Embedding(10000, 16, input_length=maxlen),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20, return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(3, activation='softmax')
    ])
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )
    return model

In [56]:
model = create_model()
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 50, 16)            160000    
                                                                 
 bidirectional_8 (Bidirectio  (None, 50, 40)           5920      
 nal)                                                            
                                                                 
 bidirectional_9 (Bidirectio  (None, 40)               9760      
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 64)                2624      
                                                                 
 dense_5 (Dense)             (None, 3)                 195       
                                                                 
Total params: 178,499
Trainable params: 178,499
Non-tr

In [57]:
import os

checkpoint_path = "training_1/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a callback that saves the model's weights
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                    save_weights_only=True,
                                                    verbose=1)


In [58]:
val_seq = get_sequences(tokenizer, X_train)
h = model.fit(
     padded_train_sequences, y_train,
     validation_data=(val_seq, y_train),
     epochs=5,
     callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=2), cp_callback]
)

Epoch 1/5
Epoch 1: saving model to training_1/cp.ckpt
Epoch 2/5
Epoch 2: saving model to training_1/cp.ckpt
Epoch 3/5
Epoch 3: saving model to training_1/cp.ckpt
Epoch 4/5
Epoch 4: saving model to training_1/cp.ckpt
Epoch 5/5
Epoch 5: saving model to training_1/cp.ckpt


In [59]:
model_rebuild = create_model()

In [60]:
loss, acc = model_rebuild.evaluate(get_sequences(tokenizer, X_test), y_test)
print("Untrained model, accuracy: {:5.2f}%".format(100*acc))

Untrained model, accuracy: 53.94%


In [61]:
model.load_weights(checkpoint_path)
loss,acc = model.evaluate(get_sequences(tokenizer, X_test), y_test)
print("Restored model, accuracy: {:5.2f}%".format(100*acc))

Restored model, accuracy: 89.11%


In [62]:
model.save('models/tf_crime_model_m1.h5')

In [66]:
model.predict(get_sequences(tokenizer, ['You bitch']))



array([[5.9678070e-03, 9.9333823e-01, 6.9401984e-04]], dtype=float32)

In [65]:
model.predict(get_sequences(tokenizer, ['You suck']))



array([[0.34504184, 0.6052046 , 0.04975358]], dtype=float32)

In [63]:
np.argmax(model.predict(get_sequences(tokenizer, ['You suck'])))



1

In [64]:
np.argmax(model.predict(get_sequences(tokenizer, ['Good day'])))



2