In [22]:
import pickle

import numpy as np
import pandas as pd

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, MaxPooling1D, Flatten, GlobalMaxPool1D, Dropout, Conv1D
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint, CSVLogger
import keras_metrics

In [6]:

# Placeholder binding; the actual network is assigned to `model` in a later cell.
model = None
# Label binarizer instance. NOTE(review): not referenced by any other visible cell — confirm it is still needed.
multilabel_binarizer = MultiLabelBinarizer()


In [24]:
# Read the JSON data file into a DataFrame.
filename = "../data/json_bundle_news_domestic_violence/clean_data.json"
df = pd.read_json(filename)
# The optional clean_news step is left disabled here.

# Model inputs come from the `content` column, targets from `about_domestic_violence`.
sentences = df["content"].values
y = df["about_domestic_violence"].values

# Displayed as the cell result: number of loaded texts.
sentences.shape

(7158,)

In [7]:
# Every sequence is padded/truncated to this many tokens.
maxlen = 900

# 75/25 train/test split with a fixed seed so the partition is reproducible.
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences,
    y,
    test_size=0.25,
    random_state=1000,
)

# Load the previously pickled tokenizer instead of fitting a new one.
# SECURITY: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('../data/neural_network_config/tokenizer.pickle', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# Convert texts to integer sequences, then pad at the end ('post') up to `maxlen`.
X_train = pad_sequences(
    tokenizer.texts_to_sequences(sentences_train),
    padding='post',
    maxlen=maxlen,
)
X_test = pad_sequences(
    tokenizer.texts_to_sequences(sentences_test),
    padding='post',
    maxlen=maxlen,
)

In [18]:
print("creating model")
# NOTE: despite its name, this value is passed as the first Conv1D argument
# (the number of filters); the kernel size of every convolution below is 5.
filter_length = 300
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

# Binary text classifier:
#   embedding (20 dims) -> dropout -> 2x Conv1D -> max-pool(5) -> 2x Conv1D
#   -> global max-pool -> single sigmoid unit.
model = Sequential()
model.add(Embedding(vocab_size, 20, input_length=maxlen))
model.add(Dropout(0.1))
model.add(Conv1D(filter_length, 5, activation='relu'))
model.add(Conv1D(filter_length, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Conv1D(filter_length, 5, activation='relu'))
model.add(Conv1D(filter_length, 5, activation='relu'))
model.add(GlobalMaxPool1D())
model.add(Dense(1, activation='sigmoid'))

# Single sigmoid output, so binary cross-entropy is the loss; precision and
# recall are tracked in addition to accuracy.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy', keras_metrics.precision(), keras_metrics.recall()])


creating model


In [23]:

print("training model")

# Per-epoch metrics are written to a ';'-separated CSV; append=False starts the file afresh.
csv_logger = CSVLogger('log_loss.csv', append=False, separator=';')
# With save_best_only=True the file is only rewritten when the monitored quantity improves.
best_checkpoint = ModelCheckpoint(
    filepath='../data/neural_network_config/temp-model.h5',
    save_best_only=True,
)
callbacks = [best_checkpoint, csv_logger]

# NOTE(review): the test split is also used as validation data, so checkpoint
# selection is driven by the test set — confirm this is intended.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=100,
    epochs=40,
    validation_data=(X_test, y_test),
    callbacks=callbacks,
)


training model
Epoch 1/40
Instructions for updating:
`inputs` is now automatically inferred
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [None]:
print("saving model")

# The architecture is stored as JSON, separately from the weights.
model_json = model.to_json()
with open("../data/neural_network_config/model.json", "w") as architecture_file:
    architecture_file.write(model_json)

# Weights are stored in HDF5 format.
# NOTE(review): this saves the weights held by `model` when this cell runs, not
# the best-epoch file written by the ModelCheckpoint callback — confirm intent.
model.save_weights("../data/neural_network_config/model.h5")
print("Saved model to disk")