In [147]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Dropout, Conv1D
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.losses import binary_crossentropy
from keras.optimizers import Adam
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import keras
import json

In [None]:
with open("categories.json","r") as fp:
    categories = json.load(fp)

### Encode event label

#### Each event has single label

In [137]:
with open("train_fake_singleLabel.json","r") as fp:
    train = json.load(fp)
x = [event['text'] for event in train]
y = [event['category'] for event in train]
print train_y
encoder = LabelEncoder()
encoder.fit(y)
encoded_y = encoder.transform(y)
# convert integers to dummy variables (i.e. one hot encoded)
y = np_utils.to_categorical(encoded_y)
print y

[[1 1]
 [1 0]]
[[0. 1.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]]


#### Each event has multiple labels

In [129]:
with open("train_fake_multiLabel.json","r") as fp:
    train = json.load(fp)
x = [event['text'] for event in train]
y = [event['category'] for event in train]
print y
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y)
print y

[[u'concerts', u'art'], [u'art']]
[[1 1]
 [1 0]]


#### Tokenize event text

In [138]:
tokenizer = Tokenizer(num_words=5000, lower=True)
tokenizer.fit_on_texts(x)
sequences = tokenizer.texts_to_sequences(x)
x = pad_sequences(sequences, maxlen=180, padding="post")

In [139]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=9000)

In [140]:
filter_length = 300
max_words = 5000
num_classes = len(y[0])
maxlen = 180

In [141]:
model = Sequential()
model.add(Embedding(max_words, 20, input_length=maxlen))
model.add(Dropout(0.1))
model.add(Conv1D(filter_length, 3, padding='valid', activation='relu', strides=1))
model.add(GlobalMaxPool1D())
model.add(Dense(num_classes))
model.add(Activation('sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['categorical_accuracy'])
model.summary()

callbacks = [
    ReduceLROnPlateau(),
    EarlyStopping(patience=4),
    ModelCheckpoint(filepath='model-conv1d.h5', save_best_only=True)
]

history = model.fit(x_train, y_train,
                    epochs=20,
                    batch_size=32,
                    validation_split=0.1,
                    callbacks=callbacks)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 180, 20)           100000    
_________________________________________________________________
dropout_6 (Dropout)          (None, 180, 20)           0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 178, 300)          18300     
_________________________________________________________________
global_max_pooling1d_6 (Glob (None, 300)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 602       
_________________________________________________________________
activation_6 (Activation)    (None, 2)                 0         
Total params: 118,902
Trainable params: 118,902
Non-trainable params: 0
_________________________________________________________________
Trai

In [148]:
cnn_model = keras.models.load_model('model-conv1d.h5')
metrics = cnn_model.evaluate(x_test, y_test)
print("{}: {}".format(model.metrics_names[0], metrics[0]))
print("{}: {}".format(model.metrics_names[1], metrics[1]))

loss: 0.644661068916
categorical_accuracy: 0.5
