In [2]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Dropout, Conv1D
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.losses import binary_crossentropy
from keras.optimizers import Adam
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.models import model_from_json
import keras
import json

Using TensorFlow backend.


### Load the category file

The category file contains all category classes we designed. Right now they are "concerts",
 "sports",
 "shows",
 "comedy",
 "art",
 "nightlife",
 "family",
 and "professional"

In [37]:
with open("categories.json","r") as fp:
    categories = json.load(fp)
print categories

[u'Concerts', u'Sports', u'Shows', u'Comedy', u'Art', u'Nightlife', u'Family', u'Professional', u'Food&Drink']


### Encode event label

#### Train with events with single label

In [50]:
encoder.classes_

array([u'Art', u'Comedy', u'Concerts', u'Family', u'Food&Drink',
       u'Nightlife', u'Professional', u'Shows', u'Sports'], dtype='<U12')

In [57]:
encoder.inverse_transform([0,1,2,3,4,5,6])

  if diff:


u'Art'

In [40]:
with open("LabeledEvent.json","r") as fp:
    train = json.load(fp)
x = [event['description']+" "+event['title'] for event in train if 'description' in event and 'title' in event]
y = [event['primaryCategory'] for event in train if 'primaryCategory' in event]
print y
encoder = LabelEncoder()
encoder.fit(categories)
encoded_y = encoder.transform(y)
# convert integers to dummy variables (i.e. one hot encoded)
y = np_utils.to_categorical(encoded_y)
print y

[u'Food&Drink', u'Professional', u'Food&Drink', u'Sports', u'Shows', u'Sports', u'Food&Drink', u'Food&Drink', u'Food&Drink', u'Professional', u'Family', u'Art', u'Professional', u'Nightlife', u'Sports', u'Food&Drink', u'Shows', u'Shows', u'Shows', u'Shows', u'Professional', u'Shows', u'Professional', u'Professional', u'Professional', u'Art', u'Professional', u'Nightlife', u'Shows']
[[0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0

#### Train with event with multiple labels

In [39]:
# with open("train_fake_multiLabel.json","r") as fp:
#     train = json.load(fp)
# x = [event['text'] for event in train]
# y = [event['category'] for event in train]
# print y
# mlb = MultiLabelBinarizer()
# y = mlb.fit_transform(y)
# print y

[[u'concerts', u'art'], [u'art']]
[[1 1]
 [1 0]]


#### Tokenize event text

In [41]:
tokenizer = Tokenizer(num_words=5000, lower=True)
tokenizer.fit_on_texts(x)
sequences = tokenizer.texts_to_sequences(x)
x = pad_sequences(sequences, maxlen=180, padding="post")

In [42]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=9000)

### Build the model

In [43]:
filter_length = 300
max_words = 5000
num_classes = len(y[0])
maxlen = 180

In [44]:
model = Sequential()
model.add(Embedding(max_words, 20, input_length=maxlen))
model.add(Dropout(0.1))
model.add(Conv1D(filter_length, 3, padding='valid', activation='relu', strides=1))
model.add(GlobalMaxPool1D())
model.add(Dense(num_classes))
model.add(Activation('sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['categorical_accuracy'])
model.summary()

callbacks = [
    ReduceLROnPlateau(),
    EarlyStopping(patience=4),
    ModelCheckpoint(filepath='model-conv1d.h5', save_best_only=True)
]

history = model.fit(x_train, y_train,
                    epochs=20,
                    batch_size=32,
                    validation_split=0.1,
                    callbacks=callbacks)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 180, 20)           100000    
_________________________________________________________________
dropout_2 (Dropout)          (None, 180, 20)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 178, 300)          18300     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 300)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 9)                 2709      
_________________________________________________________________
activation_2 (Activation)    (None, 9)                 0         
Total params: 121,009
Trainable params: 121,009
Non-trainable params: 0
_________________________________________________________________
Trai

In [58]:
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
metrics = model.evaluate(x_test, y_test)
print("{}: {}".format(model.metrics_names[0], metrics[0]))
print("{}: {}".format(model.metrics_names[1], metrics[1]))

loss: 0.543176949024
categorical_accuracy: 0.333333343267


In [59]:
ynew = model.predict(x_test)

In [60]:
ynew

array([[0.3857464 , 0.37086114, 0.39092505, 0.3832836 , 0.4091908 ,
        0.38083246, 0.40811855, 0.4004858 , 0.40568805],
       [0.3952547 , 0.37878373, 0.39688212, 0.39079547, 0.4161353 ,
        0.3832267 , 0.4098271 , 0.40636238, 0.40636778],
       [0.38985646, 0.38089186, 0.38993537, 0.39470503, 0.40452087,
        0.38208658, 0.41737264, 0.40877762, 0.40727186],
       [0.3871193 , 0.37060234, 0.391263  , 0.383628  , 0.40789235,
        0.37278503, 0.40518707, 0.39926824, 0.40043935],
       [0.37926745, 0.3645649 , 0.38463914, 0.38096705, 0.40286848,
        0.37256235, 0.40008447, 0.39397794, 0.3967126 ],
       [0.3915453 , 0.38214314, 0.3936375 , 0.39702645, 0.40503457,
        0.3832752 , 0.41460523, 0.40787268, 0.40843552]], dtype=float32)

In [29]:
ynew.argmax(axis=-1)

array([5, 5, 5, 5, 5, 5])

In [33]:
y_test.argmax(axis=-1)

array([5, 2, 2, 2, 4, 5])