In [2]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Dropout, Conv1D
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.losses import binary_crossentropy
from keras.optimizers import Adam
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import keras
import json

Using TensorFlow backend.


### Load the category file

The category file contains all the category classes we designed. Right now they are "concerts",
 "sports",
 "shows",
 "comedy",
 "art",
 "nightlife",
 "family",
 "professional",
 and "food&drink".

In [6]:
# Load the list of category names the classifier is designed around.
with open("categories.json", "r") as fp:
    categories = json.load(fp)
# Call-style print behaves the same on Python 2 for a single argument and
# keeps the cell runnable on Python 3.
print(categories)

[u'concerts', u'sports', u'shows', u'comedy', u'art', u'nightlife', u'family', u'professional', u'food&drink']


### Encode event label

#### Train on events with a single label

In [17]:
# Load the labelled events used for single-label training.
with open("LabeledEvent.json", "r") as fp:
    train = json.load(fp)

# Keep only events that carry every field we need. Filtering once (instead of
# filtering the texts and the labels independently) guarantees that x[i] and
# y[i] always come from the same event and that both lists have equal length.
required_keys = ('description', 'title', 'primaryCategory')
labeled_events = [event for event in train
                  if all(key in event for key in required_keys)]
x = [event['description'] + " " + event['title'] for event in labeled_events]
y = [event['primaryCategory'] for event in labeled_events]
print(y)

# Map every category string to an integer id ...
encoder = LabelEncoder()
encoded_y = encoder.fit_transform(y)
# ... then convert the integers to dummy variables (i.e. one hot encoded).
# NOTE: `y` is rebound here from the raw label strings to the one-hot matrix.
y = np_utils.to_categorical(encoded_y)
print(y)

[u'Food&Drink', u'Professional', u'Food&Drink', u'Sports', u'Shows', u'Sports', u'Food&Drink', u'Food&Drink', u'Food&Drink', u'Professional', u'Family', u'Art', u'Professional', u'Nightlife', u'Sports', u'Food&Drink', u'Shows', u'Shows', u'Shows', u'Shows', u'Professional', u'Shows', u'Professional', u'Professional', u'Professional', u'Art', u'Professional', u'Nightlife', u'Shows']
[[0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0

#### Train on events with multiple labels

In [129]:
# Load the multi-label training events; every event is indexed by its 'text'
# and 'category' keys, where 'category' holds a list of labels.
with open("train_fake_multiLabel.json", "r") as fp:
    train = json.load(fp)
x = [event['text'] for event in train]
y = [event['category'] for event in train]
print(y)

# Binarize the label lists: one row per event, one 0/1 column per distinct
# category. NOTE: `y` is rebound from the label lists to that indicator matrix.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y)
print(y)

[[u'concerts', u'art'], [u'art']]
[[1 1]
 [1 0]]


#### Tokenize event text

In [18]:
# Fit a lower-cased word index on the event texts, capped at 5000 words.
tokenizer = Tokenizer(lower=True, num_words=5000)
tokenizer.fit_on_texts(x)

# Convert each text to a list of word ids, then pad at the end to length 180.
# NOTE: this rebinds `x` from the raw strings to the padded id matrix, so the
# cell is not idempotent -- re-run the cell that loads the texts first.
sequences = tokenizer.texts_to_sequences(x)
x = pad_sequences(sequences, padding="post", maxlen=180)

In [20]:
# Hold out 20% of the samples for the final evaluation; the fixed random_state
# makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    random_state=9000,
    test_size=0.2,
)

### Build the model

In [21]:
# Hyper-parameters consumed when the model is built.
maxlen = 180             # length of each input sequence (Embedding input_length)
max_words = 5000         # vocabulary size handed to the Embedding layer
filter_length = 300      # passed as Conv1D's first argument, i.e. the number
                         # of convolution filters (despite the name)
num_classes = len(y[0])  # width of one encoded label row

In [22]:
# Text CNN: embedding -> dropout -> 1D convolution -> global max pool -> dense
# layer with one sigmoid output per class.
model = Sequential([
    Embedding(max_words, 20, input_length=maxlen),
    Dropout(0.1),
    Conv1D(filter_length, 3, strides=1, activation='relu', padding='valid'),
    GlobalMaxPool1D(),
    Dense(num_classes),
    Activation('sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['categorical_accuracy'],
)
model.summary()

# Training-time helpers: learning-rate reduction on plateau, early stopping
# with a patience of 4 epochs, and a checkpoint that only keeps the best model.
callbacks = [
    ReduceLROnPlateau(),
    EarlyStopping(patience=4),
    ModelCheckpoint(save_best_only=True, filepath='model-conv1d.h5'),
]

# 10% of the training split is set aside for validation during fitting.
history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=20,
    callbacks=callbacks,
    validation_split=0.1,
)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 180, 20)           100000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 180, 20)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 178, 300)          18300     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 300)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 7)                 2107      
_________________________________________________________________
activation_1 (Activat

In [23]:
# Reload the checkpointed model from disk and score it on the held-out split.
cnn_model = keras.models.load_model('model-conv1d.h5')
metrics = cnn_model.evaluate(x_test, y_test)

# Read the metric names from the model that was actually evaluated, so this
# cell does not rely on the in-memory training `model` still being defined
# (e.g. after a kernel restart where only the checkpoint is reloaded).
for metric_name, metric_value in zip(cnn_model.metrics_names, metrics):
    print("{}: {}".format(metric_name, metric_value))

loss: 0.544360339642
categorical_accuracy: 0.333333343267


In [27]:
# Score every held-out sample with the reloaded model; one row of per-class
# outputs is produced per sample.
ynew = cnn_model.predict(x=x_test)

In [28]:
# Show the raw prediction scores returned by predict().
ynew

array([[0.38057137, 0.37495065, 0.38566658, 0.36116973, 0.37529027,
        0.4082635 , 0.3800807 ],
       [0.3855164 , 0.37498653, 0.38796788, 0.36193812, 0.38027138,
        0.40841955, 0.38636413],
       [0.38745114, 0.37520587, 0.38530108, 0.36852673, 0.37690982,
        0.39931256, 0.37685555],
       [0.37637684, 0.37080765, 0.38281313, 0.35633585, 0.3722051 ,
        0.40155664, 0.37588236],
       [0.37531966, 0.36577433, 0.3820672 , 0.34968072, 0.36579782,
        0.39559516, 0.3784711 ],
       [0.39060193, 0.3876232 , 0.39640853, 0.37898305, 0.38716996,
        0.40427104, 0.38655668]], dtype=float32)

In [29]:
# Index of the highest-scoring class for each sample. `ynew` is 2-D
# (samples x classes), so axis 1 is the class axis.
ynew.argmax(axis=1)

array([5, 5, 5, 5, 5, 5])