In [1]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Dropout, Conv1D
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.losses import binary_crossentropy
from keras.optimizers import Adam
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.models import model_from_json
import keras
import json

Using TensorFlow backend.


### Load the category file

The category file contains all the category classes we designed. Right now they are "Concerts", "Sports", "Shows", "Comedy", "Art", "Nightlife", "Family", "Professional", and "Food&Drink".

In [30]:
# Read the list of category names that defines the label space.
with open("categories.json", "r") as categories_file:
    categories = json.load(categories_file)
print(categories)

[u'Concerts', u'Sports', u'Shows', u'Comedy', u'Art', u'Nightlife', u'Family', u'Professional', u'Food&Drink']


### Encode event label

#### Train on events with a single label

In [31]:
# Show the category order used by the label encoder: integer label i
# corresponds to classes_[i].
# NOTE(review): `encoder` is only created in the cell below, so this cell
# raises NameError when the notebook is run top to bottom on a fresh kernel.
encoder.classes_

array([u'Art', u'Comedy', u'Concerts', u'Family', u'Food&Drink',
       u'Nightlife', u'Professional', u'Shows', u'Sports'], dtype='<U12')

In [33]:
# Load the hand-labelled events and turn them into (text, one-hot label) pairs.
with open("LabeledEvent.json", "r") as fp:
    train = json.load(fp)

# Keep only events that carry every field we need. Filtering the texts and the
# labels with the SAME condition guarantees that x[i] and y[i] describe the
# same event; filtering them independently lets the two lists drift out of
# step (or differ in length) as soon as one event is missing a field.
required_fields = ('description', 'title', 'primaryCategory')
labeled_events = [event for event in train
                  if all(field in event for field in required_fields)]
x = [event['description'] + " " + event['title'] for event in labeled_events]
labels = [event['primaryCategory'] for event in labeled_events]
print(labels)

# Fit on the full category list (not just the labels seen here) so the integer
# codes are stable regardless of which categories this data set contains.
encoder = LabelEncoder()
encoder.fit(categories)
encoded_y = encoder.transform(labels)
print(encoded_y)

# Convert integers to dummy variables (i.e. one-hot encoded). num_classes is
# given explicitly so the matrix always has one column per known category,
# even when the highest-numbered category does not occur in the data.
y = np_utils.to_categorical(encoded_y, num_classes=len(encoder.classes_))
print(y)

[u'Food&Drink', u'Professional', u'Food&Drink', u'Sports', u'Shows', u'Sports', u'Food&Drink', u'Food&Drink', u'Food&Drink', u'Professional', u'Family', u'Art', u'Professional', u'Nightlife', u'Sports', u'Food&Drink', u'Shows', u'Shows', u'Shows', u'Shows', u'Professional', u'Shows', u'Professional', u'Professional', u'Professional', u'Art', u'Professional', u'Nightlife', u'Shows']
[4 6 4 8 7 8 4 4 4 6 3 0 6 5 8 4 7 7 7 7 6 7 6 6 6 0 6 5 7]
[[0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0

#### Train on events with multiple labels

In [12]:
# with open("train_fake_multiLabel.json","r") as fp:
#     train = json.load(fp)
# x = [event['text'] for event in train]
# y = [event['category'] for event in train]
# print y
# mlb = MultiLabelBinarizer()
# y = mlb.fit_transform(y)
# print y

#### Tokenize event text

In [13]:
# Turn each event text into a sequence of word indices. The texts are
# lower-cased and the vocabulary is capped through num_words=5000.
tokenizer = Tokenizer(lower=True, num_words=5000)
tokenizer.fit_on_texts(x)
sequences = tokenizer.texts_to_sequences(x)

# Bring every sequence to a length of 180, padding at the end ("post").
# NOTE: this rebinds `x` from raw texts to the padded index matrix, so the
# cell is not safe to run twice without re-running the loading cell first.
x = pad_sequences(sequences, padding="post", maxlen=180)

In [14]:
# Hold out 20% of the events for testing; the fixed random_state keeps the
# split the same from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=9000
)

### Build the model

In [15]:
# Text-shape settings; these repeat the values given to Tokenizer(num_words=...)
# and pad_sequences(maxlen=...) in the tokenizing cell and must stay in sync.
max_words = 5000
maxlen = 180

# Despite its name, this value is passed to Conv1D as its first argument, i.e.
# it sets the number of convolution filters; the kernel width is the literal 3
# in the model cell.
filter_length = 300

# One output unit per column of the label matrix.
num_classes = y.shape[1]

In [16]:
# Layer stack: embedding -> light dropout -> 1-D convolution -> max over the
# sequence axis -> one sigmoid unit per category.
model = Sequential([
    Embedding(max_words, 20, input_length=maxlen),
    Dropout(0.1),
    Conv1D(filter_length, 3, padding='valid', activation='relu', strides=1),
    GlobalMaxPool1D(),
    Dense(num_classes),
    Activation('sigmoid'),
])

# NOTE(review): sigmoid + binary_crossentropy scores every category
# independently, which suits the multi-label variant of this notebook. For the
# single-label (one-hot) targets used here, softmax + categorical_crossentropy
# may be the better fit -- confirm which setup is intended.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['categorical_accuracy'])
model.summary()

# Lower the learning rate on plateaus, stop after 4 epochs without improvement,
# and keep only the best model seen so far on disk.
callbacks = [
    ReduceLROnPlateau(),
    EarlyStopping(patience=4),
    ModelCheckpoint(filepath='model-conv1d.h5', save_best_only=True),
]

# A tenth of the training split is set aside as validation data for the
# callbacks above.
history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=20,
    validation_split=0.1,
    callbacks=callbacks
)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 180, 20)           100000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 180, 20)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 178, 300)          18300     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 300)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 9)                 2709      
_________________________________________________________________
activation_1 (Activat

In [17]:
# Write the model's JSON description to disk.
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)

# Score the held-out split, then report the loss and the first metric by name.
metrics = model.evaluate(x_test, y_test)
for position in (0, 1):
    print("{}: {}".format(model.metrics_names[position], metrics[position]))

loss: 0.528395414352
categorical_accuracy: 0.166666671634


In [18]:
# Per-category sigmoid scores for the held-out events: one row per event,
# one column per category.
ynew = model.predict(x_test)

In [19]:
# Display the raw score matrix.
ynew

array([[0.37227684, 0.39408594, 0.33480525, 0.36833304, 0.38900763,
        0.38610333, 0.42702228, 0.41262126, 0.38348436],
       [0.37336844, 0.40004545, 0.34451497, 0.37249756, 0.39626682,
        0.38671583, 0.42649543, 0.41762152, 0.3903542 ],
       [0.36213008, 0.39145258, 0.32841083, 0.35693842, 0.38263917,
        0.3783453 , 0.4223026 , 0.41239458, 0.38675642],
       [0.36308464, 0.39057487, 0.32815558, 0.3556372 , 0.38613915,
        0.37761945, 0.4187616 , 0.4105157 , 0.383396  ],
       [0.36181006, 0.39086872, 0.32526773, 0.36091882, 0.3870321 ,
        0.3809524 , 0.4201624 , 0.4061485 , 0.38327914],
       [0.36510363, 0.40174896, 0.33261853, 0.36529005, 0.3907317 ,
        0.38393298, 0.42947716, 0.41755944, 0.39168057]], dtype=float32)

In [26]:
# Rank the categories of each event by descending score (argsort of the
# negated scores) and keep the column indices of the three highest.
y_pred = (-ynew).argsort(axis=-1)[:,:3] 

In [21]:
# Recover each test event's true class index from its one-hot label row.
y_test.argmax(axis=-1)

array([7, 4, 4, 4, 6, 7])

In [22]:
# Top-1 predicted class index per test event, computed the same way as the
# true class indices (argmax over the category axis). This avoids
# `predict_classes`, a deprecated Sequential-only helper that does exactly this
# argmax for models with more than one output unit.
model.predict(x_test).argmax(axis=-1)

array([6, 6, 6, 6, 6, 6])

In [27]:
# Translate each event's top-3 class indices back into category names.
# The loop variable is deliberately NOT called `y`: under Python 2 a list
# comprehension's loop variable leaks into the enclosing scope, so reusing `y`
# would silently overwrite the one-hot label matrix `y` built earlier.
[encoder.inverse_transform(top_indices) for top_indices in y_pred]

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


[array([u'Professional', u'Shows', u'Comedy'], dtype='<U12'),
 array([u'Professional', u'Shows', u'Comedy'], dtype='<U12'),
 array([u'Professional', u'Shows', u'Comedy'], dtype='<U12'),
 array([u'Professional', u'Shows', u'Comedy'], dtype='<U12'),
 array([u'Professional', u'Shows', u'Comedy'], dtype='<U12'),
 array([u'Professional', u'Shows', u'Comedy'], dtype='<U12')]

In [29]:
# Show the scikit-learn version this notebook was run with.
# NOTE(review): this is the only import outside the first cell; consider moving
# it up with the other imports.
import sklearn
sklearn.__version__

'0.18.1'