In [None]:
import io
import os
import sys

import numpy as np
from backports import csv
from keras.layers import Conv1D, Dense, Embedding, Flatten, Input, MaxPooling1D
# Raise the csv module's per-field size limit to sys.maxsize so that rows
# holding very long text fields can still be read.
csv.field_size_limit(sys.maxsize)


def get_texts_and_targets(filename):
    """Load texts and their numeric targets from a UTF-8 encoded CSV file.

    The first row is treated as a header and skipped. In every other row,
    column 1 holds the text and columns 2 onward hold the target values;
    column 0 is ignored. Blank lines are skipped.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A ``(texts, targets)`` tuple of equal-length lists. Each text has its
        surrounding whitespace stripped and every non-ASCII character replaced
        by ``?``. Each target is a 1-D float ``np.ndarray``.

    Raises:
        IndexError: If a non-blank row has no text column.
        ValueError: If a target cell cannot be parsed as a float.
    """
    texts = []
    targets = []

    # newline='' leaves newline handling to the csv reader, which the csv
    # documentation requires for quoted fields that contain line breaks.
    with io.open(filename, encoding='utf-8', newline='') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Header row
        for row in reader:
            if not row:
                # A blank line is returned as an empty list; nothing to load.
                continue
            # Replace non-ASCII characters, then decode back so the result is
            # text rather than bytes on Python 3.
            text = row[1].strip().encode('ascii', 'replace').decode('ascii')
            texts.append(text)
            targets.append(np.array([float(x) for x in row[2:]]))
    print("Total number of texts: %s" % len(texts))
    return texts, targets

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# Max number of input words in any sample
MAX_SEQUENCE_LENGTH = 200
# Fraction of the samples held out for validation
VALIDATION_SPLIT = 0.1
def get_datasets(texts, targets, tokenizer=None):
    """Tokenize the texts and split them into shuffled train/validation sets.

    Args:
        texts: Sequence of input texts.
        targets: Sequence of per-text target vectors, in the same order as
            ``texts``.
        tokenizer: Optional tokenizer that is already fitted. When omitted, a
            new ``Tokenizer`` is created and fitted on ``texts``.

    Returns:
        A tuple ``(tokenizer, word_index, x_train, y_train, x_val, y_val)``.
        ``x_*`` are the index sequences after ``pad_sequences`` with
        ``maxlen=MAX_SEQUENCE_LENGTH``; ``y_*`` are the matching targets. The
        last ``int(VALIDATION_SPLIT * n_samples)`` shuffled samples form the
        validation set, which is empty when that count is zero.
    """
    if tokenizer is None:
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(texts)

    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    targets = np.asarray(targets)

    # Shuffle samples and targets with the same permutation (uses NumPy's
    # global random state).
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    data = data[indices]
    targets = targets[indices]

    nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
    # Split on a positive index: with zero validation samples, slicing by
    # ``-0`` would yield an empty training set and put everything into the
    # validation set.
    nb_train_samples = data.shape[0] - nb_validation_samples

    x_train = data[:nb_train_samples]
    y_train = targets[:nb_train_samples]
    x_val = data[nb_train_samples:]
    y_val = targets[nb_train_samples:]

    return tokenizer, word_index, x_train, y_train, x_val, y_val

In [None]:
from gensim.models import KeyedVectors
def load_glove_model():
    """Load the pretrained vectors file ``word2vec_twitter_glove.txt``.

    The file is looked up inside ``WORD2VEC_FOLDER`` and parsed as a
    non-binary (text) word2vec-format file.
    NOTE(review): ``WORD2VEC_FOLDER`` is not defined in the visible part of
    this notebook -- confirm it is set before this function is called.

    Returns:
        The object returned by ``KeyedVectors.load_word2vec_format``.
    """
    vectors_path = os.path.join(WORD2VEC_FOLDER, 'word2vec_twitter_glove.txt')
    return KeyedVectors.load_word2vec_format(vectors_path, binary=False)

In [None]:
def get_embedding_layer(word_index, gensim_model):
    """Build a trainable ``Embedding`` layer initialised from pretrained vectors.

    Args:
        word_index: Mapping from word to integer row index. Indices are
            assumed to lie in ``1..len(word_index)`` (row 0 is left as zeros)
            -- TODO confirm against the tokenizer in use.
        gensim_model: Either a gensim ``KeyedVectors`` object or a model that
            exposes its vectors through a ``wv`` attribute.

    Returns:
        An ``Embedding`` layer with ``len(word_index) + 1`` rows whose initial
        weights are the pretrained vectors for the words found in
        ``gensim_model``; rows of words that are not found stay all-zero.
    """
    # Accept a full model (vectors under ``.wv``) as well as bare KeyedVectors,
    # which no longer expose ``.wv`` in recent gensim releases.
    vectors = getattr(gensim_model, 'wv', gensim_model)
    # Read the dimensionality directly rather than probing a specific word,
    # which fails when that word is missing from the vocabulary.
    embedding_dim = vectors.vector_size
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        # Membership test works across gensim versions, unlike ``.vocab``.
        if word in vectors:
            embedding_matrix[i] = vectors[word]
    embedding_layer = Embedding(len(word_index) + 1,
            embedding_dim,
            weights=[embedding_matrix],
            input_length=MAX_SEQUENCE_LENGTH,
            trainable=True)
    return embedding_layer

In [None]:
from keras.models import Model
# Number of units in the final (sigmoid) output layer
N_TARGET_CLASSES = 6
def get_convnet_model(embedding_layer):
    """Assemble the 1-D convolutional network on top of ``embedding_layer``.

    The network takes ``MAX_SEQUENCE_LENGTH`` int32 indices, embeds them,
    applies two Conv1D(128, 5, relu) + MaxPooling1D(5) stages, flattens,
    passes through Dense(128) and Dense(64) relu layers and ends in a
    sigmoid Dense layer with ``N_TARGET_CLASSES`` units.

    Args:
        embedding_layer: Layer applied to the integer input sequence.

    Returns:
        The (uncompiled) ``Model`` mapping the input sequence to predictions.
    """
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    features = embedding_layer(sequence_input)

    # Two identical convolution + pooling stages.
    for _ in range(2):
        features = Conv1D(128, 5, activation='relu')(features)
        features = MaxPooling1D(5)(features)

    features = Flatten()(features)

    # Fully connected head.
    for n_units in (128, 64):
        features = Dense(n_units, activation='relu')(features)
    preds = Dense(N_TARGET_CLASSES, activation='sigmoid')(features)

    return Model(sequence_input, preds)


In [None]:
# Mini-batch size shared by training and evaluation
BATCH_SIZE = 32

texts, targets = get_texts_and_targets('train.csv')
tokenizer, word_index, x_train, y_train, x_val, y_val = get_datasets(texts, targets)
# load_glove_model() is the vector loader defined in this notebook.
word2vec = load_glove_model()
embedding_layer = get_embedding_layer(word_index, word2vec)
model = get_convnet_model(embedding_layer)
model.compile(loss='binary_crossentropy',
        optimizer='adagrad',
        metrics=['accuracy'])
model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=2, batch_size=BATCH_SIZE, verbose=1)

# No separate test split is created above: the numbers printed below are
# computed on the validation split.
score = model.evaluate(x_val, y_val,
                       batch_size=BATCH_SIZE)

print('Test score:', score[0])
print('Test accuracy:', score[1])