In [1]:
from __future__ import print_function

import io
import os
import sys

import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model

from data_loader import load_tweets, load_pretrained_model
from plot_fit import plot_fit, visialize_model, save_history, plot_all_history


Using TensorFlow backend.


In [2]:
# Shared configuration. The star import is kept as-is because the names it
# exports are not visible from this notebook; glove_path (used below) is
# presumably one of them -- TODO confirm and switch to explicit imports.
from settings import *

# MAX_SEQUENCE_LENGTH = 1000
# MAX_NUM_WORDS = 20000
# EMBEDDING_DIM = 100

# First, build an index mapping each word in the pretrained embedding file
# to its embedding vector.

# 1. load pretrained embedding
embedding_name = 'glove'
print('Indexing %s word vectors.' % embedding_name)


embeddings_index = {}
# Read with an explicit encoding instead of the platform's locale default, so
# non-ASCII tokens decode the same way on every OS. io.open accepts `encoding`
# on both Python 2 and Python 3.
# NOTE(review): assumes the vector file is UTF-8 text -- confirm for glove_path.
with io.open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line (e.g. a trailing newline at end of file): nothing to index.
            continue
        # Each line is parsed as: first token = word, remaining tokens = coefficients.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.


In [3]:
# load data
(X_train, y_train), (X_val, y_val) = load_tweets()

# One-hot encode both label sets with the SAME width. Calling to_categorical
# on each split separately lets it infer the width per split, so a validation
# set that happens to lack the highest class id would come out narrower than
# the training labels and no longer match the model output.
# NOTE(review): assumes labels are non-negative integer class ids -- confirm
# against load_tweets.
y_train = np.asarray(y_train)
y_val = np.asarray(y_val)
num_classes = int(max(y_train.max(), y_val.max())) + 1
y_train = to_categorical(y_train, num_classes=num_classes)
y_val = to_categorical(y_val, num_classes=num_classes)

# tokenize, filter punctuation, lowercase
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, lower=True)

# The vocabulary is fitted on the training texts only; the validation texts
# are encoded with that same vocabulary further down.
tokenizer.fit_on_texts(X_train)
vocarb_size = len(tokenizer.word_index) + 1
print("%d word types" % len(tokenizer.word_index))

# encoding method 0 : Tokenizer.texts_to_sequence
# ========================
train_seq = tokenizer.texts_to_sequences(X_train)
# print(len(encoded_text))

word_index = tokenizer.word_index

# print('index len:', len(word_index))
# Pad the encoded training texts to MAX_SEQUENCE_LENGTH.
train_pad_seq = pad_sequences(sequences=train_seq, maxlen=MAX_SEQUENCE_LENGTH)

# pad val sequence
val_seq = tokenizer.texts_to_sequences(X_val)
val_pad_seq = pad_sequences(sequences=val_seq, maxlen=MAX_SEQUENCE_LENGTH)

# labels = to_categorical(np.asarray(y_train))
print("padding sequnce(X_input) shape:", train_pad_seq.shape)
# print("target(y_train) shape:", labels.shape)
print('-' * 80)

# Embedding matrix: row i holds the pretrained vector of the word whose
# tokenizer index is i. Rows stay all-zero for words that have no pretrained
# vector (and for index 0, which word_index never assigns to a word here).
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    # Indices at or beyond MAX_NUM_WORDS have no row in the matrix.
    embedding_vector = embeddings_index.get(word) if i < MAX_NUM_WORDS else None
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

# subdir to save history
# NOTE(review): subdir and dropout_rate are only referenced by the
# commented-out call below within this cell; the network built here has no
# dropout layer.
subdir = 'CNN2layer_%s' % embedding_name

dropout_rate = 0

# count DNN
# run_CNN_pretrianed_embedding(train_pad_seq, y_train, val_pad_seq, y_val, embedding_matrix, dropout_rate,
#                              'pretrained_{}_CNN_hid2_1dropout{}.pdf'.format(embedding_name,
#                                                                             dropout_rate), subdir)

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

print('Training model.')

# Size the softmax layer from the one-hot training labels instead of a
# hard-coded 5, so the model keeps matching the data if the label set changes.
num_classes = y_train.shape[1]

# train a 1D convnet with global maxpooling:
# three Conv1D stages (128 filters, kernel size 5); the first two are followed
# by MaxPooling1D(5), the last by a global max pool, then a dense classifier.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(num_classes, activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

# Keep the History object returned by fit so the per-epoch metrics remain
# available after training (e.g. for the imported plotting/saving helpers --
# their expected arguments are not visible here, TODO confirm).
history = model.fit(train_pad_seq, y_train,
                    batch_size=128,
                    epochs=10,
                    validation_data=(val_pad_seq, y_val))



32023 word types
padding sequnce(X_input) shape: (35744, 200)
--------------------------------------------------------------------------------
Training model.


NameError: name 'labels_index' is not defined