In [1]:
import collections
import os

import matplotlib.pyplot as plt
import nltk
import numpy as np
from keras.layers.core import Activation, Dense, Dropout, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split

Using TensorFlow backend.
  return f(*args, **kwds)


# Exploration

In [7]:
# Directory holding the input corpus, given relative to the notebook's
# working directory; it is joined with the training file name when reading.
DATA_DIR = "../../data"

In [11]:
# First pass over the training file: gather corpus statistics.
#   maxlen     -- token count of the longest sentence
#   word_freqs -- frequency of every lower-cased token
#   n_recs     -- number of records (lines) in the file
# Each line is expected to be "<label>\t<sentence>".
maxlen = 0
word_freqs = collections.Counter()
n_recs = 0
# The context manager closes the file even if parsing or tokenizing raises.
with open(os.path.join(DATA_DIR, 'umich-sentiment-train.txt'), 'r') as ftrain:
    for line in ftrain:
        # Split on the first tab only, so a tab inside the sentence text
        # cannot break the two-way unpacking.
        label, sentence = line.strip().split('\t', 1)
        words = nltk.word_tokenize(sentence.lower())
        maxlen = max(maxlen, len(words))
        word_freqs.update(words)
        n_recs += 1

In [12]:
# Report the longest sentence length (in tokens) and the number of distinct
# tokens seen, one value per line.
print(maxlen, len(word_freqs), sep='\n')

42
2328


In [13]:
# MAX_FEATURES caps how many of the most frequent tokens receive their own
# index; MAX_SENTENCE_LEN is the fixed length every encoded sentence is
# padded (or cut) to by pad_sequences.
MAX_FEATURES, MAX_SENTENCE_LEN = 2000, 40

In [14]:
# Build the token <-> index lookup tables.
# Indices 0 and 1 are reserved for the 'PAD' and 'UNK' pseudo-tokens, so the
# most frequent real token gets index 2, the next one 3, and so on.
vocab_size = 2 + min(MAX_FEATURES, len(word_freqs))
word2index = dict(
    (token, rank + 2)
    for rank, (token, _count) in enumerate(word_freqs.most_common(MAX_FEATURES))
)
word2index['PAD'] = 0
word2index['UNK'] = 1
# Reverse mapping: index -> token.
index2word = dict((idx, token) for token, idx in word2index.items())

In [15]:
# Pre-allocate one slot per record: X holds Python lists of token indices
# (object array), y holds the numeric labels. i is the write cursor.
X = np.empty(n_recs, dtype=object)
y = np.zeros(n_recs)
i = 0

In [17]:
# Second pass over the training file: encode every sentence as a list of
# vocabulary indices and collect its label.
#
# The sequences and labels are accumulated in fresh local lists instead of
# being written into pre-allocated arrays through an external counter. This
# keeps the cell idempotent: it can be re-run on its own without an
# IndexError from a stale counter or from X already having been replaced by
# the padded matrix.
sequences = []
labels = []
unk_index = word2index['UNK']
# The context manager closes the file even if parsing or tokenizing raises.
with open(os.path.join(DATA_DIR, 'umich-sentiment-train.txt'), 'r') as ftrain:
    for line in ftrain:
        # Split on the first tab only, so a tab inside the sentence text
        # cannot break the two-way unpacking.
        label, sentence = line.strip().split('\t', 1)
        words = nltk.word_tokenize(sentence.lower())
        # Tokens outside the vocabulary map to the 'UNK' index.
        sequences.append([word2index.get(word, unk_index) for word in words])
        labels.append(int(label))

# Pad/cut every sequence to exactly MAX_SENTENCE_LEN entries.
X = sequence.pad_sequences(sequences, maxlen=MAX_SENTENCE_LEN)
# Float vector of labels, one per record (same dtype as np.zeros would give).
y = np.array(labels, dtype=np.float64)

In [18]:
# Show the first encoded sentence after padding; index 0 is the 'PAD' entry
# of word2index.
X[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   5,  10,   9,  12, 101,  17,  48,  22,
         4], dtype=int32)

In [19]:
# Hold out 20% of the records for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
)