In [0]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from IPython.display import display
import tensorflow as tf

## Neural Bigram
- Instead of counting, use neuron to model $p(w_t|w_{t-1})$
- 1 neuron = 1 logistic regression

### Word Representation
- Logistic Regression only work on number
- Must represent the words by vectors -> **One-hot encoding**. Eg:
    + 'a' = [1,0,0,0,...,0]
    + 'an' = [0,1,0,0,...,0]
    + ...
    + 'zoo' = [0,0,0,0,...,1]
    
### Logistic Regression Bigram
$$p(y|x) = softmax(W^Tx)$$
- x = last word
    + N*D matrix
    + D = # of input features
- y = current word
    + N*K matrix
    + K = # of output classes
- K = D = V: vocab size 

#### Softmax
$$\sigma(z)_j = \frac{e^{z_j}}{\sum_ke^{z_k}}$$

In [12]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

# Example
z = np.array([1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0])
softmax(z)

array([0.02364054, 0.06426166, 0.1746813 , 0.474833  , 0.02364054,
       0.06426166, 0.1746813 ])

## Dataset - Brown Corpus

In [0]:
from nltk.corpus import brown
import operator

KEEP_WORDS = set([
    'king', 'man', 'queen', 'woman',
    'italy', 'rome', 'france', 'paris',
    'london', 'britain', 'england',
])

def get_sentences_with_word2idx_limit_vocab(n_vocab=2000, keep_words=KEEP_WORDS):
    # returns 57340 of the Brown corpus
    # each sentence is represented as a list of individual string tokens
    sentences = brown.sents()
    indexed_sentences = []

    i = 2
    word2idx = {'START': 0, 'END': 1}
    idx2word = ['START', 'END']

    word_idx_count = {
        0: float('inf'),
        1: float('inf'),
    }

    for sentence in sentences:
        indexed_sentence = []
        for token in sentence:
            token = token.lower()
            if token not in word2idx:
                idx2word.append(token)
                word2idx[token] = i
                i += 1

            # keep track of counts for later sorting
            idx = word2idx[token]
            word_idx_count[idx] = word_idx_count.get(idx, 0) + 1

            indexed_sentence.append(idx)
        indexed_sentences.append(indexed_sentence)

    # restrict vocab size

    # set all the words I want to keep to infinity
    # so that they are included when I pick the most
    # common words
    for word in keep_words:
        word_idx_count[word2idx[word]] = float('inf')

    sorted_word_idx_count = sorted(
        word_idx_count.items(),
        key=operator.itemgetter(1),
        reverse=True)
    word2idx_small = {}
    new_idx = 0
    idx_new_idx_map = {}
    for idx, count in sorted_word_idx_count[:n_vocab]:
        word = idx2word[idx]

        word2idx_small[word] = new_idx
        idx_new_idx_map[idx] = new_idx
        new_idx += 1

    # let 'unknown' be the last token
    word2idx_small['UNKNOWN'] = new_idx 
    unknown = new_idx

    assert('START' in word2idx_small)
    assert('END' in word2idx_small)
    for word in keep_words:
        assert(word in word2idx_small)

    # map old idx to new idx
    sentences_small = []
    for sentence in indexed_sentences:
        if len(sentence) > 1:
            new_sentence = [
                idx_new_idx_map[idx] if idx in idx_new_idx_map 
                else unknown 
                    for idx in sentence]
            sentences_small.append(new_sentence)

    return sentences_small, word2idx_small

#### Load dataset

In [4]:
sentences, word2idx = get_sentences_with_word2idx_limit_vocab(2000)

# vocab size
V = len(word2idx)

display(sentences[10])
print('Number of sentences:', len(sentences))
print('Vocab size:', V)

[28, 2000, 21, 13, 249, 26, 172, 893, 18, 2000, 27, 38, 315, 15]

Number of sentences: 57013
Vocab size: 2001


## Logistic Regression Model

#### Feature Engineering

In [5]:
# Create dataframe
data = pd.DataFrame()
data['sentences'] = sentences

# Add start, end char
start_idx = word2idx['START']
end_idx = word2idx['END']

data['sentences'] = data['sentences'].apply(
    lambda sentence: np.array([start_idx] + sentence + [end_idx]))

# One hot encoding
from keras.utils import to_categorical

data['sentences_cat'] = data['sentences'].apply(
    lambda sentence: to_categorical(sentence, num_classes=V))

# Trainning data
data['X'] = data['sentences_cat'].apply(
    lambda sentence: sentence[:len(sentence)-1])

data['y'] = data['sentences_cat'].apply(
    lambda sentence: sentence[1:])

# A sentence: 27 words, 1 word = one-hot encoded
display(data['X'][0].shape)
display(data['y'][0].shape)

Using TensorFlow backend.


(26, 2001)

(26, 2001)

In [0]:
X = data['X'].values
y = data['y'].values

In [2]:
# Variables
W_init = np.random.randn(V, V) / np.sqrt(V)
W = tf.Variable(W_init.astype(np.float32))

# Placeholders
X_ph = tf.placeholder(tf.float32)
y_ph = tf.placeholder(tf.float32)

# Graph
y_pred = tf.nn.softmax(tf.matmul(X_ph, W))

# Loss function
error = tf.reduce_sum(
    tf.square(y_ph - y_pred))

# Optimizer
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1)

# Train Function
train = optimizer.minimize(error)

NameError: name 'np' is not defined

In [11]:
# Init global variables
init = tf.global_variables_initializer()

with tf.Session() as sess:
    # Init variables
    sess.run(init)
    
    # Train
    epochs = 1
    for i in range(epochs):
        X_batch = X
        y_batch = y
        
        # train
        feed = {
            X_ph: X_batch,
            y_ph: y_batch}
        
        sess.run(train,feed_dict=feed)

ValueError: ignored