In [297]:
import numpy as np
import tensorflow as tf
import pandas as pd
import nltk
nltk.download('punkt')
import sklearn as sk

# Relative paths to the input files.
# NOTE(review): WORD_VECTOR_PATH and VOCAB_PATH are not referenced by any cell
# visible in this notebook; only DATA_PATH is read (by the loading cell below).
WORD_VECTOR_PATH = "data/wordVectors.txt"
VOCAB_PATH = "data/vocab.txt"
DATA_PATH = "../data/primary_debates.csv"

[nltk_data] Downloading package punkt to /Users/ezshen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [298]:
def load_and_preprocess_sent(in_file):
    """Load the debate CSV and return one cleaned, labelled sample per sentence.

    Rows whose Speaker is AUDIENCE, OTHER, CANDIDATES or QUESTION are dropped.
    Each remaining Text is split into sentences with the punkt tokenizer; every
    sentence is lowercased and has a single trailing period removed.

    Args:
        in_file: anything accepted by ``pd.read_csv``; the CSV must provide
            ``Speaker``, ``Party`` and ``Text`` columns.

    Returns:
        A ``(sentences, labels)`` pair of equal-length NumPy arrays.
        ``sentences`` holds the cleaned sentence strings; ``labels`` holds 1
        where Party is 'Republican' and 0 for every other value.
    """
    df = pd.read_csv(in_file, quotechar='"', delimiter=",")

    # filter out bad speakers
    df = df[~df.Speaker.isin(['AUDIENCE', 'OTHER', 'CANDIDATES', 'QUESTION'])]

    # split text into sentences
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # Build the outputs directly. Assigning to the rows yielded by
    # DataFrame.iterrows() is not guaranteed to write back to the frame, so the
    # cleaning and label encoding could be silently discarded.
    sentences, labels = [], []
    for text, party in zip(df.Text, df.Party):
        for sentence in tokenizer.tokenize(text):
            sentence = sentence.lower()
            if sentence.endswith('.'):  # get rid of one trailing period
                sentence = sentence[:-1]
            sentences.append(sentence)
            labels.append(1 if party == 'Republican' else 0)  # Democratic = 0, Republican = 1

    return np.array(sentences, dtype=str), np.array(labels, dtype=int)

In [299]:
def load_and_preprocess_data(in_file):
    """Load the debate CSV and return a sentence-level DataFrame.

    Rows whose Speaker is AUDIENCE, OTHER, CANDIDATES or QUESTION are dropped
    and each remaining Text is split into sentences with the punkt tokenizer.
    Unlike ``load_and_preprocess_sent``, the sentence text is left untouched
    (original case and punctuation).

    Args:
        in_file: anything accepted by ``pd.read_csv``; the CSV must provide
            ``Speaker``, ``Party`` and ``Text`` columns.

    Returns:
        A DataFrame with columns ``Text`` (one sentence per row) and ``Party``
        (1 where the source row's Party is 'Republican', otherwise 0). The
        index holds string labels '0', '1', ...
    """
    df = pd.read_csv(in_file, quotechar='"', delimiter=",")

    # filter out bad speakers
    df = df[~df.Speaker.isin(['AUDIENCE', 'OTHER', 'CANDIDATES', 'QUESTION'])]

    # split text into sentences
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # Encode the label while building the records. Assigning to the rows
    # yielded by DataFrame.iterrows() is not guaranteed to write back.
    records = [
        (sentence, 1 if party == 'Republican' else 0)  # Democratic = 0, Republican = 1
        for text, party in zip(df.Text, df.Party)
        for sentence in tokenizer.tokenize(text)
    ]
    sentences = pd.DataFrame(records, columns=['Text', 'Party'])
    # Keep the string index labels the previous rename(index=str, ...) produced.
    sentences.index = sentences.index.map(str)
    return sentences

In [300]:
# x: array of cleaned sentence strings; y: matching array of 0/1 party labels
# (1 = Republican), as returned by load_and_preprocess_sent.
x, y = load_and_preprocess_sent(DATA_PATH)

In [301]:
from string import punctuation  # NOTE(review): not used by any cell visible here
# Flatten the corpus into a single whitespace-delimited token list for counting.
all_text = ' '.join(x)
words = all_text.split()

In [302]:
from collections import Counter

# Rank the vocabulary by corpus frequency, most frequent word first.
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
# Ids start at 1 so that 0 stays free for the padding value of the feature matrix.
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

# Encode every sentence in `x` as a list of word ids. Iterating `x` (the same
# array the vocabulary was built from) keeps row i aligned with label y[i].
temp_sent_ints = []
for sentence in x:
    temp_sent_ints.append([vocab_to_int[word] for word in sentence.split()])

In [303]:
from collections import Counter

# Histogram of sentence lengths in tokens. A generator expression with its own
# loop name is used on purpose: a Python 2 list comprehension looping over `x`
# would rebind the module-level `x` (the sentence array).
sent_lens = Counter(len(token_ids) for token_ids in temp_sent_ints)
seq_len = max(sent_lens) # set the sequence length to the longest sentence
print("Zero-length sent: {}".format(sent_lens[0]))
print("Maximum sent length: {}".format(max(sent_lens)))

Zero-length sent: 12
Maximum sent length: 112


In [304]:
# Drop sentences that encoded to zero tokens, keeping labels aligned with the
# surviving sentences; each kept sentence is capped at seq_len ids.
labels, sent_ints = [], []
for token_ids, party in zip(temp_sent_ints, y):
    if token_ids:
        sent_ints.append(token_ids[:seq_len])
        labels.append(party)

In [305]:
from collections import Counter

# Re-check the length histogram after filtering: no empty sentences should remain.
# The loop name is deliberately not `x`, so the module-level `x` is never rebound
# (a Python 2 list comprehension leaks its loop variable).
sent_lens = Counter(len(token_ids) for token_ids in sent_ints)
print("Zero-length sent: {}".format(sent_lens[0]))
# seq_len was set to the longest sentence length before filtering
print("Maximum sent length: {}".format(seq_len))

Zero-length sent: 0
Maximum sent length: 112


In [306]:
# Left-pad every encoded sentence with zeros into a fixed-width integer matrix.
n_sentences = len(sent_ints)
features = np.zeros((n_sentences, seq_len), dtype=int)
for row_idx, token_ids in enumerate(sent_ints):
    clipped = np.array(token_ids)[:seq_len]
    # Fill the last len(token_ids) columns; the leading columns keep the 0 padding.
    features[row_idx, -len(token_ids):] = clipped

In [307]:
# Spot-check one padded row against its unpadded source sentence: the row should
# be seq_len wide with the sentence's ids at the right-hand end.
print(len(features))
print(type(features))
print(features[41])
print(len(features[41]))
print(sent_ints[41])
print(len(sent_ints[41]))

36526
<type 'numpy.ndarray'>
[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0 65  9]
112
[65, 9]
2


In [308]:
# Number of labels kept after dropping empty sentences; should equal len(features).
len(labels)

36526

In [309]:
# Number of feature rows; should equal len(labels).
len(features)

36526

In [310]:
import sklearn as sk
# `import sklearn` alone does not guarantee that the `model_selection` submodule
# is loaded, so `sk.model_selection` may raise AttributeError on a fresh kernel.
# Import the function explicitly instead.
from sklearn.model_selection import train_test_split

# 70/30 train/validation split with a fixed seed for reproducibility.
train_x, val_x, train_y, val_y = train_test_split(features, labels, test_size=0.3, random_state=224)

# Keep only the first 1000 rows of each split. This matches batch_size = 1000,
# so each split yields exactly one full batch.
train_x = train_x[:1000]
train_y = train_y[:1000]
val_x = val_x[:1000]
val_y = val_y[:1000]

In [314]:
# Report the shapes of the (sub-sampled) training and validation features and labels.
print("\t\t\tFeature Shapes:")
print("Train set: \t" + str(np.shape(train_x)) + "\nValidation set: \t" + str(np.shape(val_x)))
print("Label set: \t" + str(np.shape(train_y)) + "\nValidation label set: \t" + str(np.shape(val_y)))

			Feature Shapes:
Train set: 	(1000, 112)
Validation set: 	(1000, 112)
Label set: 	(1000,)
Validation label set: 	(1000,)


In [315]:
lstm_size = 256  # hidden units in each LSTM layer
lstm_layers = 2  # number of stacked LSTM layers
batch_size = 1000  # rows per batch; also fixes the batch dimension of the RNN zero state
learning_rate = 0.01  # step size passed to AdamOptimizer

In [316]:
n_words = len(vocab_to_int) + 1 # Add 1 for 0 added to vocab

# Reset the default graph so that re-running the graph cells starts from an empty graph
tf.reset_default_graph()
with tf.name_scope('inputs'):
    # Fed with batches of rows from `features` (integer word ids, 0 = padding).
    inputs_ = tf.placeholder(tf.int32, [None, None], name="inputs")
    # Fed with the 0/1 labels reshaped to a column, one row per sentence.
    labels_ = tf.placeholder(tf.int32, [None, None], name="labels")
    # Dropout keep probability for the LSTM outputs (0.5 when training, 1 when evaluating).
    keep_prob = tf.placeholder(tf.float32, name="keep_prob")

In [317]:
# Size of the embedding vectors (number of units in the embedding layer)
embed_size = 300 

with tf.name_scope("Embeddings"):
    # Trainable embedding table, one row per word id (including id 0), randomly
    # initialised from a uniform distribution between -1 and 1.
    embedding = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
    # Replace every word id in the input batch with its embedding vector.
    embed = tf.nn.embedding_lookup(embedding, inputs_)

In [318]:
def lstm_cell():
    """Build one LSTM layer of ``lstm_size`` units with dropout on its outputs.

    The dropout keep probability is read from the ``keep_prob`` placeholder, so
    it can differ between training and evaluation runs.
    """
    base_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size, reuse=tf.get_variable_scope().reuse)
    return tf.contrib.rnn.DropoutWrapper(base_cell, output_keep_prob=keep_prob)

with tf.name_scope("RNN_layers"):
    # Stack `lstm_layers` independent LSTM cells (a fresh cell object per layer)
    cell = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(lstm_layers)])
    
    # All-zero starting state. It is built for exactly `batch_size` rows, so every
    # batch fed through the network must contain exactly `batch_size` sentences.
    initial_state = cell.zero_state(batch_size, tf.float32)

In [319]:
with tf.name_scope("RNN_forward"):
    # Run the stacked cell over the embedded sequences, starting from initial_state.
    # `outputs` holds the top layer's output at every time step; `final_state`
    # is the state of the stack after the last time step.
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed, initial_state=initial_state)

In [320]:
with tf.name_scope('predictions'):
    # Classify from the output at the last time step only. The inputs are
    # left-padded, so the last step always corresponds to a real token.
    # A single sigmoid unit gives a score in (0, 1) per sentence.
    predictions = tf.contrib.layers.fully_connected(outputs[:, -1], 1, activation_fn=tf.sigmoid)
    tf.summary.histogram('predictions', predictions)
with tf.name_scope('cost'):
    # Mean squared error between the sigmoid score and the 0/1 label.
    # NOTE(review): a cross-entropy loss is the more usual choice for a binary
    # classifier; confirm MSE is intended.
    cost = tf.losses.mean_squared_error(labels_, predictions)
    tf.summary.scalar('cost', cost)

with tf.name_scope('train'):
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

# Single op that evaluates every summary registered above (histogram + cost scalar).
merged = tf.summary.merge_all()

In [321]:
with tf.name_scope('validation'):
    # Round the sigmoid score to 0 or 1 and compare it with the integer label.
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    # Fraction of sentences in the batch that were classified correctly.
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [322]:
def get_batches(x, y, batch_size=100):
    """Yield consecutive ``(x_batch, y_batch)`` slices of ``batch_size`` items.

    Only full batches are produced: trailing items that do not fill a whole
    batch are dropped. ``x`` and ``y`` only need to support ``len`` and slicing.
    """
    n_full = len(x) // batch_size
    x, y = x[:n_full * batch_size], y[:n_full * batch_size]
    starts = range(0, len(x), batch_size)
    for lo in starts:
        hi = lo + batch_size
        yield x[lo:hi], y[lo:hi]

In [323]:
epochs = 10

saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    train_writer = tf.summary.FileWriter('./logs/tb/train', sess.graph)
    test_writer = tf.summary.FileWriter('./logs/tb/test', sess.graph)
    iteration = 1
    for e in range(epochs):
        # Start every epoch from the all-zero RNN state.
        state = sess.run(initial_state)

        # Batch names are distinct from the module-level `x`/`y` so the loaded
        # dataset is not overwritten by the last batch.
        # NOTE(review): `state` is carried from one batch into the next within an
        # epoch although batches hold unrelated sentences; confirm this is intended.
        for ii, (batch_x, batch_y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
            feed = {inputs_: batch_x,
                    labels_: np.asarray(batch_y).reshape(len(batch_y), 1),
                    keep_prob: 0.5,
                    initial_state: state}
            summary, loss, state, _ = sess.run([merged, cost, final_state, optimizer], feed_dict=feed)

            train_writer.add_summary(summary, iteration)

            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            if iteration%25==0:
                val_acc = []
                # Reuse the existing zero-state op; calling cell.zero_state() here
                # would add new ops to the graph on every validation pass.
                val_state = sess.run(initial_state)
                for val_batch_x, val_batch_y in get_batches(val_x, val_y, batch_size):
                    feed = {inputs_: val_batch_x,
                            labels_: np.asarray(val_batch_y).reshape(len(val_batch_y), 1),
                            keep_prob: 1,
                            initial_state: val_state}
                    val_summary, batch_acc, val_state = sess.run([merged, accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                    # Only validation summaries go to the test writer, tagged with
                    # the same step as the training summary they follow.
                    test_writer.add_summary(val_summary, iteration)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
        # Checkpoint once per epoch rather than after every single batch.
        saver.save(sess, "checkpoints/classification_ethan.ckpt")
    # Final checkpoint, so one exists even when the epoch loop did not run.
    saver.save(sess, "checkpoints/classification_ethan.ckpt")
    # Flush pending events to disk and release the log files.
    train_writer.close()
    test_writer.close()

('Epoch: 4/10', 'Iteration: 5', 'Train loss: 0.343')
('Epoch: 9/10', 'Iteration: 10', 'Train loss: 0.127')


In [325]:
# Restore the trained weights and measure accuracy with dropout disabled.
# NOTE(review): this is computed on val_x / val_y, i.e. the validation split,
# although the message below calls it "Test accuracy".
test_acc = []
with tf.Session() as sess:
    saver.restore(sess, "checkpoints/classification_ethan.ckpt")
    # Reuse the existing zero-state op instead of adding new ops to the graph.
    test_state = sess.run(initial_state)
    # Batch names are distinct from the module-level `x`/`y` so the loaded
    # dataset is not overwritten.
    for ii, (batch_x, batch_y) in enumerate(get_batches(val_x, val_y, batch_size), 1):
        feed = {inputs_: batch_x,
                labels_: np.asarray(batch_y).reshape(len(batch_y), 1),
                keep_prob: 1,
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))

INFO:tensorflow:Restoring parameters from checkpoints/classification_ethan.ckpt
Test accuracy: 0.524
