In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
import re
import collections
import random
import math

import warnings
warnings.filterwarnings('ignore')

In [2]:
train_input = pd.read_csv('train.csv')
test_input = pd.read_csv('test.csv')
print "train_input shape is: ", np.shape(train_input)
print "test_input shape is: ", np.shape(test_input)

train_input shape is:  (4743, 2)
test_input shape is:  (1701, 2)


In [3]:
def label_tweet(input_set):
    """Map every tweet's author handle to an integer class label.

    Args:
        input_set: mapping-like object (e.g. a pandas DataFrame) whose
            'handle' entry is an iterable of author handles.

    Returns:
        1-D numpy array with one entry per handle, in input order:
        0 for "HillaryClinton", 1 for "realDonaldTrump".

    Raises:
        ValueError: if a handle is neither of the two known authors.
            Skipping such rows would silently shift every later label
            out of alignment with its tweet.
    """
    label_of = {"HillaryClinton": 0, "realDonaldTrump": 1}
    label = []
    # Iterate over the values themselves rather than handle[i], so that a
    # frame with a non-default index (filtered, shuffled, ...) still works.
    for position, name in enumerate(input_set['handle']):
        if name not in label_of:
            raise ValueError(
                "unknown handle %r at position %d" % (name, position))
        label.append(label_of[name])
    return np.asarray(label)

# Integer class label for every training tweet, derived from its 'handle' column.
train_label = label_tweet(train_input)

In [4]:
# Raw tweet texts as numpy arrays. `.values` is used instead of
# `Series.as_matrix()`, which is deprecated and was removed in pandas 1.0;
# both return the same underlying array.
train_corpus = train_input['tweet'].values
test_corpus = test_input['tweet'].values

In [5]:
# NLTK's English stop-word list, plus 'https', which carries no signal here
# (it is what remains of tweeted links after tokenization).
stop_words = stopwords.words('english') + [u'https']

In [59]:
def tokenization(text):
    """Split one raw tweet into lower-cased, filtered word tokens.

    The text is decoded from UTF-8 and tokenized with nltk.word_tokenize.
    A token is dropped when it starts with '/', when it contains no ASCII
    letter, or when its lower-cased form is in `stop_words`.

    Returns:
        list of the surviving tokens, lower-cased, in original order.
    """
    kept = []
    for raw in nltk.word_tokenize(text.decode('utf-8')):
        lowered = raw.lower()
        # leftovers of URLs start with a slash
        if re.match('[//]', raw):
            continue
        # punctuation and pure digits contain no letter at all
        if not re.search('[a-zA-Z]', raw):
            continue
        if lowered in stop_words:
            continue
        kept.append(lowered)
    return kept

# Tokenize training set
train_corpus_tokenized = []
for i in train_corpus:
    train_corpus_tokenized.append(' '.join(tokenization(i)))

# Tokenize testing set
test_corpus_tokenized = []
for i in test_corpus:
    test_corpus_tokenized.append(' '.join(tokenization(i)))

print "After tokenization, training set and testing set look like:"
print(train_corpus_tokenized[:5])
print(test_corpus_tokenized[:5])

After tokenization, training set and testing set look like:
[u'question election put plans action make life better', u'last night donald trump said paying taxes smart know call unpatriotic', u"stand together 's nothing ca n't make sure 're ready vote", u"candidates asked 'd confront racial injustice one real answer", u'join 3pm rally tomorrow mid-america center council bluffs iowa tickets']
[u"could n't proud hillaryclinton vision command last night 's debate showed 's ready next potus", u"election important sit go make sure 're registered nationalvoterregistrationday -h", u'government people join movement today', u"national voterregistrationday make sure 're registered vote makeamericagreatagain\u2026", u'great afternoon little havana hispanic community leaders thank support imwithyou']


In [60]:
train_tokenized_word = []
for i in range(len(train_corpus_tokenized)):
    train_tokenized_word.append(tf.compat.as_str(train_corpus_tokenized[i]).split())

test_tokenized_word = []
for i in range(len(test_corpus_tokenized)):
    test_tokenized_word.append(tf.compat.as_str(test_corpus_tokenized[i]).split())
    
print "After tf.compat.as_str, training set and testing set look like:"
print(train_tokenized_word[:5])
print(test_tokenized_word[:5])

After tf.compat.as_str, training set and testing set look like:
[['question', 'election', 'put', 'plans', 'action', 'make', 'life', 'better'], ['last', 'night', 'donald', 'trump', 'said', 'paying', 'taxes', 'smart', 'know', 'call', 'unpatriotic'], ['stand', 'together', "'s", 'nothing', 'ca', "n't", 'make', 'sure', "'re", 'ready', 'vote'], ['candidates', 'asked', "'d", 'confront', 'racial', 'injustice', 'one', 'real', 'answer'], ['join', '3pm', 'rally', 'tomorrow', 'mid-america', 'center', 'council', 'bluffs', 'iowa', 'tickets']]
[['could', "n't", 'proud', 'hillaryclinton', 'vision', 'command', 'last', 'night', "'s", 'debate', 'showed', "'s", 'ready', 'next', 'potus'], ['election', 'important', 'sit', 'go', 'make', 'sure', "'re", 'registered', 'nationalvoterregistrationday', '-h'], ['government', 'people', 'join', 'movement', 'today'], ['national', 'voterregistrationday', 'make', 'sure', "'re", 'registered', 'vote', 'makeamericagreatagain\xe2\x80\xa6'], ['great', 'afternoon', 'little', 

In [61]:
cnt = collections.Counter()
for i in range(len(train_tokenized_word)):
    for word in train_tokenized_word[i]:
        cnt[word] += 1

print 'Altogether there are: ' + str((len(cnt))) + ' words'

vocabulary_size = 10000

Altogether there are: 8507 words


In [62]:
def build_dataset(cnt, words, n_words):
    """Encode tokenized sentences as lists of integer word ids.

    Args:
        cnt: collections.Counter of word frequencies.
        words: list of sentences, each a list of word strings.
        n_words: vocabulary size including the 'UNK' entry, so only the
            n_words - 1 most common words receive their own id.

    Returns:
        (data, count, dictionary, reversed_dictionary) where
        data is `words` with every word replaced by its id (0 if unknown),
        count is [['UNK', <number of unknown tokens>], (word, freq), ...],
        dictionary maps word -> id and reversed_dictionary maps id -> word.
    """
    count = [['UNK', -1]]
    count.extend(cnt.most_common(n_words - 1))

    # ids are handed out in frequency order; 'UNK' comes first and gets 0
    dictionary = {}
    for entry in count:
        dictionary[entry[0]] = len(dictionary)

    data = [[dictionary.get(token, 0) for token in sentence]
            for sentence in words]

    # every id 0 in the encoded data is an out-of-vocabulary token
    count[0][1] = sum(sentence.count(0) for sentence in data)

    reversed_dictionary = {idx: token for token, idx in dictionary.items()}
    return data, count, dictionary, reversed_dictionary

# Encode the training tweets as word-id sequences; any word outside the
# (vocabulary_size - 1) most frequent ones is mapped to id 0 ('UNK').
train_x, count, dictionary, reverse_dictionary = build_dataset(cnt, train_tokenized_word, vocabulary_size)

In [63]:
# Process testing data
test = []
for sentence in test_tokenized_word:
    cur = []
    for word in sentence:
        if(word in dictionary):
            cur.append(dictionary[word])
        else:
            cur.append(0)
    # to store corresponding label
    test.append([cur,[0, 0]])

test_length = len(test)
print "Testing data set size is: " + str((len(test))) + ", such as:"
print(test[:5])

Testing data set size is: 1701, such as:
[[[92, 6, 123, 56, 1097, 0, 44, 83, 2, 101, 1224, 2, 200, 160, 32], [0, 0]], [[108, 271, 2004, 50, 12, 148, 76, 616, 1765, 81], [0, 0]], [[1276, 9, 43, 212, 27], [0, 0]], [[118, 0, 12, 148, 76, 616, 22, 0], [0, 0]], [[5, 933, 244, 0, 2910, 606, 517, 4, 66, 168], [0, 0]]]


In [64]:
# Process training data
train_all = [[train_x[i], [train_label[i], 1-train_label[i]]] for i in range(0, len(train_x))]

# shuffle the training set in which to pick training and validation sets
r_index = list(range(len(train_all)))
random.shuffle(r_index)
train = [train_all[i] for i in r_index[:int(len(r_index)*0.8)]]
valid = [train_all[i] for i in r_index[int(len(r_index)*0.8):]]

print "Training data set size is: " + str((len(train))) + ", such as:"
print(train[:5])
print "Testing data set size is: " + str((len(valid))) + ", such as:"
print(valid[:5])

Training data set size is: 3794, such as:
[[[8, 1382, 324, 65, 39, 838], [1, 0]], [[680, 67, 1771, 847, 1961, 9, 84, 371, 522, 24, 2, 236], [0, 1]], [[713, 163, 184, 1783, 45, 71, 213, 198, 2434, 11, 667], [0, 1]], [[6453, 2655, 170, 1953, 954, 1010, 5850, 16], [0, 1]], [[443, 1652, 320, 1, 20, 127, 23, 1902, 1152], [0, 1]]]
Testing data set size is: 949, such as:
[[[14, 507, 1078], [1, 0]], [[7, 1, 2, 775, 941, 86, 119, 256, 145, 775, 941, 761], [0, 1]], [[6691, 14], [1, 0]], [[331, 149, 179, 8, 103, 25, 19, 200, 7, 76, 508], [0, 1]], [[459, 3016, 190, 99, 34, 828, 308, 159, 457, 1, 821, 13, 1394, 167, 692], [1, 0]]]


In [65]:
class SimpleDataIterator(object):
    """Serve fixed-size batches from a list, reshuffling once per epoch.

    Attributes:
        df: private shuffled copy of the samples.
        size: number of samples.
        epochs: number of completed passes over the data.
        cursor: index of the first sample of the next batch.
    """

    def __init__(self, df):
        # Work on a copy so that shuffling never reorders the caller's list.
        self.df = list(df)
        self.size = len(self.df)
        self.epochs = 0
        self.shuffle()

    def shuffle(self):
        """Shuffle the samples in place and rewind to the first one."""
        random.shuffle(self.df)
        self.cursor = 0

    def next_batch(self, n):
        """Return the next n samples as a list.

        When fewer than n samples remain, the leftovers are skipped, the
        epoch counter is incremented, the data is reshuffled and the batch
        is taken from the start of the new order.
        """
        if self.cursor + n > self.size:
            self.epochs += 1
            # A single formatted string prints the same under Python 2 and 3
            # (a two-argument print(...) shows a tuple repr in Python 2).
            print("SimpleDataIterator epoch : %d" % self.epochs)
            self.shuffle()
        res = self.df[self.cursor : self.cursor + n]
        self.cursor += n
        return res

# Batches whose id sequences are zero-padded to a common length
class PaddedDataIterator(SimpleDataIterator):
    def next_batch(self, n):
        """Return (x, y, seqlen) numpy arrays for the next n samples.

        x holds the id sequences, right-padded with 0 to the longest
        sequence of the batch; y holds the first element of each sample's
        label pair; seqlen holds one entry per sample.
        """
        # Start a new epoch when fewer than n samples remain
        if self.cursor + n > self.size:
            self.epochs += 1
            self.shuffle()
        start = self.cursor
        self.cursor = start + n
        batch = self.df[start : start + n]

        # Width of the padded matrix: the longest sequence in this batch
        width = max([0] + [len(row[0]) for row in batch])

        x = np.array([row[0] + [0] * (width - len(row[0])) for row in batch])
        y = np.array([row[1][0] for row in batch])
        # NOTE(review): every seqlen entry is the padded width, not the
        # row's own length -- confirm this is what the model should receive.
        seqlen = np.array([width for _ in range(len(batch))])

        return x, y, seqlen

In [66]:
# Demo: draw one batch of 500 validation samples and show the first five.
# Each sample is an [id_sequence, label_pair] list.
data = SimpleDataIterator(valid)
d = data.next_batch(500)
print 'Input sequences is like this:' 
print d[:5]

Input sequences is like this:
[[[37, 1413, 874, 2666, 1862, 7708, 1862, 1733, 693], [0, 1]], [[4, 120, 227, 498, 426, 528, 21, 1484], [1, 0]], [[1827, 735, 994, 3406, 7, 1, 67, 192, 202, 522, 522], [0, 1]], [[2, 54, 1644, 23, 1239, 23, 2, 30, 184, 58, 114, 7, 1], [0, 1]], [[2532, 2196, 229, 8313, 2178, 943, 235, 287, 826, 27, 60, 51, 6126, 81], [0, 1]]]


In [67]:
# Demo: draw one padded batch of 3 training samples.
# d is the (x, y, seqlen) tuple returned by PaddedDataIterator.next_batch.
data = PaddedDataIterator(train)
d = data.next_batch(3)
print 'Input sequences in one random batch is now like:'
# d[0]: id matrix, zero-padded to the longest sequence of the batch
print d[0]
print 'with shape of: '
print d[0].shape
print 'Corresponding labels are: '
# d[1]: first element of every sample's label pair
print d[1]
print 'where 0 stands for Hillary and 1 for Trump.'

Input sequences in one random batch is now like:
[[4668    8 8431    7   19   36  126   39   12   10    5    0    0    0]
 [  93 3210 2871  524  482 6282   32    2   80    0    0    0    0    0]
 [  53  366   11   12  516  983   18 1147   65  268  105  325 8368   16]]
with shape of: 
(3, 14)
Corresponding labels are: 
[1 1 0]
where 0 stands for Hillary and 1 for Trump.


In [68]:
# Same zero-padding as PaddedDataIterator, applied to an explicit list of samples
def align(data):
    """Zero-pad a list of [ids, label_pair] samples to a common length.

    Returns:
        (x, y, seq_len) numpy arrays: x holds the id sequences right-padded
        with 0 to the longest one, y the first element of every label pair,
        and seq_len the padded width repeated once per sample.
    """
    width = max([0] + [len(row[0]) for row in data])
    padded = [row[0] + [0] * (width - len(row[0])) for row in data]
    first_labels = [row[1][0] for row in data]
    widths = [width for _ in data]

    return np.array(padded), np.array(first_labels), np.array(widths)

batch_size = 256
print 'test set length = %d' % test_length
print 'batch size = %d' % batch_size
print 'so there should be %d batches' % (test_length / batch_size + 1)
print ' '

test_list = []
test_addlen = test
test_addlen.extend(test[0:batch_size])
for i in range(test_length / batch_size + 1):
    x, y, seq_len = align(test[i * batch_size : (i+1) * batch_size])
    print 'testing batch ' + str(i + 1) + ' complete, with the vector length equals ' + str(x.shape[1])
    test_list.append([x, y, seq_len])

test set length = 1701
batch size = 256
so there should be 7 batches
 
testing batch 1 complete, with the vector length equals 19
testing batch 2 complete, with the vector length equals 18
testing batch 3 complete, with the vector length equals 23
testing batch 4 complete, with the vector length equals 20
testing batch 5 complete, with the vector length equals 20
testing batch 6 complete, with the vector length equals 18
testing batch 7 complete, with the vector length equals 19


In [82]:
def reset_graph():
    """Close a module-level `sess`, if one exists and is truthy, then reset
    TensorFlow's default graph."""
    session = globals().get('sess')
    if session:
        session.close()
    tf.reset_default_graph()

# Build RNN model
def build_graph(
    vocab_size = len(dictionary),
    state_size = 32,
    batch_size = 256,
    num_classes = 2):
    """Build the GRU tweet classifier in a freshly reset default graph.

    Args:
        vocab_size: number of rows of the embedding matrix. The default,
            len(dictionary), is evaluated once, when this def statement runs.
        state_size: width of both the embedding vectors and the GRU state.
        batch_size: fixed batch dimension of all three placeholders.
        num_classes: number of columns of the softmax layer.

    Returns:
        dict holding the placeholders ('x', 'seqlen', 'y') together with
        'loss', 'ts' (the Adam training op), 'preds' (softmax output) and
        'accuracy'.
    """

    reset_graph()

    # Placeholders
    x = tf.placeholder(tf.int32, [batch_size, None]) # [batch_size, num_steps]
    seqlen = tf.placeholder(tf.int32, [batch_size])
    y = tf.placeholder(tf.int32, [batch_size])

    # Embedding layer
    embeddings = tf.get_variable('embedding_matrix', [vocab_size, state_size])
    rnn_inputs = tf.nn.embedding_lookup(embeddings, x)

    # RNN: a single GRU cell whose initial state is one trainable row,
    # zero-initialized and tiled across the batch
    cell = tf.nn.rnn_cell.GRUCell(state_size)
    init_state = tf.get_variable('init_state', [1, state_size],
                                 initializer=tf.constant_initializer(0.0))
    init_state = tf.tile(init_state, [batch_size, 1])
    rnn_outputs, final_state = tf.nn.dynamic_rnn(cell, rnn_inputs, sequence_length=seqlen,
                                                 initial_state=init_state)

    # We don't have to add dropout
    # rnn_outputs = tf.nn.dropout(rnn_outputs, keep_prob)

    # Obtain the last relevant output: rnn_outputs is flattened to
    # [-1, state_size] rows, and for batch entry k the row at
    # k * num_steps + (seqlen[k] - 1) is gathered.
    idx = tf.range(batch_size) * tf.shape(rnn_outputs)[1] + (seqlen - 1)
    last_rnn_output = tf.gather(tf.reshape(rnn_outputs, [-1, state_size]), idx)

    # finally use a Softmax layer to output a probability
    with tf.variable_scope('softmax'):
        W = tf.get_variable('W', [state_size, num_classes])
        b = tf.get_variable('b', [num_classes], initializer=tf.constant_initializer(0.0))
    logits = tf.matmul(last_rnn_output, W) + b
    preds = tf.nn.softmax(logits)
    
    # evaluate the model: accuracy is the fraction of the batch whose argmax
    # class equals y; loss is the mean sparse softmax cross-entropy
    correct = tf.equal(tf.cast(tf.argmax(preds,1),tf.int32), y)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits))
   
    # optimizer, which could be tuned
    train_step = tf.train.AdamOptimizer(1e-4).minimize(loss)

    return {
        'x': x,
        'seqlen': seqlen,
        'y': y,
        'loss': loss,
        'ts': train_step,
        'preds': preds,
        'accuracy': accuracy
    }

In [83]:
def train_graph(g, batch_size = 256, num_epochs = 15, iterator = PaddedDataIterator):
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        tr = iterator(train)
        tv = iterator(valid)

        step, accuracy = 0, 0
        tr_losses, tv_losses = [], []
        current_epoch = 0
        while current_epoch < num_epochs:
            step += 1
            batch = tr.next_batch(batch_size)
            feed = {g['x']: batch[0], g['y']: batch[1], g['seqlen']: batch[2]}
            accuracy_, _ = sess.run([g['accuracy'], g['ts']], feed_dict=feed)
            accuracy += accuracy_

            if tr.epochs > current_epoch:
                current_epoch += 1
                tr_losses.append(accuracy / step)
                step, accuracy = 0, 0

                # evaluate validation set
                tv_epoch = tv.epochs
                while tv.epochs == tv_epoch:
                    step += 1
                    batch = tv.next_batch(batch_size)
                    feed = {g['x']: batch[0], g['y']: batch[1], g['seqlen']: batch[2]}
                    accuracy_ = sess.run([g['accuracy']], feed_dict=feed)[0]
                    accuracy += accuracy_

                tv_losses.append(accuracy / step)
                step, accuracy = 0,0
                print 'Accuracy after epoch %d is: ' % current_epoch 
                print 'training: %f, validation: %f' % (tr_losses[-1], tv_losses[-1])
        print '---------- RNN training done! ----------'
            
        # make predictions with the current model
        predictions = []
        for te in test_list:
            feed = {g['x']: te[0], g['y']: te[1], g['seqlen']: te[2]}
            preds_, _ = sess.run([g['preds'], g['ts']], feed_dict=feed)
            predictions.extend(preds_)

    return tr_losses, tv_losses, predictions

In [84]:
# Build the graph with its default hyperparameters and train it.
# tr_losses / tv_losses receive what train_graph appends per epoch (mean batch
# accuracy); predictions receives the softmax outputs for the test batches.
g = build_graph()
tr_losses, tv_losses, predictions = train_graph(g)

Accuracy after epoch 1 is: 
training: 0.509115, validation: 0.490234
Accuracy after epoch 2 is: 
training: 0.505301, validation: 0.486979
Accuracy after epoch 3 is: 
training: 0.505301, validation: 0.510417
Accuracy after epoch 4 is: 
training: 0.520368, validation: 0.496094
Accuracy after epoch 5 is: 
training: 0.532645, validation: 0.507812
Accuracy after epoch 6 is: 
training: 0.537946, validation: 0.566406
Accuracy after epoch 7 is: 
training: 0.648158, validation: 0.656250
Accuracy after epoch 8 is: 
training: 0.818359, validation: 0.789062
Accuracy after epoch 9 is: 
training: 0.907087, validation: 0.880208
Accuracy after epoch 10 is: 
training: 0.922991, validation: 0.871094
Accuracy after epoch 11 is: 
training: 0.933036, validation: 0.895833
Accuracy after epoch 12 is: 
training: 0.936663, validation: 0.890625
Accuracy after epoch 13 is: 
training: 0.945312, validation: 0.897135
Accuracy after epoch 14 is: 
training: 0.946429, validation: 0.912760
Accuracy after epoch 15 is: 


In [85]:
# Show the softmax outputs of the first ten test tweets (one 2-element row each)
print 'Some predictions are like: '
print(predictions[:10])

Some predictions are like: 
[array([ 0.61423916,  0.38576075], dtype=float32), array([ 0.66434556,  0.33565447], dtype=float32), array([ 0.43157271,  0.56842732], dtype=float32), array([ 0.57348108,  0.42651895], dtype=float32), array([ 0.35469803,  0.64530194], dtype=float32), array([ 0.5702135,  0.4297865], dtype=float32), array([ 0.57474524,  0.4252547 ], dtype=float32), array([ 0.65039474,  0.34960526], dtype=float32), array([ 0.59814698,  0.40185308], dtype=float32), array([ 0.53202361,  0.46797639], dtype=float32)]


In [86]:
import csv

# One row per real test tweet. Only the first test_length predictions are
# used; anything after that comes from the samples that pad the last batch.
# Class index 1 is realDonaldTrump and index 0 is HillaryClinton.
data = [(i, predictions[i][1], predictions[i][0]) for i in range(test_length)]

# `with` closes the file even if writing fails; open() replaces the
# Python 2-only file() builtin ('wb' is what the Python 2 csv module expects).
with open('csvtest11.csv', 'wb') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['id', 'realDonaldTrump', 'HillaryClinton'])
    writer.writerows(data)