In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
import re
import collections
import random
import math

In [2]:
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_column', None)

# Read the training and test tweets from the working directory.
train_input, test_input = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Report (rows, columns) for both frames.
print("train_input shape is: ", train_input.shape)
print("test_input shape is: ", test_input.shape)

('train_input shape is: ', (4743, 2))
('test_input shape is: ', (1701, 2))


In [3]:
def label_tweet(input_set):
    """Map each tweet's author handle to an integer class label.

    Parameters
    ----------
    input_set : pandas.DataFrame or mapping
        Must provide a ``'handle'`` column (or sequence) whose entries are
        ``"HillaryClinton"`` or ``"realDonaldTrump"``.

    Returns
    -------
    numpy.ndarray
        One label per row, in row order: 0 represents HillaryClinton and
        1 represents realDonaldTrump.

    Raises
    ------
    ValueError
        If a handle is neither of the two known accounts. Skipping such a
        row would make the label array shorter than the input and shift
        every following label onto the wrong tweet.
    """
    handle_to_label = {"HillaryClinton": 0, "realDonaldTrump": 1}
    label = []
    # Iterate over the values themselves: indexing with handle[i] is a
    # label-based lookup on a pandas Series and fails (or picks the wrong
    # row) when the index is not the default 0..n-1 range.
    for position, handle in enumerate(input_set['handle']):
        if handle not in handle_to_label:
            raise ValueError(
                "unknown handle %r at row position %d" % (handle, position))
        label.append(handle_to_label[handle])
    return np.asarray(label)

# Integer labels for the training tweets (0 = HillaryClinton, 1 = realDonaldTrump);
# print the first ten as a quick sanity check.
train_label = label_tweet(train_input)
print(train_label[:10])

[0 0 0 0 1 0 0 0 1 1]


In [4]:
# Pull the raw tweet text out of each frame as a NumPy array.
# `.values` is used instead of `Series.as_matrix()`, which is deprecated and
# has been removed from newer pandas releases; both yield the same ndarray.
train_corpus = train_input['tweet'].values
print(np.shape(train_corpus))
print(train_corpus[:5])
test_corpus = test_input['tweet'].values
print(np.shape(test_corpus))

(4743,)
[ 'The question in this election: Who can put the plans into action that will make your life better? https://t.co/XreEY9OicG'
 'Last night, Donald Trump said not paying taxes was "smart." You know what I call it? Unpatriotic. https://t.co/t0xmBfj7zF'
 "If we stand together, there's nothing we can't do. \n\nMake sure you're ready to vote: https://t.co/tTgeqxNqYm https://t.co/Q3Ymbb7UNy"
 "Both candidates were asked about how they'd confront racial injustice. Only one had a real answer. https://t.co/sjnEokckis"
 'Join me for a 3pm rally - tomorrow at the Mid-America Center in Council Bluffs, Iowa! Tickets:\xe2\x80\xa6 https://t.co/dfzsbICiXc']
(1701,)


In [5]:
# English stop words from NLTK, extended with 'https', which carries no
# signal in these tweets.
stop_words = stopwords.words('english') + [u'https']
print(stop_words)

[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself', u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u

In [7]:
# Tokenize
def tokenization(text):
    """Split one tweet into lower-cased word tokens.

    The text is tokenized with ``nltk.word_tokenize``. A token is dropped when
    it starts with ``/`` (the ``//host/path`` remainder of a tokenized URL),
    when it contains no ASCII letter (punctuation, pure digits), or when its
    lower-case form is in the module-level ``stop_words``.

    Parameters
    ----------
    text : bytes or unicode text
        The raw tweet. UTF-8 encoded bytes are decoded first; text that is
        already decoded is used as is.

    Returns
    -------
    list
        The surviving tokens, lower-cased, in their original order.
    """
    # Only decode real byte strings. Calling .decode() on already-decoded
    # text raises (AttributeError on Python 3, UnicodeEncodeError for
    # non-ASCII unicode on Python 2).
    if isinstance(text, bytes):
        text = text.decode('utf-8')

    tokens = []
    for word in nltk.word_tokenize(text):
        lowered = word.lower()
        # skip all the websites, punctuations, pure digits
        if not re.match('[//]', word) and re.search('[a-zA-Z]', word) and lowered not in stop_words:
            tokens.append(lowered)
    return tokens

# Tokenize the training set: each tweet becomes one space-joined token string.
train_corpus_tokenized = [' '.join(tokenization(tweet)) for tweet in train_corpus]

print(len(train_corpus_tokenized))
print(train_corpus_tokenized[:5])

# Tokenize the testing set the same way.
test_corpus_tokenized = [' '.join(tokenization(tweet)) for tweet in test_corpus]

print(len(test_corpus_tokenized))
print(test_corpus_tokenized[:5])

4743
[u'question election put plans action make life better', u'last night donald trump said paying taxes smart know call unpatriotic', u"stand together 's nothing ca n't make sure 're ready vote", u"candidates asked 'd confront racial injustice one real answer", u'join 3pm rally tomorrow mid-america center council bluffs iowa tickets']
1701
[u"could n't proud hillaryclinton vision command last night 's debate showed 's ready next potus", u"election important sit go make sure 're registered nationalvoterregistrationday -h", u'government people join movement today', u"national voterregistrationday make sure 're registered vote makeamericagreatagain\u2026", u'great afternoon little havana hispanic community leaders thank support imwithyou']


In [8]:
# Turn every cleaned training tweet into a list of native-str words.
train_tokenized_word = [
    tf.compat.as_str(sentence).split() for sentence in train_corpus_tokenized
]
print(len(train_tokenized_word))
print(train_tokenized_word[:5])

# Upper bound on vocabulary entries (including the UNK slot).
vocabulary_size = 10000

# Same conversion for the test tweets.
test_tokenized_word = [
    tf.compat.as_str(sentence).split() for sentence in test_corpus_tokenized
]
print(len(test_tokenized_word))
print(test_tokenized_word[:5])

4743
[['question', 'election', 'put', 'plans', 'action', 'make', 'life', 'better'], ['last', 'night', 'donald', 'trump', 'said', 'paying', 'taxes', 'smart', 'know', 'call', 'unpatriotic'], ['stand', 'together', "'s", 'nothing', 'ca', "n't", 'make', 'sure', "'re", 'ready', 'vote'], ['candidates', 'asked', "'d", 'confront', 'racial', 'injustice', 'one', 'real', 'answer'], ['join', '3pm', 'rally', 'tomorrow', 'mid-america', 'center', 'council', 'bluffs', 'iowa', 'tickets']]
1701
[['could', "n't", 'proud', 'hillaryclinton', 'vision', 'command', 'last', 'night', "'s", 'debate', 'showed', "'s", 'ready', 'next', 'potus'], ['election', 'important', 'sit', 'go', 'make', 'sure', "'re", 'registered', 'nationalvoterregistrationday', '-h'], ['government', 'people', 'join', 'movement', 'today'], ['national', 'voterregistrationday', 'make', 'sure', "'re", 'registered', 'vote', 'makeamericagreatagain\xe2\x80\xa6'], ['great', 'afternoon', 'little', 'havana', 'hispanic', 'community', 'leaders', 'thank',

In [9]:
# Count how often each word occurs across all training tweets.
cnt = collections.Counter()
for sentence in train_tokenized_word:
    cnt.update(sentence)

# Number of distinct words seen in training.
print(len(cnt))

8507


In [10]:
def build_dataset(cnt, words, n_words):
    """Encode tokenized sentences as integer ids using a frequency-ranked vocabulary.

    Parameters
    ----------
    cnt : collections.Counter
        Word frequencies used to rank the vocabulary.
    words : list of list of str
        Tokenized sentences to encode.
    n_words : int
        Vocabulary size including the ``'UNK'`` entry; the ``n_words - 1``
        most common words of ``cnt`` are kept.

    Returns
    -------
    tuple
        ``(data, data_all, count, dictionary, reversed_dictionary)`` where
        ``data`` holds one id list per sentence, ``data_all`` is the same ids
        flattened, ``count`` is ``[['UNK', n_unknown], (word, freq), ...]``,
        ``dictionary`` maps word -> id (``'UNK'`` is 0) and
        ``reversed_dictionary`` maps id -> word.
    """
    # Slot 0 is reserved for unknown words; its count is filled in below.
    count = [['UNK', -1]] + cnt.most_common(n_words - 1)

    # Ids are handed out in order of first appearance in `count`.
    dictionary = {}
    for token, _freq in count:
        dictionary[token] = len(dictionary)

    # Words missing from the vocabulary fall back to id 0 (UNK).
    data = [[dictionary.get(token, 0) for token in sentence] for sentence in words]
    data_all = [token_id for sentence_ids in data for token_id in sentence_ids]

    # Every id 0 in the encoded corpus is one unknown-word occurrence.
    count[0][1] = data_all.count(0)

    reversed_dictionary = {token_id: token for token, token_id in dictionary.items()}
    return data, data_all, count, dictionary, reversed_dictionary

# Encode the training sentences as id lists and build the word <-> id lookup tables.
train_x, data_all, count, dictionary, reverse_dictionary = build_dataset(cnt, train_tokenized_word, vocabulary_size)

In [46]:
# Encode the test sentences with the training vocabulary; words not in
# `dictionary` become id 0. Each row is paired with a placeholder label [0, 0].
test = [
    [[dictionary.get(word, 0) for word in sentence], [0, 0]]
    for sentence in test_tokenized_word
]

print(len(test))
print(test[:5])
test_length = len(test)
print(test_length)

1701
[[[92, 6, 123, 56, 1097, 0, 44, 83, 2, 101, 1224, 2, 200, 160, 32], [0, 0]], [[108, 271, 2004, 50, 12, 148, 76, 616, 1765, 81], [0, 0]], [[1276, 9, 43, 212, 27], [0, 0]], [[118, 0, 12, 148, 76, 616, 22, 0], [0, 0]], [[5, 933, 244, 0, 2910, 606, 517, 4, 66, 168], [0, 0]]]
1701


In [47]:
# Pair every encoded training tweet with [label, 1 - label].
train_all = [
    [sentence_ids, [train_label[row], 1-train_label[row]]]
    for row, sentence_ids in enumerate(train_x)
]

# Shuffle the row positions and split them 80% / 20% into train / validation.
r_index = list(range(len(train_all)))
random.shuffle(r_index)
split_at = int(len(r_index)*0.8)
train = [train_all[row] for row in r_index[:split_at]]
valid = [train_all[row] for row in r_index[split_at:]]

print(len(train))
print(train[:5])
print(len(valid))
print(valid[:5])
print(len(count))

3794
[[[7081, 316, 751, 686, 7795, 497, 1033, 1687, 5289, 182, 6696, 316, 2912, 106, 497, 1033, 6139, 500, 3575, 1261, 4295, 4510], [0, 1]], [[288, 1446, 34, 1283, 1225, 493, 1961, 9, 1771, 2, 2521], [0, 1]], [[7, 1, 37, 565, 73, 373, 321, 2, 7391, 2470, 514, 108, 6, 24], [0, 1]], [[344, 174, 2131, 40, 1602, 36, 362, 2290, 71, 1619, 613, 1896, 334, 3229, 535], [1, 0]], [[5, 579, 1038, 218, 179, 28, 3, 6884, 334, 132, 382], [1, 0]]]
949
[[[5203, 66, 3815, 50, 277, 150, 283, 3140], [1, 0]], [[3074, 105, 44, 202, 38, 3074, 11], [0, 1]], [[6734, 17, 118, 862, 2770, 735, 354, 7798, 143, 25, 1032, 6, 71], [1, 0]], [[1, 5271, 962, 1728, 689, 379, 1077, 67, 192, 985, 1026], [0, 1]], [[500, 6537, 2834, 2976, 1897, 4904, 1261, 592, 7115, 4004, 5268], [0, 1]]]
8508


In [48]:
class SimpleDataIterator(object):
    """Serve shuffled mini-batches from a list, reshuffling at each epoch end.

    Attributes (all set in ``__init__`` / ``shuffle``):
        df      -- private shuffled copy of the rows
        size    -- number of rows
        epochs  -- number of completed passes over the data
        cursor  -- position of the next batch inside ``df``
    """

    def __init__(self, df):
        """Copy ``df`` (any sequence of rows) and shuffle the copy."""
        # Work on a private copy: random.shuffle operates in place, and
        # shuffling the caller's list would silently reorder data that other
        # cells have already displayed or still iterate over.
        self.df = list(df)
        self.size = len(self.df)
        self.epochs = 0
        self.shuffle()

    def shuffle(self):
        """Shuffle the rows in place and rewind the cursor to the start."""
        random.shuffle(self.df)
        self.cursor = 0

    def next_batch(self, n):
        """Return the next ``n`` rows as a list.

        When fewer than ``n`` rows remain, the leftover rows are dropped, the
        epoch counter is incremented, the data is reshuffled and the batch is
        taken from the start of the new ordering. If ``n`` exceeds the total
        number of rows, the returned batch holds only ``size`` rows.
        """
        if self.cursor+n > self.size:
            self.epochs += 1
            print("SimpleDataIterator epoch : ", self.epochs)
            self.shuffle()
        res = self.df[self.cursor:self.cursor+n]
        self.cursor += n
        return res

In [49]:
# Smoke test: draw one batch of 500 rows from the validation set and show the first five.
data = SimpleDataIterator(valid)
d = data.next_batch(500)
print('Input sequences : ', d[:5])

('Input sequences : ', [[[1145, 127, 4974, 109, 2023, 222, 35, 940, 45, 12, 10, 5], [1, 0]], [[93, 823, 3735, 15, 2, 7165, 3075, 47, 7299, 979, 144, 2166, 19, 22, 458, 2488], [1, 0]], [[4], [0, 1]], [[8111, 316, 949, 1178, 3374, 1157, 1739, 1362, 1123], [0, 1]], [[2063, 1, 2, 5181, 959, 140, 14], [1, 0]]])


In [50]:
class PaddedDataIterator(SimpleDataIterator):
    """Batch iterator that zero-pads every sequence to the widest one in the batch."""

    def next_batch(self, n):
        """Return ``(x, y, seqlen)`` for the next ``n`` rows.

        Each row is ``[token_ids, label_pair]``. ``x`` stacks the token id
        lists right-padded with 0, ``y`` holds ``label_pair[0]`` for every
        row, and ``seqlen`` repeats the padded width once per row.

        At the end of an epoch the leftover rows are dropped, the data is
        reshuffled and the batch starts from the new ordering.
        """
        if self.cursor+n > self.size:
            self.epochs += 1
            self.shuffle()
        start = self.cursor
        self.cursor = start + n
        batch = self.df[start:start+n]

        # Width of the longest sequence in this batch (0 for an empty batch).
        widest = max([len(row[0]) for row in batch] or [0])

        # NOTE(review): seqlen carries the padded width for every row, not each
        # row's own length, so a consumer cannot tell padding from real tokens.
        seqlen = np.array([widest] * len(batch))

        # Pad sequences with 0s so they are all the same length.
        padded = [row[0] + [0]*(widest-len(row[0])) for row in batch]
        first_labels = [row[1][0] for row in batch]

        return np.array(padded), np.array(first_labels), seqlen

In [51]:
# Smoke test: draw a batch of 3 padded training rows and inspect the three
# returned arrays (inputs, labels, sequence lengths) and their shapes.
data = PaddedDataIterator(train)
d = data.next_batch(3)
print('Input sequences\n', d[0])
print(d[0].shape)
print(d[1])
print(d[1].shape)
print(d[2])

('Input sequences\n', array([[ 175, 8339,    1, 8315, 3315, 7494,   13, 3938, 6364,  110,   29,
          95,  432],
       [  60,  125,  434,  268,  438,   90,  370,    1, 1079,    0,    0,
           0,    0],
       [ 904,   39, 8277,  698,    3, 2730,   39, 2386, 2087,  681, 3689,
           0,    0]]))
(3, 13)
[1 0 0]
(3,)
[13 13 13]


In [52]:
# Mini-batch size used when slicing the test set into fixed-size batches.
batch_size = 256

In [53]:
def align(data):
    """Zero-pad a list of ``[token_ids, label_pair]`` rows into dense arrays.

    Prints the number of rows and the padded width, then returns
    ``(x, y, seq_len)``: ``x`` stacks the token id lists right-padded with 0
    to the widest row, ``y`` holds ``label_pair[0]`` of every row, and
    ``seq_len`` repeats the padded width once per row.
    """
    print(len(data))
    widest = max([len(row[0]) for row in data] or [0])
    print(widest)

    padded = [row[0] + [0]*(widest-len(row[0])) for row in data]
    first_labels = [row[1][0] for row in data]

    # NOTE(review): seq_len carries the padded width for every row, not each
    # row's own length.
    seq_len = np.array([widest] * len(data))

    return np.array(padded), np.array(first_labels), seq_len

# def xx():
#     max_len = 0
#     for row in res:
#         if len(row[0]) > max_len:
#             max_len = len(row[0])
#     seqlen = np.array([max_len for i in range(len(res))])
#     ret = []
#     label = []
#     for row in res:
#         ret += [row[0] + [0]*(max_len-len(row[0]))]
#         label.append(row[1][0])
#     x = np.array(ret)
#     y = np.array(label)

# test_length = 1701
print(test_length)
print(batch_size)
# Floor division keeps this an integer on both Python 2 and Python 3.
print(test_length // batch_size)

# The graph expects full batches of exactly `batch_size` rows, so the last
# partial batch is topped up with rows wrapped around from the start.
# Build a new list instead of aliasing `test` and calling extend() on it,
# which would grow `test` itself every time this cell is re-run.
test_addlen = test + test[:batch_size]

test_list = []
for i in range(test_length // batch_size + 1):
    print(i)
    x, y, seq_len = align(test_addlen[i*batch_size:(i+1)*batch_size])
    print(x.shape, ", ", y.shape, ", ", seq_len.shape)
    # One [inputs, labels, sequence lengths] triple per batch.
    test_list.append([x, y, seq_len])

print("=====")
print(len(test_list[0]))
    

# test_align, max_len = align(test)
# print(len(test_align))

1701
256
6
0
256
19
((256, 19), ', ', (256,), ', ', (256,))
1
256
18
((256, 18), ', ', (256,), ', ', (256,))
2
256
23
((256, 23), ', ', (256,), ', ', (256,))
3
256
20
((256, 20), ', ', (256,), ', ', (256,))
4
256
20
((256, 20), ', ', (256,), ', ', (256,))
5
256
18
((256, 18), ', ', (256,), ', ', (256,))
6
256
19
((256, 19), ', ', (256,), ', ', (256,))
=====
3


In [54]:
def reset_graph():
    """Close a leftover module-level ``sess`` (if any) and clear the default TF graph."""
    leftover = globals().get('sess')
    if leftover:
        leftover.close()
    tf.reset_default_graph()

def build_graph(
    vocab_size = len(dictionary),
    state_size = 64,
    batch_size = 256,
    num_classes = 2):
    """Build the embedding -> GRU -> softmax classifier graph.

    The default graph is reset first (via ``reset_graph``), so any previously
    built graph is discarded.

    Parameters
    ----------
    vocab_size : int
        Number of rows of the embedding matrix. The default,
        ``len(dictionary)``, is evaluated once, when this ``def`` is executed.
    state_size : int
        Width of the embedding vectors and of the GRU cell state.
    batch_size : int
        Fixed first dimension of all three placeholders; every fed batch must
        contain exactly this many rows.
    num_classes : int
        Number of output classes (columns of the softmax weight matrix).

    Returns
    -------
    dict
        Placeholders ``'x'``, ``'seqlen'``, ``'y'`` and the tensors/ops
        ``'loss'``, ``'ts'`` (train step), ``'preds'`` (softmax output) and
        ``'accuracy'``.
    """

    reset_graph()

    # Placeholders
    x = tf.placeholder(tf.int32, [batch_size, None]) # [batch_size, num_steps]
    seqlen = tf.placeholder(tf.int32, [batch_size])
    y = tf.placeholder(tf.int32, [batch_size])
#     keep_prob = tf.constant(1.0)
#     print("====",x.shape)
#     print("====",y.shape)
#     print("====",seqlen.shape)

    # Embedding layer: map each token id in x to a state_size-wide vector
    embeddings = tf.get_variable('embedding_matrix', [vocab_size, state_size])
    rnn_inputs = tf.nn.embedding_lookup(embeddings, x)

    # RNN: a single GRU cell whose initial state is one learned row,
    # tiled so that every example in the batch starts from the same state
    cell = tf.nn.rnn_cell.GRUCell(state_size)
    init_state = tf.get_variable('init_state', [1, state_size],
                                 initializer=tf.constant_initializer(0.0))
    init_state = tf.tile(init_state, [batch_size, 1])
    rnn_outputs, final_state = tf.nn.dynamic_rnn(cell, rnn_inputs, sequence_length=seqlen,
                                                 initial_state=init_state)

    # Add dropout, as the model otherwise quickly overfits
#     rnn_outputs = tf.nn.dropout(rnn_outputs, keep_prob)

    # NOTE(review): the triple-quoted text below is a bare string expression,
    # not a comment; its `range(30)` presumably refers to a batch size used
    # in the code this was adapted from -- confirm.
    """
    Obtain the last relevant output. The best approach in the future will be to use:

        last_rnn_output = tf.gather_nd(rnn_outputs, tf.pack([tf.range(batch_size), seqlen-1], axis=1))

    which is the Tensorflow equivalent of numpy's rnn_outputs[range(30), seqlen-1, :], but the
    gradient for this op has not been implemented as of this writing.

    The below solution works, but throws a UserWarning re: the gradient.
    """
    # Flatten rnn_outputs to [batch_size * num_steps, state_size]; for example
    # b the wanted row is b * num_steps + (seqlen[b] - 1).
    idx = tf.range(batch_size)*tf.shape(rnn_outputs)[1] + (seqlen - 1)
    last_rnn_output = tf.gather(tf.reshape(rnn_outputs, [-1, state_size]), idx)

    # Softmax layer
    with tf.variable_scope('softmax'):
        W = tf.get_variable('W', [state_size, num_classes])
        b = tf.get_variable('b', [num_classes], initializer=tf.constant_initializer(0.0))
    logits = tf.matmul(last_rnn_output, W) + b
    preds = tf.nn.softmax(logits)
    # A prediction is correct when the arg-max class equals the integer label in y
    correct = tf.equal(tf.cast(tf.argmax(preds,1),tf.int32), y)
#     print(correct.shape)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
#     print(accuracy.shape)

    # y holds integer class ids, hence the sparse cross-entropy variant
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(loss)

    return {
        'x': x,
        'seqlen': seqlen,
        'y': y,
#         'dropout': keep_prob,
        'loss': loss,
        'ts': train_step,
        'preds': preds,
        'accuracy': accuracy
    }

In [60]:
def train_graph(graph, batch_size = 256, num_epochs = 30, iterator = PaddedDataIterator):
    """Train the model, track per-epoch accuracy and predict the test batches.

    Reads the module-level ``train``, ``valid`` and ``test_list``.

    Parameters
    ----------
    graph : dict
        The dictionary returned by ``build_graph`` (keys ``'x'``, ``'y'``,
        ``'seqlen'``, ``'accuracy'``, ``'ts'``, ``'preds'``).
    batch_size : int
        Rows per batch; must match the batch size the graph was built with.
    num_epochs : int
        Number of passes over the training data.
    iterator : class
        Batch iterator class; instantiated once for ``train`` and once for
        ``valid``. ``next_batch(n)`` must return ``(x, y, seqlen)``.

    Returns
    -------
    tuple
        ``(tr_losses, tv_losses, predictions)``. Despite their names, the
        first two lists hold the mean *accuracy* per epoch on the training
        and validation data. ``predictions`` is a list with one softmax
        output row per row of every batch in ``test_list``.
    """
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        tr = iterator(train)
        tv = iterator(valid)

        step, accuracy = 0, 0
        tr_losses, tv_losses = [], []
        current_epoch = 0
        while current_epoch < num_epochs:
            step += 1
            batch = tr.next_batch(batch_size)
            # Use the `graph` argument rather than the notebook-level `g`,
            # so the function trains the graph it was actually given.
            feed = {graph['x']: batch[0], graph['y']: batch[1], graph['seqlen']: batch[2]}
            accuracy_, _ = sess.run([graph['accuracy'], graph['ts']], feed_dict=feed)
            accuracy += accuracy_

            if tr.epochs > current_epoch:
                current_epoch += 1
                tr_losses.append(accuracy / step)
                step, accuracy = 0, 0

                # Evaluate on the validation set until its iterator wraps
                # around; no train step is run here.
                tv_epoch = tv.epochs
                while tv.epochs == tv_epoch:
                    step += 1
                    batch = tv.next_batch(batch_size)
                    feed = {graph['x']: batch[0], graph['y']: batch[1], graph['seqlen']: batch[2]}
                    accuracy_ = sess.run([graph['accuracy']], feed_dict=feed)[0]
                    accuracy += accuracy_

                tv_losses.append(accuracy / step)
                step, accuracy = 0,0
                print("Accuracy after epoch", current_epoch, " - tr:", tr_losses[-1], "- tv:", tv_losses[-1])

        predictions = []
        for te in test_list:
            # Inference only: fetch the softmax output and nothing else.
            # Fetching the train op here would update the weights on the test
            # batches using their dummy all-zero labels, pushing the model
            # towards class 0 while it is being evaluated.
            feed = {graph['x']: te[0], graph['seqlen']: te[2]}
            preds_ = sess.run(graph['preds'], feed_dict=feed)
            print(len(preds_))
            predictions.extend(preds_)

    return tr_losses, tv_losses, predictions

In [61]:
# Build the graph with its default hyper-parameters, train it, and collect the
# per-epoch accuracies plus the test-set predictions.
g = build_graph()
tr_losses, tv_losses, predictions = train_graph(g)
print(len(predictions))

('Accuracy after epoch', 1, ' - tr:', 0.50312500000000004, '- tv:', 0.5517578125)
('Accuracy after epoch', 2, ' - tr:', 0.537109375, '- tv:', 0.52864583333333337)
('Accuracy after epoch', 3, ' - tr:', 0.5421316964285714, '- tv:', 0.59765625)
('Accuracy after epoch', 4, ' - tr:', 0.66015625, '- tv:', 0.77604166666666663)
('Accuracy after epoch', 5, ' - tr:', 0.8797433035714286, '- tv:', 0.85286458333333337)
('Accuracy after epoch', 6, ' - tr:', 0.8869977678571429, '- tv:', 0.86067708333333337)
('Accuracy after epoch', 7, ' - tr:', 0.935546875, '- tv:', 0.87239583333333337)
('Accuracy after epoch', 8, ' - tr:', 0.9405691964285714, '- tv:', 0.90364583333333337)
('Accuracy after epoch', 9, ' - tr:', 0.9458705357142857, '- tv:', 0.88932291666666663)
('Accuracy after epoch', 10, ' - tr:', 0.9497767857142857, '- tv:', 0.89973958333333337)
('Accuracy after epoch', 11, ' - tr:', 0.9506138392857143, '- tv:', 0.91536458333333337)
('Accuracy after epoch', 12, ' - tr:', 0.9542410714285714, '- tv:',

In [62]:
# Preview the first ten softmax outputs; each row is ordered by class index.
# print(predictions[:test_length])
print(predictions[:10])

[array([ 0.99508613,  0.00491382], dtype=float32), array([ 0.99898177,  0.00101818], dtype=float32), array([  8.36639025e-04,   9.99163389e-01], dtype=float32), array([ 0.99675131,  0.00324868], dtype=float32), array([ 0.00122388,  0.99877614], dtype=float32), array([ 0.9981522 ,  0.00184773], dtype=float32), array([ 0.99545527,  0.00454476], dtype=float32), array([ 0.99864632,  0.00135366], dtype=float32), array([ 0.99890935,  0.0010906 ], dtype=float32), array([ 0.036767  ,  0.96323293], dtype=float32)]


In [71]:
import csv
import sys

# Each prediction row is ordered by class index, and the labels were encoded
# as 0 = HillaryClinton, 1 = realDonaldTrump. The header lists realDonaldTrump
# first, so column 1 of the prediction must be written before column 0;
# writing them in index order would put Clinton's probability under Trump's name.
data = []
for i in range(test_length):
    data.append((i, predictions[i][1], predictions[i][0]))

# The csv module wants a binary file on Python 2 and a text file opened with
# newline='' on Python 3.
if sys.version_info[0] < 3:
    csvfile = open('csvtest1.csv', 'wb')
else:
    csvfile = open('csvtest1.csv', 'w', newline='')

# The context manager closes the file even if writing fails part-way.
with csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['id', 'realDonaldTrump', 'HillaryClinton'])
    writer.writerows(data)