# Sentiment analysis using NLTK (Natural Language Toolkit) and Tensorflow

First we need to import all the necessary libraries.

In [None]:
import csv
import os
import random
import string
import sys

import matplotlib as mpl
import numpy as np
import pandas as pd
import tensorflow as tf
import tweepy
from nltk.tokenize import word_tokenize as wt
from unidecode import unidecode

## Connection with twitter

In [None]:
# Twitter application credentials. Both are left as None here and have to be
# filled in before the cells below can query the API.
consumer_key = None
consumer_secret = None

# Build the OAuth handler from the application credentials and wrap it in the
# tweepy API client used by the search cell below.
auth = tweepy.OAuthHandler(consumer_key=consumer_key, consumer_secret=consumer_secret)
api = tweepy.API(auth)

Query recent tweets mentioning @nationalbank within a 10-mile radius of Montreal and put the results in a DataFrame.

In [None]:
# Search for recent tweets mentioning @nationalbank, restricted to a 10-mile
# radius around Montreal, and keep at most 5000 of them.
#%23NBC as #NBC
#45.5191385,-73.6103499 is montreal geocode
tweet_cursor = tweepy.Cursor(api.search, q='@nationalbank', result_type='recent',
                             geocode='45.5191385,-73.6103499,10mi')
results = list(tweet_cursor.items(5000))

# One row per tweet, one column per attribute of interest.
# `pd` is pandas; it must be imported at the top of the notebook.
resultDS = pd.DataFrame()

resultDS['userName'] = [tweet.user.name for tweet in results]
resultDS['tweetText'] = [tweet.text for tweet in results]
resultDS['tweetRetweetCt'] = [tweet.retweet_count for tweet in results]
resultDS['tweetCreated'] = [tweet.created_at for tweet in results]
resultDS['userLocation'] = [tweet.user.location for tweet in results]
resultDS['geo'] = [tweet.geo for tweet in results]
resultDS['place'] = [tweet.place for tweet in results]
resultDS['coordinates'] = [tweet.coordinates for tweet in results]
resultDS['userTimezone'] = [tweet.user.time_zone for tweet in results]

## Start of data preparation 

In [None]:
# Our alphabet
emb_alphabet = 'abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:\'"/\\|_@#$%^&*~`+-=<>()[]{} '

# we associate every character in our alphabet to a number: 
# e.g. b => 1 d => 3 etc.
DICT = {ch: ix for ix, ch in enumerate(emb_alphabet)}

# The size of our alphabet (~70)
ALPHABET_SIZE = len(emb_alphabet)

def reshape_lines(lines):
    """Reduce raw dataset lines to ``(first_field, last_field)`` pairs.

    Each line is expected to look like ``"a","b",...,"z"`` followed by an
    optional line terminator.  This is a deliberately simple splitter on the
    ``","`` separator, not a full CSV parser: the opening quote of the first
    field and the closing quote of the last field are dropped.

    Args:
        lines: iterable of text lines.

    Returns:
        A list with one ``(first_field, last_field)`` tuple per input line.
    """
    data = []
    for line in lines:
        # Strip the line terminator explicitly instead of blindly cutting two
        # characters: a final line without a trailing newline would otherwise
        # lose a real character, and '\r\n' endings would keep a stray quote.
        fields = line.rstrip('\r\n').split('","')
        last = fields[-1]
        if last.endswith('"'):
            last = last[:-1]
        data.append((fields[0][1:], last))
    return data

def save_csv(out_file, data):
    """Write the rows in ``data`` to ``out_file`` as CSV and print the path.

    Args:
        out_file: path of the file to create (an existing file is overwritten).
        data: iterable of rows, each row an iterable of fields.
    """
    # The csv module wants a text file opened with newline='' on Python 3 and
    # a binary file on Python 2; opening with 'wb' on Python 3 makes
    # writerows() fail with a TypeError.
    if sys.version_info[0] >= 3:
        f = open(out_file, 'w', newline='')
    else:
        f = open(out_file, 'wb')
    with f:
        csv.writer(f).writerows(data)
    print('Data saved to file: %s' % out_file)
    

In [None]:
def shuffle_datasets(valid_perc=0.05):
    """Shuffle the raw datasets and write the train/valid/test CSV files.

    Reads the files at TRAIN_SET and TEST_SET (the paths of
    training.1600000.processed.noemoticon.csv and
    testdata.manual.2009.06.14.csv), shuffles their lines and writes
    "valid_set.csv", "train_set.csv" and "test_set.csv" under
    PATH + 'datasets/'.

    Args:
        valid_perc: fraction of the shuffled training lines that goes to the
            validation set (5% by default); the rest stays in the training set.
    """
    # Fail early, with a pointer to the download page, when a source file is missing
    assert os.path.exists(TRAIN_SET), 'Download the training set at http://help.sentiment140.com/for-students/'
    assert os.path.exists(TEST_SET), 'Download the testing set at http://help.sentiment140.com/for-students/'

    print('Creating training & validation set...')

    with open(TRAIN_SET, 'r') as train_file:
        rows = train_file.readlines()
    random.shuffle(rows)

    # Everything before the cut is training data, everything after is validation data
    cut = int(len(rows) * (1 - valid_perc))
    train_rows, valid_rows = rows[:cut], rows[cut:]

    save_csv(PATH + 'datasets/valid_set.csv', reshape_lines(valid_rows))
    save_csv(PATH + 'datasets/train_set.csv', reshape_lines(train_rows))

    print('Creating testing set...')

    with open(TEST_SET, 'r') as test_file:
        rows = test_file.readlines()
    random.shuffle(rows)

    save_csv(PATH + 'datasets/test_set.csv', reshape_lines(rows))
    print('All datasets have been created!')

In [None]:
# Once this is done, we rename the new training and testing set...
# From here on the three names point at the files written by
# shuffle_datasets(). That function reads TRAIN_SET and TEST_SET when it is
# called, so it has to run BEFORE this cell: afterwards it would read the very
# files it is about to overwrite.
# NOTE(review): PATH is not defined in the visible part of the notebook -
# confirm it is set (with a trailing separator) before this cell runs.
TRAIN_SET = PATH + 'datasets/train_set.csv'
TEST_SET = PATH + 'datasets/test_set.csv'
VALID_SET = PATH + 'datasets/valid_set.csv'

In [None]:
    def encode_one_hot(self, sentence):
        """Encode a sentence as a stack of one-hot character matrices.

        The sentence is reduced to the characters found in
        ``string.printable``, passed through ``unidecode`` and split into
        words with NLTK's ``word_tokenize`` (imported in this file as ``wt``).
        Every word becomes a ``[max_word_length x ALPHABET_SIZE]`` matrix whose
        row ``i`` is the one-hot vector of the word's ``i``-th character.

        Characters that are not in ``DICT`` leave their row at zero; this
        includes upper-case letters, because the alphabet only lists
        lower-case ones.  Characters beyond ``self.max_word_length`` are
        dropped.

        Args:
            sentence: the text to encode.

        Returns:
            A tuple ``(encoded, n_words)``.  ``encoded`` is an array of shape
            ``(n_words, max_word_length, ALPHABET_SIZE)`` and ``n_words`` is
            the number of tokens, which the caller needs in order to pad the
            sentences of a minibatch to a common length.
        """
        max_word_length = self.max_word_length

        # Drop every character that is not printable ASCII.  The result is
        # joined back into a string because filter() yields an iterator on
        # Python 3, which unidecode() cannot consume.
        printable = set(string.printable)
        cleaned = ''.join(ch for ch in sentence if ch in printable)

        sent = []
        for word in wt(unidecode(cleaned)):
            # One word => matrix of shape [max_word_length x ALPHABET_SIZE]
            word_encoding = np.zeros(shape=(max_word_length, ALPHABET_SIZE))

            # Over-long words are truncated explicitly instead of relying on a
            # swallowed IndexError.
            for i, char in enumerate(word[:max_word_length]):
                char_index = DICT.get(char)
                if char_index is not None:
                    word_encoding[i, char_index] = 1

            sent.append(word_encoding)

        if not sent:
            # Keep the rank of the result stable for sentences without tokens
            return np.zeros(shape=(0, max_word_length, ALPHABET_SIZE)), 0

        return np.array(sent), len(sent)
    
    def make_minibatch(self, sentences):
        """Turn raw dataset lines into a zero-padded minibatch.

        Every element of ``sentences`` is a text line whose first character is
        the sentiment label and whose text is taken from ``line[2:-1]``, i.e.
        without the label, the separator after it and the final character of
        the line.

        Args:
            sentences: sequence of raw lines.

        Returns:
            A tuple ``(minibatch_x, minibatch_y)``.  ``minibatch_x`` is a
            float32 array of shape ``(batch, longest_sentence,
            max_word_length, ALPHABET_SIZE)`` in which shorter sentences are
            padded with zeros.  ``minibatch_y`` holds one row per line:
            ``[0, 1]`` when the label character is ``'0'``, ``[1, 0]``
            otherwise.
        """
        max_word_length = self.max_word_length

        def numpy_fillna(data):
            """Stack variable-length encoded sentences into one zero-padded array."""
            lens = [len(sent) for sent in data]
            longest = max(lens) if lens else 0

            out = np.zeros(shape=(len(data), longest, max_word_length, ALPHABET_SIZE),
                           dtype='float32')

            # Copy each sentence into the front of its row; the rest of the
            # row stays zero.  Sentences without any word are skipped, so they
            # cannot break the stacking.
            for row, (sent, n_words) in enumerate(zip(data, lens)):
                if n_words:
                    out[row, :n_words] = sent
            return out

        minibatch_x = []
        minibatch_y = []

        for sentence in sentences:
            # One-hot encoding of the sentiment label
            minibatch_y.append(np.array([0, 1]) if sentence[:1] == '0' else np.array([1, 0]))

            # One-hot encoding of the sentence; the word count is not needed
            # here because the padding helper measures the sentences itself
            one_hot, _ = self.encode_one_hot(sentence[2:-1])
            minibatch_x.append(one_hot)

        # Padding...
        return numpy_fillna(minibatch_x), np.array(minibatch_y)

In [None]:
    def load_to_ram(self, batch_size):
        """Read the next ``batch_size`` lines from ``self.file`` into ``self.data``.

        Args:
            batch_size: number of lines to read.

        Returns:
            True when a full batch was read.  False when ``self.file`` ran out
            of lines first (the epoch is over) or when ``batch_size`` is
            negative; ``self.data`` then holds whatever was read before the
            end was reached.
        """
        self.data = []
        for _ in range(batch_size):
            try:
                self.data.append(next(self.file))
            except StopIteration:
                # End of the file: report it to the caller instead of letting
                # StopIteration escape, which would turn into a RuntimeError
                # inside the generator that calls this method (PEP 479).
                return False
        return batch_size >= 0

    def iterate_minibatch(self, batch_size, dataset=TRAIN_SET):
        """Yield ``(inputs, targets)`` minibatches for one epoch.

        Args:
            batch_size: number of lines per minibatch.
            dataset: one of TRAIN_SET, VALID_SET or TEST_SET; it selects the
                hard-coded number of samples used to compute how many batches
                make up an epoch.

        Yields:
            The ``(inputs, targets)`` pair built by ``make_minibatch`` from
            the lines that ``load_to_ram`` placed in ``self.data``.

        Raises:
            ValueError: if ``dataset`` is not one of the three known datasets.
        """
        # The dataset sizes are hard-coded: 1.6M lines split 95% / 5% between
        # training and validation, and 498 lines in the test set.
        if dataset == TRAIN_SET:
            n_samples = 1600000 * 0.95
        elif dataset == VALID_SET:
            n_samples = 1600000 * 0.05
        elif dataset == TEST_SET:
            n_samples = 498
        else:
            raise ValueError('Unknown dataset: %r' % (dataset,))

        # Number of batches / number of iterations per epoch
        n_batch = int(n_samples // batch_size)

        for _ in range(n_batch):
            # The loader may signal the end of the data either by returning
            # False or by letting StopIteration escape; the latter must not
            # leak out of a generator body (PEP 479).
            try:
                has_batch = self.load_to_ram(batch_size)
            except StopIteration:
                has_batch = False

            if not has_batch:
                # Nothing left to read: end the epoch instead of polling again
                return

            yield self.make_minibatch(self.data)

## Start of operational CNN

In [None]:
def conv2d(input_, output_dim, k_h, k_w, name="conv2d"):
    """Plain convolutional layer: stride 1, VALID padding, additive bias.

    Args:
        input_: input tensor; its last dimension is used as the number of
            input channels of the kernel.
        output_dim: number of output channels.
        k_h: kernel height.
        k_w: kernel width.
        name: variable scope under which the kernel 'w' and the bias 'b' are
            created.

    Returns:
        The convolution of ``input_`` with the kernel, plus the bias.
    """
    in_channels = input_.get_shape()[-1]

    with tf.variable_scope(name):
        kernel = tf.get_variable('w', [k_h, k_w, in_channels, output_dim])
        bias = tf.get_variable('b', [output_dim])

    convolved = tf.nn.conv2d(input_, kernel, strides=[1, 1, 1, 1], padding='VALID')
    return convolved + bias

def tdnn(input_, kernels, kernel_features, scope='TDNN', max_word_length=None):
    ''' Time Delay Neural Network
    :input:           input float tensor whose last two dimensions are
                      [max_word_length x embed_size]; all leading dimensions
                      (batch, sentence length) are folded together
    :kernels:         array of kernel sizes
    :kernel_features: array of kernel feature sizes (parallel to kernels)
    :scope:           variable scope holding one convolution per kernel size
    :max_word_length: number of characters per word; when omitted it is read
                      from the static shape of input_ (second to last
                      dimension)
    :returns:         tensor with one row per word, the max-pooled features
                      of all kernels concatenated along axis 1
    :raises:          ValueError if max_word_length is neither given nor
                      known from the static shape of input_
    '''
    assert len(kernels) == len(kernel_features), 'Kernel and Features must have the same size'

    # This is a module-level function, so there is no `self` to read the word
    # length from: take it from the argument or from the input's static shape.
    if max_word_length is None:
        max_word_length = input_.get_shape().as_list()[-2]
        if max_word_length is None:
            raise ValueError('max_word_length is not known from the shape of input_, '
                             'pass it explicitly')

    # input_ has shape ('b', 'sentence_length', 'max_word_length', 'embed_size'); we
    # need to convert it to shape ('b * sentence_length', 1, 'max_word_length', 'embed_size') to
    # use conv2D
    # It might not seem obvious why we need this small hack at first sight: the reason
    # is that sentence_length changes across minibatches, and if we kept it as is,
    # sentence_length would act as the number of channels in the convnet, which NEEDS to
    # stay the same
    input_ = tf.reshape(input_, [-1, max_word_length, ALPHABET_SIZE])
    input_ = tf.expand_dims(input_, 1)

    layers = []
    with tf.variable_scope(scope):
        for kernel_size, kernel_feature_size in zip(kernels, kernel_features):
            # Width left after a VALID convolution with this kernel
            reduced_length = max_word_length - kernel_size + 1

            # [batch_size * sentence_length x 1 x reduced_length x kernel_feature_size]
            conv = conv2d(input_, kernel_feature_size, 1,
                          kernel_size, name="kernel_%d" % kernel_size)

            # Max over the whole remaining width:
            # [batch_size * sentence_length x 1 x 1 x kernel_feature_size]
            pool = tf.nn.max_pool(tf.tanh(conv), [1, 1, reduced_length, 1], [1, 1, 1, 1], 'VALID')

            # Drop the two singleton dimensions
            layers.append(tf.squeeze(pool, [1, 2]))

        if len(kernels) > 1:
            output = tf.concat(layers, 1)
        else:
            output = layers[0]

    return output

def linear(input_, output_size, scope=None):
    """
    Linear map: output[k] = sum_i(Matrix[k, i] * input_[i]) + Bias[k]

    Args:
        input_: a 2D Tensor of shape [batch x n] with a known second dimension.
        output_size: int, number of output units.
        scope: variable scope for the created variables; defaults to
            "SimpleLinear".

    Returns:
        A 2D Tensor with shape [batch x output_size], the product of input_
        with the transposed "Matrix" variable plus the "Bias" variable.

    Raises:
        ValueError: if input_ is not 2D or its second dimension is unknown.
    """

    dims = input_.get_shape().as_list()
    if len(dims) != 2:
        raise ValueError("Linear is expecting 2D arguments: %s" % str(dims))

    input_size = dims[1]
    if not input_size:
        raise ValueError("Linear expects shape[1] of arguments: %s" % str(dims))

    # The weights are stored as [output_size x input_size] and transposed
    # for the multiplication below.
    with tf.variable_scope(scope or "SimpleLinear"):
        weights = tf.get_variable("Matrix", [output_size, input_size],
                                  dtype=input_.dtype)
        offset = tf.get_variable("Bias", [output_size], dtype=input_.dtype)

    projected = tf.matmul(input_, tf.transpose(weights))
    return projected + offset

# Improvements of the model

def highway(input_, size, num_layers=1, bias=-2.0, f=tf.nn.relu, scope='Highway'):
    """Highway Network (cf. http://arxiv.org/abs/1505.00387).
    t = sigmoid(Wy + b)
    z = t * g(Wy + b) + (1 - t) * y
    where g is nonlinearity, t is transform gate, and (1 - t) is carry gate.

    Args:
        input_: 2D input tensor (it is passed to ``linear``).
        size: number of units of each layer; the carry term adds ``input_``
            to the transformed value element-wise, so it is expected to match
            the width of ``input_``.
        num_layers: number of stacked highway layers.
        bias: constant added to the gate pre-activation.
        f: the nonlinearity g.
        scope: variable scope holding the variables of all layers.

    Returns:
        The output of the last highway layer, or ``input_`` unchanged when
        ``num_layers`` is 0.
    """

    # Start from the input so that zero layers is a well-defined identity
    # instead of an unbound local variable.
    output = input_

    with tf.variable_scope(scope):
        for idx in range(num_layers):
            g = f(linear(input_, size, scope='highway_lin_%d' % idx))

            t = tf.sigmoid(linear(input_, size, scope='highway_gate_%d' % idx) + bias)

            output = t * g + (1. - t) * input_
            # The next layer consumes the output of this one
            input_ = output

    return output


In [None]:
# Smoke test for the TensorFlow installation: build a constant string op...
hello=tf.constant('Hello,TensorFlow!')

# ...open a session (it is left open after this cell)...
sess=tf.Session()

# ...and evaluate the constant.
print(sess.run(hello))