In [1]:
import tensorflow as tf
import numpy as np
import os
import encoder, model
import json
import pandas as pd
import tqdm
import time
import matplotlib.pyplot as plt
import nltk.data
import scipy.special

In [6]:
# Config
models_dir = '../models'
model_name = '117M'
data_path = '~/Downloads/stanfordSentimentTreebank'
batch_size = 1
seed = 231654
max_len = 140
num_classes = 5

In [3]:
# Prepare model
models_dir = os.path.expanduser(os.path.expandvars(models_dir))
data_path = os.path.expanduser(os.path.expandvars(data_path))
enc = encoder.get_encoder(model_name, models_dir)
hparams = model.default_hparams()
with open(os.path.join(models_dir, model_name, 'hparams.json')) as f:
    hparams.override_from_dict(json.load(f))

In [4]:
# Prepare dataset
dataset_path = os.path.join(data_path, 'datasetSentences.txt')
label_path = os.path.join(data_path, 'sentiment_labels.txt')
split_path = os.path.join(data_path, 'datasetSplit.txt')
dictionary_path = os.path.join(data_path, 'dictionary.txt')
sentences = pd.read_csv(dataset_path, sep='\t')
labels = pd.read_csv(label_path, sep='|')
splits = pd.read_csv(split_path, sep=',')
dictionary = {}
with open(dictionary_path) as fin:
    for line in fin.readlines():
        tokens = line.strip().split('|')
        dictionary[tokens[0]] = tokens[1]
sentences['phrase ids'] = sentences.apply(lambda x: int(dictionary.get(x['sentence'], 0)), axis=1)
dataset = sentences.merge(labels).merge(splits)

def score_sentiment(sentiment_score):
    if sentiment_score > 0.8:
        return 4
    elif sentiment_score > 0.6:
        return 3
    elif sentiment_score > 0.4:
        return 2
    elif sentiment_score > 0.2:
        return 1
    else:
        return 0

dataset['sentiment label'] = dataset.apply(lambda x: score_sentiment(x['sentiment values']), axis=1)

In [7]:
# Train model

def get_batch_generator(splitset_label, batch_size):
    def pad(encoding):
        padlen = max_len-len(encoding)
        encoding.extend([220]*padlen)
        return encoding
    
    
    while True:
        dataset_ = dataset[dataset['splitset_label']==splitset_label]
        random_subset = dataset_.sample(batch_size)
        X = random_subset['sentence'].apply(lambda x: pad(enc.encode(x)[:40])).tolist()
        y = random_subset['sentiment label'].to_numpy()
        yield X, y

def get_ordered_batch_generator(splitset_label, batch_size):
    def pad(encoding):
        padlen = max_len-len(encoding)
        encoding.extend([220]*padlen)
        return encoding
    dataset_ = dataset[dataset['splitset_label']==splitset_label]
    for i in range(0, dataset_.shape[0]-batch_size, batch_size):
        batch_ = dataset.iloc[i:i+batch_size]
        X = batch_['sentence'].apply(lambda x: pad(enc.encode(x)[:40])).tolist()
        y = batch_['sentiment label'].to_numpy()
        yield X, y
        
with tf.Session(graph=tf.Graph()) as sess:
    context = tf.placeholder(tf.int32, [batch_size, max_len])
    y = tf.placeholder(tf.int32, [batch_size])
    np.random.seed(seed)
    tf.set_random_seed(seed)
    
    # Model
    classifier = model.model(hparams=hparams, X=context, past=None, reuse=tf.AUTO_REUSE)
    
    with tf.variable_scope('classification_head', reuse=tf.AUTO_REUSE):
        logits = classifier['logits']
        dropout = tf.nn.dropout(logits, keep_prob=0.9)
        fc_2 = tf.layers.dense(dropout, 5)
        avg_logits = tf.math.reduce_mean(fc_2, axis=1)
        classifier['avg_logits'] = avg_logits
    
    # Cost
    cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=classifier['avg_logits']))
    
    # Optimizer
    optimizer = tf.train.AdamOptimizer().minimize(cost)
    
    # Accuracy
    accuracy = tf.reduce_mean(tf.cast(tf.equal(
            tf.argmax(classifier['avg_logits'], 1, output_type = tf.int32), y
        ), tf.float32))
    
    init_all = tf.initializers.global_variables()
    sess.run(init_all)
    
    saver = tf.train.Saver([v for v in tf.trainable_variables() if 'model' in v.name])
    ckpt = tf.train.latest_checkpoint(os.path.join(models_dir, model_name))
    saver.restore(sess, ckpt)
    
    # Train
    start_time = time.time()
    random_batch_generator = get_batch_generator(1, batch_size)
    for i in range(1):
        X, Y = next(random_batch_generator)
        print ('X', X)
        print ('Y', Y)
        acc, cst, _ = sess.run([accuracy, cost, optimizer],feed_dict={context: X, y: Y})
        if(i % 100 == 0):
            print('iteration', i, 'accuracy', acc, 'cost', cst, 'running time', int(time.time()-start_time))
            
    # Test
    run_test = False
    if run_test:
        ordered_batch_generator = get_ordered_batch_generator(2, batch_size)
        accuracies = []
        costs = []
        while True:
            try:
                X, Y = next(ordered_batch_generator)
                acc, cst = sess.run([accuracy, cost], feed_dict={context: X, y: Y})
                accuracies.append(acc)
                costs.append(cst)
            except:
                break
        print("accuracy", np.mean(np.array(accuracies)))
        print("cost", np.mean(np.array(costs)))

    # Save model
    #saver = tf.train.Saver()
    #saver.save(sess, os.path.join(models_dir, model_name+"sa_140"))
    
    
    
    


X [[3646, 592, 588, 257, 2089, 13516, 286, 281, 625, 380, 431, 4471, 286, 3195, 705, 82, 32801, 705, 82, 13509, 290, 257, 32099, 290, 13526, 276, 12, 2902, 2196, 286, 5896, 8362, 764, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220]]
Y [0]
iteration 0 accuracy 0.0 cost 191.32114 running time 3


In [None]:
# Inference
def get_encoded_sentence(text):
    def pad(encoding):
        padlen = max_len-len(encoding)
        encoding.extend([220]*padlen)
        return encoding
    X = []
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    for sentence in tokenizer.tokenize(text):
        encoding = enc.encode(sentence)
        X.append(pad(enc.encode(sentence)[:max_len])[:max_len])
    
    return X



with tf.Session() as sess:
    context = tf.placeholder(tf.int32, [1, max_len])
    # Model
    classifier = model.model(hparams=hparams, X=context, past=None, reuse=tf.AUTO_REUSE)
    saver = tf.train.Saver()
    saver.restore(sess, os.path.join(models_dir, model_name+"sa_140"))
    logits = classifier['avg_logits']
    scores = logits
    inp = get_encoded_sentence("I doubt this company will go anywhere but bankrupt.")
    tot_score = np.zeros(5)
    for sentence in inp:
        sentence_score = sess.run([scores],feed_dict={context: [sentence]})[0].reshape(-1)
        tot_score += sentence_score
        
    tot_score = scipy.special.softmax(tot_score)
    xs = np.arange(5)
    fig, ax = plt.subplots()
    plt.bar(xs, tot_score)
    plt.xticks(xs, ('Very Negative', 'Negative', 'Neutral', 'Positive', 'Very Positive'))
    plt.show()
    


In [None]:
len("I think adoption will be slow, and I think Tesla is the least efficient manufacturer and will get buried in the next few years.")