# Perception Difference Among NLP Models



In [1]:
import sys
!{sys.executable} -m pip install dynet



In [2]:
from collections import defaultdict
import time
import random

import dynet_config
import dynet as dy

import numpy as np

In [3]:
# Corpus-reading helpers.
# Both vocabularies are defaultdicts that hand out the next free integer id
# the first time a key is looked up.
w2i = defaultdict(lambda: len(w2i))  # word -> index
t2i = defaultdict(lambda: len(t2i))  # tag  -> index
UNK = w2i["<unk>"]  # reserve an index for the unknown-word token
def read_dataset(filename):
    """Lazily yield ``(word_ids, tag_id)`` pairs from a ``tag ||| sentence`` file.

    Every line is lower-cased and stripped, then split on ``" ||| "`` into a
    tag and a space-separated sentence. Words and tags are converted to
    integer ids through the module-level ``w2i`` / ``t2i`` mappings.
    """
    with open(filename, "r") as corpus:
        for raw_line in corpus:
            label, sentence = raw_line.lower().strip().split(" ||| ")
            word_ids = [w2i[token] for token in sentence.split(" ")]
            yield (word_ids, t2i[label])

In [4]:
# Read in the data
train_set = list(read_dataset("dist_semantics_lecture_movie_review_data/train.txt"))
# Freeze the vocabulary: from here on, a word that was not seen while reading
# the training file is given the UNK index instead of a fresh one.
w2i = defaultdict(lambda: UNK, w2i)
# Note that dev_set is read from test.txt.
dev_set = list(read_dataset("dist_semantics_lecture_movie_review_data/test.txt"))
# NOTE(review): a defaultdict stores a key the first time it is looked up, so
# unseen words from test.txt are now entries of w2i (all mapped to UNK) and are
# included in nwords -- confirm that this over-count is intended.
nwords = len(w2i)
ntags = len(t2i)

In [5]:
nwords, ntags

(18648, 5)

In [6]:
len(train_set), len(dev_set)

(8544, 2210)

In [7]:
nwords

18648

In [8]:
len(w2i),w2i # words to indices

(18648,
 defaultdict(<function __main__.<lambda>()>,
             {'<unk>': 0,
              'the': 1,
              'rock': 2,
              'is': 3,
              'destined': 4,
              'to': 5,
              'be': 6,
              '21st': 7,
              'century': 8,
              "'s": 9,
              'new': 10,
              '``': 11,
              'conan': 12,
              "''": 13,
              'and': 14,
              'that': 15,
              'he': 16,
              'going': 17,
              'make': 18,
              'a': 19,
              'splash': 20,
              'even': 21,
              'greater': 22,
              'than': 23,
              'arnold': 24,
              'schwarzenegger': 25,
              ',': 26,
              'jean-claud': 27,
              'van': 28,
              'damme': 29,
              'or': 30,
              'steven': 31,
              'segal': 32,
              '.': 33,
              'gorgeously': 34,
              'elaborate': 35,
  

In [9]:
t2i # tags to indices

defaultdict(<function __main__.<lambda>()>,
            {'3': 0, '4': 1, '2': 2, '1': 3, '0': 4})

In [10]:
# Reverse vocabulary: index -> word.
i2w = dict((index, word) for word, index in w2i.items())
# Index 0 is the unknown-word id; pin its label explicitly, because more than
# one entry of w2i may share that index.
i2w[0] = "<unk>"
i2w

{0: '<unk>',
 1: 'the',
 2: 'rock',
 3: 'is',
 4: 'destined',
 5: 'to',
 6: 'be',
 7: '21st',
 8: 'century',
 9: "'s",
 10: 'new',
 11: '``',
 12: 'conan',
 13: "''",
 14: 'and',
 15: 'that',
 16: 'he',
 17: 'going',
 18: 'make',
 19: 'a',
 20: 'splash',
 21: 'even',
 22: 'greater',
 23: 'than',
 24: 'arnold',
 25: 'schwarzenegger',
 26: ',',
 27: 'jean-claud',
 28: 'van',
 29: 'damme',
 30: 'or',
 31: 'steven',
 32: 'segal',
 33: '.',
 34: 'gorgeously',
 35: 'elaborate',
 36: 'continuation',
 37: 'of',
 38: 'lord',
 39: 'rings',
 40: 'trilogy',
 41: 'so',
 42: 'huge',
 43: 'column',
 44: 'words',
 45: 'can',
 46: 'not',
 47: 'adequately',
 48: 'describe',
 49: 'co-writer\\/director',
 50: 'peter',
 51: 'jackson',
 52: 'expanded',
 53: 'vision',
 54: 'j.r.r.',
 55: 'tolkien',
 56: 'middle-earth',
 57: 'singer\\/composer',
 58: 'bryan',
 59: 'adams',
 60: 'contributes',
 61: 'slew',
 62: 'songs',
 63: '--',
 64: 'few',
 65: 'potential',
 66: 'hits',
 67: 'more',
 68: 'simply',
 69: 'int

In [11]:
# Reverse tag mapping: index -> tag string.
i2t = dict((index, tag) for tag, index in t2i.items())
i2t

{0: '3', 1: '4', 2: '2', 3: '1', 4: '0'}

In [12]:
def decode_sample(sample):
    """Turn an encoded ``(word_ids, tag_id)`` sample back into text.

    Returns a ``(words, tag)`` pair, where ``words`` is a list of strings and
    ``tag`` is the tag string. Word ids missing from ``i2w`` are rendered as
    ``"<unk>"``.
    """
    word_ids, tag_id = sample[0], sample[1]
    words = [i2w.get(word_id, "<unk>") for word_id in word_ids]
    return words, i2t[tag_id]

In [13]:
def get_num_tags(dev):
    """Count how many samples carry each of the rating tags '0'..'4'.

    Returns the tuple ``(zeroes, ones, twos, threes, fours)``. If the counts
    do not add up to ``len(dev)`` (i.e. some sample has a tag outside
    '0'..'4'), ``0`` is returned instead.
    """
    counts = {'0': 0, '1': 0, '2': 0, '3': 0, '4': 0}

    for sample in dev:
        _, tag = decode_sample(sample)
        if tag in counts:
            counts[tag] += 1

    if sum(counts.values()) != len(dev):
        return 0
    return counts['0'], counts['1'], counts['2'], counts['3'], counts['4']

In [14]:
def calc_scores(words, W_sm, b_sm):
    """Build the bag-of-words score expression for one sentence.

    Args:
        words: iterable of word indices for the sentence.
        W_sm: DyNet lookup parameters holding one score vector per word.
        b_sm: DyNet bias parameters added to the summed word vectors.

    Returns:
        A DyNet expression: the sum of the looked-up word vectors plus the bias.

    Note:
        A fresh computation graph is started on every call via
        ``dy.renew_cg()``, so only the most recently returned expression
        should be used.
    """
    dy.renew_cg()
    score = dy.esum([dy.lookup(W_sm, x) for x in words])
    # The parameter is used directly: dy.parameter(...) is deprecated and
    # prints a warning, because used parameters are added to the graph
    # automatically.
    return score + b_sm

In [15]:
def correct_and_predicted(dev, W_sm, b_sm):
    """Collect gold and predicted tag strings for every sample in ``dev``.

    For each sample the scores are computed with ``calc_scores`` and the
    highest-scoring tag index is mapped back to its tag string via ``i2t``.

    Returns:
        ``(correct, predicted)``: two lists of tag strings, aligned by position.
    """
    gold_tags = []
    predicted_tags = []

    for sample in dev:
        _, gold = decode_sample(sample)
        best_index = np.argmax(calc_scores(sample[0], W_sm, b_sm).npvalue())

        gold_tags.append(gold)
        predicted_tags.append(i2t[best_index])

    return gold_tags, predicted_tags


The distance metrics below compare predicted and correct labels as integers, so they only make sense when the labels are ordinal ratings, as in this classification task; other datasets would need different measures.

In [16]:
def statistics(correct_list, predicted_list):
    """Summarise how far the predicted ratings are from the correct ratings.

    Labels are converted with ``int()`` and compared numerically, with
    ``distance = predicted - correct`` (positive means the prediction was
    higher than the correct rating).

    Args:
        correct_list: sequence of correct ratings (ints or numeric strings).
        predicted_list: sequence of predicted ratings, aligned with
            ``correct_list``.

    Returns:
        A 6-tuple:
        ``(average_absolute_distance, average_distance,
        average_wrong_positive, average_wrong_negative,
        percent_correct, percent_wrong)``.
        ``average_wrong_positive`` / ``average_wrong_negative`` are the mean
        signed distance over the predictions that were too high / too low;
        each is ``0.0`` when there are no such predictions.

    Raises:
        ValueError: if ``correct_list`` is empty (no averages can be computed).
    """
    num_values = len(correct_list)
    if num_values == 0:
        raise ValueError("statistics() needs at least one prediction")

    absolute_distance = 0
    total_distance = 0

    pos_overattribution = 0
    neg_overattribution = 0

    num_correct = 0
    num_wrong_positive = 0
    num_wrong_negative = 0

    for i in range(num_values):
        distance = int(predicted_list[i]) - int(correct_list[i])
        absolute_distance += abs(distance)
        total_distance += distance

        if distance > 0:
            pos_overattribution += distance
            num_wrong_positive += 1
        elif distance < 0:
            neg_overattribution += distance
            num_wrong_negative += 1
        else:
            num_correct += 1

    num_wrong = num_wrong_positive + num_wrong_negative

    average_absolute_distance = absolute_distance / num_values
    average_distance = total_distance / num_values
    # Guard the per-direction averages: a model that is never too high (or
    # never too low) would otherwise trigger a division by zero.
    average_wrong_positive = pos_overattribution / num_wrong_positive if num_wrong_positive else 0.0
    average_wrong_negative = neg_overattribution / num_wrong_negative if num_wrong_negative else 0.0
    percent_correct = (num_correct / num_values)*100
    percent_wrong = (num_wrong / num_values)*100

    print("Average distance away from correct rating across all values, incorrect and correct: ", average_absolute_distance)
    print("Average distance away overall among all predictions: ", average_distance)
    print("When wrong, the model was overly positive by: ", average_wrong_positive)
    print("When wrong, the model was overly negative by: ", average_wrong_negative)
    print("Percent correct: ", percent_correct)
    print("Percent wrong: ", percent_wrong)

    return average_absolute_distance, average_distance, average_wrong_positive, average_wrong_negative, percent_correct, percent_wrong

In [52]:
def mean_of_skew(skewed_data):
    """Return the mean rating of a list of ``(word_ids, tag_id)`` samples.

    Each tag id is mapped back to its tag string via ``i2t`` and converted to
    an integer before averaging.
    """
    ratings = [int(i2t[review[1]]) for review in skewed_data]
    return sum(ratings) / len(skewed_data)

In [None]:
def median_of_skew(skewed_data):
    """Return the median rating of a list of ``(word_ids, tag_id)`` samples.

    Each tag id is mapped back to its tag string via ``i2t`` and converted to
    an integer. For an even number of samples the median is the average of
    the two middle ratings.

    Raises:
        statistics.StatisticsError: if ``skewed_data`` is empty.
    """
    # Imported locally under its own name: this notebook defines a function
    # called `statistics`, which hides the standard-library module globally.
    from statistics import median

    ratings = [int(i2t[review[1]]) for review in skewed_data]
    return median(ratings)

In [17]:
# Split the training samples into one list per rating tag ('0'..'4').
ends_in_0, ends_in_1, ends_in_2, ends_in_3, ends_in_4 = [
    [sample for sample in train_set if i2t[sample[1]] == rating]
    for rating in ('0', '1', '2', '3', '4')
]

In [18]:
# Sanity check: the per-rating lists together should cover the whole training set.
back_together = [*ends_in_0, *ends_in_1, *ends_in_2, *ends_in_3, *ends_in_4]
len(back_together)

8544

In [19]:
# Split the dev samples into one list per rating tag ('0'..'4').
ends_in_0_dev, ends_in_1_dev, ends_in_2_dev, ends_in_3_dev, ends_in_4_dev = [
    [sample for sample in dev_set if i2t[sample[1]] == rating]
    for rating in ('0', '1', '2', '3', '4')
]

In [20]:
# Sanity check: the per-rating lists together should cover the whole dev set.
back_together_dev = [*ends_in_0_dev, *ends_in_1_dev, *ends_in_2_dev, *ends_in_3_dev, *ends_in_4_dev]
len(back_together_dev)

2210

In [21]:
words, tag = decode_sample(train_set[10])
" ".join(words), tag

('good fun , good action , good acting , good dialogue , good pace , good cinematography .',
 '4')

In [22]:
get_num_tags(train_set)

(1092, 2218, 1624, 2322, 1288)

In [23]:
get_num_tags(dev_set)

(279, 633, 389, 510, 399)

In [24]:
len(ends_in_0), len(ends_in_1), len(ends_in_2), len(ends_in_3), len(ends_in_4)

(1092, 2218, 1624, 2322, 1288)

In [25]:
len(ends_in_0_dev), len(ends_in_1_dev), len(ends_in_2_dev), len(ends_in_3_dev), len(ends_in_4_dev)

(279, 633, 389, 510, 399)

In [26]:
def run_train_function(train, dev, learning_rates, num_epochs):
    """Train and evaluate one model per (num_epoch, learning_rate) combination.

    For every combination a new DyNet model is created (a lookup table of
    per-word score vectors plus a bias), trained with Adam on ``train`` using
    the negative log-softmax loss, and finally scored on ``dev`` with
    ``correct_and_predicted`` and ``statistics``. Progress (training loss and
    dev accuracy) is printed roughly ten times per run.

    Args:
        train: list of ``(word_ids, tag_id)`` training samples. The list is
            copied before shuffling, so the caller's list keeps its order.
        dev: list of ``(word_ids, tag_id)`` evaluation samples.
        learning_rates: iterable of learning rates to try.
        num_epochs: iterable of epoch counts to try.

    Returns:
        dict mapping ``(learning_rate, num_epoch)`` to the 6-tuple returned by
        ``statistics`` for that run.
    """
    statistics_per_run = {}

    # Shuffle a private copy so the caller's list is not reordered.
    train = list(train)

    for num_epoch in num_epochs:
        # Report about ten times per run, but at least every epoch; the lower
        # bound also avoids a modulo-by-zero when num_epoch < 10.
        report_every = max(1, num_epoch // 10)

        for learning_rate in learning_rates:
            model = dy.Model()
            trainer = dy.AdamTrainer(model, learning_rate)

            W_sm = model.add_lookup_parameters((nwords, ntags)) # Word weights
            b_sm = model.add_parameters((ntags))                # Softmax bias

            # NOTE(review): dynet is already imported when this runs; DyNet's
            # docs describe dynet_config as something to set before
            # `import dynet`, so this seed may have no effect -- confirm.
            dynet_config.set(random_seed=0)
            dy.renew_cg()

            for ITER in range(num_epoch):
                should_report = ITER % report_every == 0

                # Perform training
                random.shuffle(train)
                train_loss = 0.0
                start = time.time()
                for words, tag in train:
                    my_loss = dy.pickneglogsoftmax(calc_scores(words, W_sm, b_sm), tag) # negative softmax log likelihood
                    train_loss += my_loss.value()
                    my_loss.backward()
                    trainer.update()

                if should_report:
                    print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss/len(train), time.time()-start))
                    # Perform testing -- only on reporting epochs, since the
                    # accuracy is not used for anything but this printout.
                    test_correct = 0.0
                    for words, tag in dev:
                        scores = calc_scores(words, W_sm, b_sm).npvalue()
                        predict = np.argmax(scores)
                        if predict == tag:
                            test_correct += 1
                    print("iter %r: test acc=%.4f" % (ITER, test_correct/len(dev)))

            correct, predicted = correct_and_predicted(dev, W_sm, b_sm)
            statistics_per_run[(learning_rate, num_epoch)] = statistics(correct, predicted)

    return statistics_per_run


Best run: a learning rate of 0.0005 with 100 epochs.

In [27]:
# Hyper-parameter grids passed to run_train_function: every combination of
# learning rate and epoch count is trained once.
# Alternative, wider grids (commented out):
# learning_rates = [.0001, .0005, .001, .0015]
# num_epochs = [50, 100, 150, 200]

# learning_rates = [.0005, .001]
learning_rates = [.0005]
num_epochs = [100]

In [28]:
run_train_function(train_set, dev_set, learning_rates, num_epochs)

The dy.parameter(...) call is now DEPRECATED.
        There is no longer need to explicitly add parameters to the computation graph.
        Any used parameter will be added automatically.
iter 0: train loss/sent=1.5327, time=0.31s
iter 0: test acc=0.3271
iter 10: train loss/sent=1.0750, time=0.30s
iter 10: test acc=0.4018
iter 20: train loss/sent=0.9040, time=0.32s
iter 20: test acc=0.4104
iter 30: train loss/sent=0.7984, time=0.30s
iter 30: test acc=0.4149
iter 40: train loss/sent=0.7238, time=0.28s
iter 40: test acc=0.4290
iter 50: train loss/sent=0.6677, time=0.31s
iter 50: test acc=0.4122
iter 60: train loss/sent=0.6227, time=0.27s
iter 60: test acc=0.4140
iter 70: train loss/sent=0.5852, time=0.30s
iter 70: test acc=0.4050
iter 80: train loss/sent=0.5537, time=0.30s
iter 80: test acc=0.4104
iter 90: train loss/sent=0.5261, time=0.30s
iter 90: test acc=0.4167
Average distance away from correct rating across all values, incorrect and correct:  0.8257918552036199
Average distance aw

{(0.0005, 100): (0.8257918552036199,
  0.06380090497737556,
  1.4287790697674418,
  -1.3472,
  40.588235294117645,
  59.411764705882355)}

In [29]:
# Training baseline: the first 1000 samples of every rating, ordered by rating.
baseline = [
    sample
    for group in (ends_in_0, ends_in_1, ends_in_2, ends_in_3, ends_in_4)
    for sample in group[:1000]
]

In [30]:
# Dev baseline: the first 250 samples of every rating, ordered by rating.
baseline_dev = [
    sample
    for group in (ends_in_0_dev, ends_in_1_dev, ends_in_2_dev, ends_in_3_dev, ends_in_4_dev)
    for sample in group[:250]
]

In [31]:
def _take_per_rating(n0, n1, n2, n3, n4):
    """Concatenate the first n0..n4 training samples of ratings 0..4, in that order."""
    return ends_in_0[:n0] + ends_in_1[:n1] + ends_in_2[:n2] + ends_in_3[:n3] + ends_in_4[:n4]

# Progressively fewer low-rating samples (more weight on high ratings).
skew_positive = _take_per_rating(800, 850, 900, 950, 1000)
skew_positive2 = _take_per_rating(600, 700, 800, 900, 1000)
skew_positive3 = _take_per_rating(400, 550, 700, 850, 1000)
skew_positive4 = _take_per_rating(200, 400, 600, 800, 1000)
no_0s = _take_per_rating(0, 1000, 1000, 1000, 1000)

# Progressively fewer high-rating samples (more weight on low ratings).
skew_negative = _take_per_rating(1000, 950, 900, 850, 800)
skew_negative2 = _take_per_rating(1000, 900, 800, 700, 600)
skew_negative3 = _take_per_rating(1000, 850, 700, 550, 400)
skew_negative4 = _take_per_rating(1000, 800, 600, 400, 200)
no_4s = _take_per_rating(1000, 1000, 1000, 1000, 0)

# Symmetric around rating 2: progressively fewer samples towards both extremes.
inverted = _take_per_rating(900, 950, 1000, 950, 900)
inverted2 = _take_per_rating(800, 900, 1000, 900, 800)
inverted3 = _take_per_rating(700, 850, 1000, 850, 700)
inverted4 = _take_per_rating(600, 800, 1000, 800, 600)
inverted5 = _take_per_rating(500, 750, 1000, 750, 500)

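Re-run the cells below to reset the lists (`skews` here, and `all_run_statistics` in the training cell further down) before repeating the experiments.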

In [32]:
# All training subsets to evaluate, in the order their results are reported.
skews = [
    baseline,

    skew_positive,
    skew_positive2,
    skew_positive3,
    skew_positive4,
    no_0s,

    skew_negative,
    skew_negative2,
    skew_negative3,
    skew_negative4,
    no_4s,

    inverted,
    inverted2,
    inverted3,
    inverted4,
    inverted5,
]

In [53]:
for skew in skews:
    print(mean_of_skew(skew))

2.0
2.111111111111111
2.25
2.4285714285714284
2.6666666666666665
2.5
1.8888888888888888
1.75
1.5714285714285714
1.3333333333333333
1.5
2.0
2.0
2.0
2.0
2.0


In [45]:
baseline[0][1]

2

In [44]:
baseline[:2]

[([10058,
   152,
   46,
   1410,
   1,
   172,
   14,
   1062,
   1171,
   171,
   285,
   37,
   1,
   361,
   9286,
   433,
   37,
   10059,
   10060,
   9,
   4587,
   33],
  2),
 ([132,
   9,
   1706,
   37,
   3925,
   149,
   26,
   71,
   279,
   2231,
   146,
   15,
   837,
   3108,
   526,
   3926,
   5,
   461,
   3927,
   3928,
   37,
   1,
   754,
   3929,
   33],
  0)]

In [33]:
# Train one model per training subset, always evaluating on baseline_dev;
# results are stored in the same order as `skews`.
all_run_statistics = []

for training_subset in skews:
    all_run_statistics.append(
        run_train_function(training_subset, baseline_dev, learning_rates, num_epochs)
    )

iter 0: train loss/sent=1.5834, time=0.19s
iter 0: test acc=0.3064
iter 10: train loss/sent=1.0656, time=0.17s
iter 10: test acc=0.3784
iter 20: train loss/sent=0.8650, time=0.17s
iter 20: test acc=0.3848
iter 30: train loss/sent=0.7437, time=0.17s
iter 30: test acc=0.3904
iter 40: train loss/sent=0.6601, time=0.17s
iter 40: test acc=0.4024
iter 50: train loss/sent=0.5978, time=0.19s
iter 50: test acc=0.4040
iter 60: train loss/sent=0.5488, time=0.19s
iter 60: test acc=0.4000
iter 70: train loss/sent=0.5089, time=0.19s
iter 70: test acc=0.4032
iter 80: train loss/sent=0.4753, time=0.18s
iter 80: test acc=0.3960
iter 90: train loss/sent=0.4466, time=0.20s
iter 90: test acc=0.4008
Average distance away from correct rating across all values, incorrect and correct:  0.9056
Average distance away overall among all predictions:  -0.0048
When wrong, the model was overly positive by:  1.4699738903394255
When wrong, the model was overly negative by:  -1.5420054200542006
Percent correct:  39.8399

iter 10: train loss/sent=1.0297, time=0.12s
iter 10: test acc=0.3304
iter 20: train loss/sent=0.8169, time=0.12s
iter 20: test acc=0.3384
iter 30: train loss/sent=0.6908, time=0.13s
iter 30: test acc=0.3360
iter 40: train loss/sent=0.6051, time=0.13s
iter 40: test acc=0.3376
iter 50: train loss/sent=0.5420, time=0.13s
iter 50: test acc=0.3416
iter 60: train loss/sent=0.4932, time=0.12s
iter 60: test acc=0.3456
iter 70: train loss/sent=0.4538, time=0.14s
iter 70: test acc=0.3432
iter 80: train loss/sent=0.4209, time=0.13s
iter 80: test acc=0.3488
iter 90: train loss/sent=0.3932, time=0.13s
iter 90: test acc=0.3480
Average distance away from correct rating across all values, incorrect and correct:  1.0512
Average distance away overall among all predictions:  -0.456
When wrong, the model was overly positive by:  1.3726937269372694
When wrong, the model was overly negative by:  -1.7444444444444445
Percent correct:  35.120000000000005
Percent wrong:  64.88000000000001
iter 0: train loss/sen

In [34]:
all_run_statistics

[{(0.0005, 100): (0.9056,
   -0.0048,
   1.4699738903394255,
   -1.5420054200542006,
   39.839999999999996,
   60.160000000000004)},
 {(0.0005, 100): (0.9504,
   0.1536,
   1.571753986332574,
   -1.5045317220543806,
   38.4,
   61.6)},
 {(0.0005, 100): (0.992,
   0.3376,
   1.7028688524590163,
   -1.4250871080139373,
   38.0,
   62.0)},
 {(0.0005, 100): (1.064,
   0.5136,
   1.7638640429338104,
   -1.3870967741935485,
   35.44,
   64.56)},
 {(0.0005, 100): (1.2072,
   0.804,
   1.9045454545454545,
   -1.2989690721649485,
   31.680000000000003,
   68.32000000000001)},
 {(0.0005, 100): (0.9648,
   0.3536,
   1.5344506517690875,
   -1.3840579710144927,
   34.96,
   65.03999999999999)},
 {(0.0005, 100): (0.9184,
   -0.0752,
   1.4398907103825136,
   -1.544776119402985,
   38.56,
   61.44)},
 {(0.0005, 100): (0.984,
   -0.2032,
   1.4144927536231884,
   -1.6525612472160356,
   36.480000000000004,
   63.519999999999996)},
 {(0.0005, 100): (1.0512,
   -0.456,
   1.3726937269372694,
   -1.7444