In [1]:
import pandas as pd

# Emotion score columns that accompany every headline.
labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "valence"]

# Load the dataset and keep only the headline text plus the score columns, in that order.
df = pd.read_csv("affectivetext_data.csv").reindex(columns=["headline"] + labels)

# Note: `pd.read_csv` drops the quotation marks that wrap an entire field, and
# doubled double quotes (""_"") inside a field collapse to a single pair ("_").

# df.ndim = 2
# df.shape = (1250, 8)
n = len(df) # = 1250 data points in total

# Preview a hand-picked selection of rows.
df.iloc[[0, 1, 34, 44, 248, 292, 308, 981, 998, 999, 1000, 1001, 1248, 1249]]

Unnamed: 0,headline,anger,disgust,fear,joy,sadness,surprise,valence
0,Mortar assault leaves at least 18 dead,22,2,60,0,64,0,-98
1,Goal delight for Sheva,0,0,0,93,0,38,87
34,The sweet tune of an anniversary,0,0,0,83,0,0,65
44,US Troops Killed In October In Iraq,46,39,41,0,77,0,-79
248,Man admits UK-US terror bomb plot,33,43,61,0,33,0,-55
292,Federer handed tough Aussie draw,0,0,0,4,8,4,4
308,Ganguly handed India squad call-up,4,4,6,31,5,12,6
981,Toyota's Scion parks in Second Life,0,0,0,17,4,18,23
998,Weekly Nielsen will 'Lost' find its way back?,0,0,11,17,9,11,21
999,Brith Airways baggage charges cause confusion,5,13,4,0,14,18,-36


In [2]:
import numpy as np
import torch
import string

# Pull the headline column out of the frame as a plain Python list.
headlines = df["headline"].tolist()

# Number of tab characters in each headline (diagnostic only).
tabs = [h.count('\t') for h in headlines]
# print([i for i, e in enumerate(tabs) if e != 0])

def surr_quotes(s):
    """Return 1 if `s` is wrapped in a pair of single quotes, otherwise 0.

    A string counts as wrapped only when it has at least two characters and both
    its first and last characters are `'`. Empty strings and a lone `'` therefore
    return 0 (there is no opening *and* closing quote to strip).
    """
    return int(len(s) >= 2 and s.startswith("'") and s.endswith("'"))
# Flag headlines that are wrapped in single quotes (1 = wrapped, 0 = not).
quoted = [surr_quotes(headline) for headline in headlines]
# print([i for i, e in enumerate(quoted) if e != 0])

# replace any tab characters with spaces
headlines = [headline.replace('\t',' ') for headline in headlines]
# replace single instance of grave accent (used incorrectly) with single quote
headlines = [headline.replace("`","'") for headline in headlines]
# Strip the wrapping single quotes from every headline flagged in `quoted`
# (the original notes report row 248 as the only such headline). Driving this
# from the flags instead of a fixed row index means a row that is not actually
# quoted never loses its first and last characters.
headlines = [
    headline[1:-1] if is_quoted else headline
    for headline, is_quoted in zip(headlines, quoted)
]
# downcase letters
headlines1 = [headline.lower() for headline in headlines]

# Every distinct character occurring in the lower-cased headlines, in sorted order.
char_list = sorted(set("".join(headlines1)))
# How many distinct characters there are.
num_distinct_chars = len(char_list)
# Length (in characters) of the longest headline.
max_chars = max(len(headline) for headline in headlines1)

print(f"{char_list = }\n{num_distinct_chars = }\n{max_chars = }")

print()

# Turn hyphens into spaces (so hyphenated words survive as separate words), then
# split and re-join on whitespace to collapse any extraneous spaces.
headlines2 = [' '.join(headline.replace('-', ' ').split()) for headline in headlines1]
# Delete every ASCII punctuation character from the headlines.
exclude = string.punctuation
punctuation_remover = str.maketrans('', '', exclude)
headlines2 = [headline.translate(punctuation_remover) for headline in headlines2]
# Distinct characters that remain after punctuation removal, in sorted order.
char_list2 = sorted(set("".join(headlines2)))
# Word lists for each cleaned headline.
headlines_words = [headline.split() for headline in headlines2]
# Vocabulary size across all headlines.
num_distinct_words = len({word for words in headlines_words for word in words})
# Word counts per headline, with their maximum and mean.
words_per_headline = np.array([len(words) for words in headlines_words])
max_words = max(len(words) for words in headlines_words)
mean_words_per_headline = np.mean(words_per_headline)

print(f"{char_list2 = }\n{num_distinct_words = }\n{max_words = }\n{mean_words_per_headline = }")
# Quartiles of the words-per-headline distribution (displayed as the cell output).
tuple(np.quantile(words_per_headline, q) for q in (.25, .5, .75))

char_list = [' ', '!', '"', '$', '%', '&', "'", '(', ')', '+', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|']
num_distinct_chars = 53
max_chars = 88

char_list2 = [' ', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
num_distinct_words = 3438
max_words = 15
mean_words_per_headline = 6.5696


(5.0, 6.0, 8.0)

<pre>
X0 (1) = Index encoding.<br>
    Integer encoding of headlines from the dataset. This does not inherently contain
    any information about the headlines. This will be used with the string kernel so that GloVe
    embeddings may be referenced instead of created.

X1 (88) = Character-level integer encoding.<br>
    Characters encoded are those that appear in the lower-cased headlines: spaces (" "),
    punctuation, digits (0-9), and the lowercase English alphabet (a-z).
    The number of distinct characters encoded in X1 is 53. Each integer encoding has a
    corresponding one-hot encoding: 0 = [1,0,0,...,0], 1 = [0,1,0,...,0], 2 = [0,0,1,...,0],
    and so on. The maximum number of characters in a headline is 88 (including spaces),
    so for all headlines with fewer than the maximum, the encoding is padded with 0s.
        Uses `headlines1`

X2 (15) = Word-level integer encoding.<br>
    This uses the same coding scheme as above, but now there are 3438 distinct words being
    encoded. This coding scheme was done using `keras_preprocessing` `text.hashing_trick`
    with arguments of 5000 for the hash vocabulary size and md5 as the selected hash function.
        Uses `headlines2`

X3 (50) = Averaged GloVe embeddings.<br>
    The embeddings used are the 50-dimensional version. The tokens are generated using the
    `nltk.tokenize` `word_tokenize` method. In this embedding, punctuation was retained because
    embeddings exist for tokenized punctuation.
        Uses `headlines1`

X4 (4) = Doc2Vec embeddings using the distributed memory training algorithm.<br>
        Uses `headlines2`

X5 (4) = Doc2Vec embeddings using the distributed bag of words training algorithm.<br>
        Uses `headlines2`

X = torch.cat((X0,X1,X2,X3,X4,X5),1)

X_i     size    index
0       1       0
1       88      1-88
2       15      89-103
3       50      104-153
4       4       154-157
5       4       158-161
</pre>

In [5]:
# Index encoding: a column vector [0, 1, ..., n-1] identifying each headline by its row.
X0 = torch.arange(n)[:, None]

In [6]:
# Character-level integer encoding: one row per headline, zero-padded to `max_chars` columns.
X1 = torch.zeros(n, max_chars)

# Map each character to its position in the sorted `char_list`.
# NOTE(review): ' ' sorts first in `char_list`, so a space is encoded as 0, the same
# value used for padding; confirm downstream code does not need to tell them apart.
char_encoding = {ch: code for code, ch in enumerate(char_list)}

for i, headline in enumerate(headlines1):
    codes = [char_encoding[ch] for ch in headline]
    X1[i, :len(codes)] = torch.tensor(codes)

print("Created character-level integer encodings")

Created character-level integer encodings


In [18]:
# Scratch check. The bare `char_encoding` expression is evaluated but not displayed,
# because only the last expression of a cell is shown.
char_encoding
# set([len(headline) for headline in headlines_words])
# Recompute the mean number of words per headline in pure Python.
word_counts = [len(words) for words in headlines_words]
sum(word_counts) / len(word_counts)

6.5696

In [17]:
from keras_preprocessing import text

# Word-level integer encoding: one row per headline, zero-padded to `max_words` columns.
X2 = torch.zeros(n, max_words)

hash_vocab_size = 5000

# text.one_hot(s1, 1000), text.hashing_trick(s1,1000,hash_function='md5')
# `one_hot` is a wrapper around `hashing_trick` that uses Python's default `hash` function.
# The 'md5' hash function is stable across runs, while `hash` is not.
# The vocab size determines the number of unique hash values: vocab size minus 1, since
# `0` is a reserved index that won't be assigned to any word.
# https://github.com/keras-team/keras-preprocessing/blob/1.1.2/keras_preprocessing/text.py#L253-L267

for i, headline in enumerate(headlines2):
    word_ids = text.hashing_trick(headline, hash_vocab_size, hash_function='md5')
    # Fill only the leading columns; the remainder of the row stays 0 (padding).
    num_words = len(headlines_words[i])
    X2[i, :num_words] = torch.tensor(word_ids)

print("Created word-level integer encodings")

Created word-level integer encodings


In [6]:
from embedding_loading import EmbeddingLoading
from nltk.tokenize import word_tokenize

glove_dim = 50

# choose glove dimensions from (50, 100, 200, 300) and call in list.
# For example,
# glove_dim = [50, 100, 200, 300]
# [glove50, glove100, glove200, glove300] = EmbeddingLoading.load(glove_dim)

[glove] = EmbeddingLoading.load([glove_dim])
max_tokens = 18 # (determined empirically)

# X3 holds one averaged GloVe vector per headline; `word_level_glove_embs` holds the
# per-token vectors, zero-padded along the token axis up to `max_tokens`.
X3 = torch.zeros(n, glove_dim)
word_level_glove_embs = torch.zeros(n, max_tokens, glove_dim)

for i in range(n):
    tokenization = word_tokenize(headlines1[i])
    # Keep only the tokens that have an embedding (`glove.get` yields None otherwise).
    embs = [emb for emb in map(glove.get, tokenization) if emb is not None]
    if not embs:
        # No token of this headline has an embedding: an empty set of vectors cannot
        # be averaged, so leave the zero rows already present for this headline.
        continue
    embs = torch.tensor(embs)
    word_level_glove_embs[i,:embs.shape[0],:] = embs
    X3[i] = torch.mean(embs, 0)

torch.save(word_level_glove_embs, 'word-level_glove_embeddings.pt')

print('Saved word-level glove embeddings')
print('Created averaged glove embeddings')

Saved word-level glove embeddings
Created averaged glove embeddings


In [7]:
# https://radimrehurek.com/gensim/models/doc2vec.html
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# doc2vec_dim = int(mean_words_per_headline)
# doc2vec_dim = 4
# Embedding width for each of the two Doc2Vec variants.
dm_doc2vec_dim = 4
dbow_doc2vec_dim = 4

# Tag each tokenised headline with its row index so its vector can be looked up by row.
documents = [TaggedDocument(words, [idx]) for idx, words in enumerate(headlines_words)]

# Settings shared by both models (see the gensim documentation linked above):
#   window    - maximum distance between the current and predicted word within a sentence.
#   min_count - words with a total frequency lower than this are ignored.
#   seed      - seed for the random number generator; initial word vectors are seeded with
#               a hash of word + str(seed).
#   workers   - number of worker threads. A fully deterministic run needs workers=1 (no
#               ordering jitter from OS thread scheduling) and, in Python 3, also a fixed
#               PYTHONHASHSEED environment variable between interpreter launches.
_doc2vec_kwargs = dict(window=4, min_count=1, seed=1, workers=1)

# dm=1 selects 'distributed memory' (PV-DM); dm=0 selects distributed bag of words (PV-DBOW).
# vector_size is the dimensionality of the feature vectors.
dm_model = Doc2Vec(documents, dm=1, vector_size=dm_doc2vec_dim, **_doc2vec_kwargs)
dbow_model = Doc2Vec(documents, dm=0, vector_size=dbow_doc2vec_dim, **_doc2vec_kwargs)

# Collect the normalised document vector of every headline, in row order.
dm_vectors = [dm_model.dv.get_vector(i, norm=True) for i in range(n)]
X4 = torch.tensor(np.stack(dm_vectors, axis=0))
print('Created distributed memory Doc2Vec embeddings')

dbow_vectors = [dbow_model.dv.get_vector(i, norm=True) for i in range(n)]
X5 = torch.tensor(np.stack(dbow_vectors, axis=0))
print('Created distributed bag of words Doc2Vec embeddings')

Created distributed memory Doc2Vec embeddings
Created distributed bag of words Doc2Vec embeddings


In [8]:
# use for generating doc2vecs of differing sizes
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Embedding widths to sweep over (3 through 12 inclusive).
doc2vec_dims = range(3, 13)
max_doc2vec_dim = max(doc2vec_dims)

# One slab per swept width, shaped (number of widths, n, largest width). Narrower
# embeddings occupy the leading columns of their slab and the rest stays 0.
# The shapes are derived from `n` and `doc2vec_dims` so they stay in step with the
# dataset size and the swept range.
X4_3_to_12 = torch.zeros(len(doc2vec_dims), n, max_doc2vec_dim)
X5_3_to_12 = torch.zeros(len(doc2vec_dims), n, max_doc2vec_dim)

documents = [TaggedDocument(doc,[i]) for i, doc in enumerate(headlines_words)]

for slab, doc2vec_dim in enumerate(doc2vec_dims):

    dm_model = Doc2Vec(documents, dm=1, vector_size=doc2vec_dim,
                    window=4, min_count=1, seed=1, workers=1)
    dbow_model = Doc2Vec(documents, dm=0, vector_size=doc2vec_dim,
                         window=4, min_count=1, seed=1, workers=1)

    # Normalised document vectors for every headline, stacked in row order.
    X4_3_to_12[slab,:,:doc2vec_dim] = torch.tensor(np.stack([dm_model.dv.get_vector(i, norm=True) for i in range(n)], 0))

    X5_3_to_12[slab,:,:doc2vec_dim] = torch.tensor(np.stack([dbow_model.dv.get_vector(i, norm=True) for i in range(n)], 0))

torch.save(X4_3_to_12, 'X4_3_to_12_tensor')
torch.save(X5_3_to_12, 'X5_3_to_12_tensor')

# 4, 7, 9, 11 are top performers for both models
# size 4 seems to do the best for both models
# (refer to mogp_testing.ipynb)

In [9]:
# Stack all feature groups side by side: one row per headline, columns in X0..X5 order.
X = torch.cat([X0, X1, X2, X3, X4, X5], dim=1)
print(X.shape)

torch.Size([1250, 162])


In [10]:
# Persist the combined feature matrix so other notebooks can load it.
torch.save(obj=X, f='input_tensor.pt')
print("Tensor saved to \'input_tensor.pt\'")

Tensor saved to 'input_tensor.pt'


To be used in notebooks that utilize this one:

```
doc2vec_size = 4
X = torch.load('input_tensor.pt')
sizes = (1,88,15,50,doc2vec_size,doc2vec_size)
_distinct_embs = len(sizes)
_breaks = [sum(sizes[:i]) for i in range(_distinct_embs+1)]
indices = ((_breaks[i],_breaks[i+1]) for i in range(_distinct_embs))
(X0,X1,X2,X3,X4,X5) = (X[:,slice(*i)] for i in indices)

separation = 250
X_train, X_test = X[:separation], X[separation:]
Y_train, Y_test = Y[:separation], Y[separation:]
```


