# Imports

In [1]:
import numpy as np
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.keyedvectors import KeyedVectors
from collections import defaultdict, UserDict
import mygrad as mg
from pathlib import Path
import time
import pickle

from noggin import create_plot
%matplotlib notebook

# Preparing to use GloVe

In [3]:
# Get glove.twitter.27B from https://www.kaggle.com/fullmetal26/glovetwitter27b100dtxt
# and extract it into the ./dat folder before running this cell.

# Slow cell: converting and loading the vectors takes several minutes.
t0 = time.time()
unzipped_folder = "./dat/" # ENTER THE PATH TO THE UNZIPPED `glove.twitter.27B` HERE

glove_txt = unzipped_folder + "glove.twitter.27B.200d.txt"
word2vec_txt = "gensim_glove_vectors_200.txt"

# One-off conversion of the GloVe text file into the word2vec text format;
# skipped when the converted file already sits in the working directory.
if not Path(word2vec_txt).exists():
    # assumes the GloVe archive has already been downloaded and extracted
    glove2word2vec(glove_input_file=glove_txt, word2vec_output_file=word2vec_txt)

# Read the converted text file into a gensim KeyedVectors model.
glove_model = KeyedVectors.load_word2vec_format(word2vec_txt, binary=False)
print("Time elapsed:", time.time() - t0)

  glove2word2vec(glove_input_file= unzipped_folder + "glove.twitter.27B.200d.txt",


Time elapsed: 372.7735695838928


# Preparing Training and Test Data
### (Not needed if model parameters have already been saved)

In [2]:
from preprocess import new_preprocess # preprocess is a .py file we created
import pickle

def _read_labels(npz_path):
    """ Returns the array stored under "arr_0" in the .npz file at `npz_path`, cast to int. """
    with np.load(npz_path) as archive:
        return archive["arr_0"].astype(int)


# Pre-processed tweets plus one integer per split
# (presumably the longest tweet length -- see the note in the next cell).
x_test, test_max = new_preprocess("./dat/test_twitter_data.npz")
x_train, train_max = new_preprocess("./dat/train_twitter_data.npz")

# Sentiment labels matching x_train / x_test.
y_train = _read_labels("./dat/train_twitter_label.npz")
y_test = _read_labels("./dat/test_twitter_label.npz")

In [7]:
print(train_max, test_max)
# NOTE(review): presumably the length (in tokens) of the longest tweet in the training
# and test sets respectively -- confirm against `new_preprocess` in preprocess.py

77 28


In [8]:
x_train[0], y_train[0] # example data point: (tuple of tokens, integer label)

(('i', 'missed', 'new', 'moon', 'trailer'), 0)

# Defining our Model

In [9]:
from mynn.layers.conv import conv
from mynn.layers.dense import dense
from mynn.activations import relu
from mygrad.nnet.layers import max_pool
from mynn.activations import sigmoid
from mynn.initializers import glorot_normal
from mynn.optimizers import Adam

class Model:
    """ 1-D convolutional sentiment classifier over sequences of word embeddings.

    Architecture: conv -> relu -> global max-pool -> dense -> relu -> dense -> sigmoid.
    """
    def __init__(self, embedding_dim=200, num_filters=250, filter_width=3, hidden_units=250):
        """ Initializes model layers and weights.

        Parameters
        ----------
        embedding_dim : int, optional (default=200)
            Size D of each word embedding, i.e. the number of input channels of the convolution.

        num_filters : int, optional (default=250)
            Number F of convolutional filters.

        filter_width : int, optional (default=3)
            Number of consecutive words covered by each filter.

        hidden_units : int, optional (default=250)
            Width D1 of the hidden dense layer.
        """
        # <COGINST>
        init_kwargs = {'gain': np.sqrt(2)}
        self.conv1 = conv(embedding_dim, num_filters, filter_width, stride = 1,
                          weight_initializer = glorot_normal, weight_kwargs = init_kwargs)
        self.dense1 = dense(num_filters, hidden_units,
                            weight_initializer = glorot_normal, weight_kwargs = init_kwargs)
        self.dense2 = dense(hidden_units, 1,
                            weight_initializer = glorot_normal, weight_kwargs = init_kwargs)
        # </COGINST>


    def __call__(self, x):
        """ Forward data through the network.

        This allows us to conveniently initialize a model `m` and then send data through it
        to be classified by calling `m(x)`.

        Parameters
        ----------
        x : Union[numpy.ndarray, mygrad.Tensor], shape=(N, D, S)
            The data to forward through the network.

        Returns
        -------
        mygrad.Tensor, shape=(N, 1)
            The model outputs.

        Notes
        -----
        N = batch size
        D = embedding size
        S = sentence length
        """
        # <COGINST>
        # (N, D, S); with the default sizes and the training data D = 200 and S = 77
        x = self.conv1(x) # conv output shape (N, F, S'); F = 250 and S' = 75 for the sizes above
        x = relu(x)
        # pooling window spans the whole remaining sentence axis -> global max pool, shape (N, F, 1)
        x = max_pool(x, (x.shape[-1],), 1)
        x = x.reshape(x.shape[0], -1)  # (N, F, 1) -> (N, F)
        x = self.dense1(x) # (N, F) @ (F, D1) = (N, D1)
        x = relu(x)
        x = self.dense2(x) # (N, D1) @ (D1, 1) = (N, 1)
        x = sigmoid(x)
        return x # output shape (N, 1)
        # </COGINST>


    @property
    def parameters(self):
        """ A convenience function for getting all the parameters of our model.

        Returns
        -------
        The parameters of `conv1`, `dense1` and `dense2`, concatenated in that order.
        """
        # A property getter is only ever called with `self`, so the former `load` argument
        # could never be supplied and has been dropped.
        return self.conv1.parameters + self.dense1.parameters + self.dense2.parameters # <COGLINE>

# Functions for Accuracy and Binary Cross Entropy Loss

In [10]:
def accuracy(pred, truth):
    """ Calculates the accuracy of the predicted sentiments.

    Parameters
    ----------
    pred: Union[numpy.ndarray, mygrad.Tensor], shape=(N,) or (N, 1)
        The prediction scores of sentiments of the tweets (as a float from 0 to 1)

    truth: numpy.ndarray, shape=(N,)
        The true tweet sentiment (0 or 1)

    Returns
    -------
    float
        The accuracy of the predictions, i.e. the fraction of rounded scores equal to `truth`
    """
    # <COGINST>
    if isinstance(pred, mg.Tensor):
        pred = pred.data
    pred = np.asarray(pred)
    truth = np.asarray(truth)

    # The raw model output has shape (N, 1) while the labels have shape (N,). Comparing
    # them directly would broadcast to an (N, N) matrix and silently report a wrong
    # accuracy, so drop the trailing singleton axis first.
    if pred.ndim == truth.ndim + 1 and pred.shape[-1] == 1:
        pred = pred[..., 0]

    return np.mean(np.round(pred) == truth)
    # </COGINST>

def binary_cross_entropy(y_pred, y_truth):
    """ Mean binary cross entropy between predicted scores and 0/1 truth values.

    Parameters
    ----------
    y_pred: mg.Tensor, shape=(N,)
        The class scores output by the model (the training loop passes `pred[:,0]`)

    y_truth: Union[mg.Tensor, numpy.ndarray], shape=(N,)
        A constant Tensor or a NumPy array holding the truth value for each prediction

    Returns
    -------
    mg.Tensor, shape=()
        A zero-dimensional tensor that is the loss
    """
    # <COGINST>
    eps = 1e-08  # offset added inside both logarithms so that log(0) is never evaluated
    positive_term = y_truth * mg.log(y_pred + eps)
    negative_term = (1 - y_truth) * mg.log(1 - y_pred + eps)
    return -mg.mean(positive_term + negative_term)
    # </COGINST>

# Training and plotting a new model
### (Not needed if model parameters have already been saved)

In [None]:
# Fresh model; Adam updates every tensor exposed by `model.parameters`.
model = Model()
optim = Adam(model.parameters, learning_rate = 1e-4)
# Plot of the "loss" and "accuracy" metrics, fed by the training loop in the next cell.
plotter, fig, ax = create_plot(metrics=["loss", "accuracy"])

In [None]:
t0 = time.time()

batch_size = 50
num_epochs = 7
# every batch is padded to the largest sentence length reported for either split
padded_length = max(train_max, test_max)


def embed_batch(sentences, sentence_length, embedding_size=200):
    """ Builds the padded GloVe embedding array for a batch of tokenized sentences.

    Parameters
    ----------
    sentences : Sequence[Sequence[str]], length N
        The tokenized sentences of the batch.

    sentence_length : int
        Size S of the padded sentence axis.

    embedding_size : int, optional (default=200)
        Size D of a single GloVe vector.

    Returns
    -------
    numpy.ndarray, shape=(N, D, S)
        Column `[i, :, j]` holds the embedding of word `j` of sentence `i`; padding
        positions and words without an embedding keep the placeholder value.
    """
    # <COGINST>
    # initialize every value as small number which will be the placeholder for not found embeddings
    arr = np.ones((len(sentences), embedding_size, sentence_length)) / 1000000
    for i, sent in enumerate(sentences):
        for j, word in enumerate(sent):
            try:
                arr[i, :, j] = glove_model.get_vector(word.lower())
            except KeyError:
                # the word has no GloVe embedding: keep the placeholder column.
                # Only the missing-key error is ignored so that real bugs still surface.
                continue
    return arr
    # </COGINST>


for epoch_cnt in range(num_epochs):
    te = time.time()
    # visit the training set in a new random order every epoch
    idxs = np.arange(len(x_train))
    np.random.shuffle(idxs)

    for batch_cnt in range(len(x_train)//batch_size):
        # shuffled example indices of this batch; kept so the matching labels can be looked up.
        # (Slicing x_train directly, as before, ignored the shuffle entirely.)
        batch_indices = idxs[batch_cnt * batch_size : (batch_cnt + 1) * batch_size]
        batch = [x_train[i] for i in batch_indices]  # random batch of our training data

        # retrieve glove embeddings for batch
        arr = embed_batch(batch, padded_length)

        # pass batch through model and perform gradient descent
        # <COGINST>
        pred = model(arr)
        truth = y_train[batch_indices]

        loss = binary_cross_entropy(pred[:,0], truth)
        loss.backward()

        optim.step()
        #loss.null_gradients()

        acc = accuracy(pred[:,0], truth)
        # </COGINST>

        # pass loss and accuracy to noggin for plotting
        plotter.set_train_batch({"loss" : loss.item(),
                                 "accuracy" : acc},
                                 batch_size=batch_size)


    # compute test statistics (fixed order; no shuffling needed for evaluation)
    for batch_cnt in range(len(x_test) // batch_size):
        batch_indices = slice(batch_cnt * batch_size, (batch_cnt + 1) * batch_size)
        batch = x_test[batch_indices]

        # again, find embeddings for batch
        arr = embed_batch(batch, padded_length)

        # perform forward pass and find accuracy but DO NOT backprop
        # <COGINST>
        pred = model(arr)
        truth = y_test[batch_indices]
        acc = accuracy(pred[:,0], truth)
        # </COGINST>

        # log the test-accuracy in noggin
        plotter.set_test_batch({"accuracy" : acc},
                                 batch_size=batch_size)

    # plot the epoch-level train/test statistics
    t3 = time.time()
    plotter.set_train_epoch()
    plotter.set_test_epoch()

    # takes 3-5 minutes per epoch
    print("Epoch", epoch_cnt, ": ", (t3-te))
tt = time.time()
print("Total time: ", (tt-t0))

## Saving model parameters

In [None]:
import pickle

# One file per parameter, listed in the same order as `model.parameters`:
# conv1 weight/bias, dense1 weight/bias, dense2 weight/bias.
parameter_filenames = (
    "7_epoch_cnn_model_parameters_conv1_weights.pkl",
    "7_epoch_cnn_model_parameters_conv1_bias.pkl",
    "7_epoch_cnn_model_parameters_dense1_weights.pkl",
    "7_epoch_cnn_model_parameters_dense1_bias.pkl",
    "7_epoch_cnn_model_parameters_dense2_weights.pkl",
    "7_epoch_cnn_model_parameters_dense2_bias.pkl",
)

# NOTE: the files are written to the working directory, whereas the loading cell further
# down reads them from the "7 Epoch Model Parameters/" folder -- move them there (or
# adjust the paths) before loading.
for filename, parameter in zip(parameter_filenames, model.parameters):
    # the context manager flushes and closes each file, even if pickling fails
    with open(filename, 'wb') as parameter_file:
        pickle.dump(parameter.data, parameter_file)

# Loading and using model parameters if they have already been saved

In [13]:
def load_saved_parameter(name):
    """ Reads one pickled parameter array and wraps it in a mygrad tensor.

    Parameters
    ----------
    name : str
        Parameter identifier used in the file name, e.g. "conv1_weights".

    Returns
    -------
    mygrad.Tensor
        The saved parameter values.
    """
    filename = "7 Epoch Model Parameters/7_epoch_cnn_model_parameters_" + name + ".pkl"
    # NOTE: unpickling can execute arbitrary code -- only load parameter files you created yourself.
    # The context manager closes the file as soon as the array has been read.
    with open(filename, "rb") as parameter_file:
        return mg.tensor(pickle.load(parameter_file))


conv1w = load_saved_parameter("conv1_weights")
conv1b = load_saved_parameter("conv1_bias")

dense1w = load_saved_parameter("dense1_weights")
dense1b = load_saved_parameter("dense1_bias")

dense2w = load_saved_parameter("dense2_weights")
dense2b = load_saved_parameter("dense2_bias")

### Initializing model with saved weights and biases

In [14]:
model = Model()

# (layer, saved weight, saved bias) for each trainable layer of the network
saved_parameters = (
    (model.conv1, conv1w, conv1b),
    (model.dense1, dense1w, dense1b),
    (model.dense2, dense2w, dense2b),
)

# overwrite the freshly initialized parameters with the saved ones
for layer, saved_weight, saved_bias in saved_parameters:
    layer.weight = saved_weight
    layer.bias = saved_bias

# Testing model on our dataset of songs

In [17]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

import re, string

# matches any single ASCII punctuation character
punc_regex = re.compile('[{}]'.format(re.escape(string.punctuation)))

def tokenize_without_stopwords(lyrics):
    """ Strips punctuation from `lyrics`, lower-cases it and drops stop words.

    Parameters
    ----------
    lyrics : str
        The raw text to tokenize.

    Returns
    -------
    Tuple[str, ...]
        The remaining whitespace-separated tokens, in their original order.
    """
    tokenized = punc_regex.sub('', lyrics).lower().split()
    # Fetch the stop words once per call and keep them in a set: the previous version
    # rebuilt the whole list and scanned it linearly for every single token.
    # `stopwords.words()` is still called without a language argument, exactly as before,
    # so the tokenization results do not change.
    stop_words = frozenset(stopwords.words())
    return tuple(word for word in tokenized if word not in stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\evelyn.zhu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Getting all .txt files of songs

In [18]:
import glob

In [19]:
# True sentiment label per song (0 = negative, 1 = positive); the two loader cells below
# append to it, so re-run this cell before re-running them.
test_truth = list()

In [20]:
t0 = time.time()
path = 'Negative Chorus/*.txt'
# sorted so that the song order (and with it the label order) does not depend on the file system
files = sorted(glob.glob(path))

# This cell starts the song collection from scratch, so the labels are reset together
# with the songs; otherwise re-running the cell would leave stale labels behind.
all_test_songs = dict()
test_truth = []

for file in files:
    # the context manager closes the file even if reading fails
    with open(file, "r") as f:
        chorus = f.read().replace("\n", " ")
    # file name without folder and ".txt" extension; unlike stripping "Negative Chorus\\"
    # by hand this also works with "/" as path separator
    song_title = Path(file).stem
    tokenized_chorus = tokenize_without_stopwords(chorus)

    all_test_songs[song_title] = tokenized_chorus
    test_truth.append(0)  # 0 = negative
t1 = time.time()

# should take 5-7 seconds
print("Time to get negative files:", (t1-t0))

Time to get negative files: 6.2759435176849365


In [21]:
t0 = time.time()
path = 'Positive Chorus/*.txt'
# sorted so that the song order (and with it the label order) does not depend on the file system
files = sorted(glob.glob(path))

for file in files:
    # the context manager closes the file even if reading fails
    with open(file, "r") as f:
        chorus = f.read().replace("\n", " ")
    # file name without folder and ".txt" extension; unlike stripping "Positive Chorus\\"
    # by hand this also works with "/" as path separator
    song_title = Path(file).stem
    # A title that is already present would overwrite its dictionary entry while still
    # appending a label, leaving `test_truth` longer than (and misaligned with) the songs.
    assert song_title not in all_test_songs, f"duplicate song title: {song_title}"
    tokenized_chorus = tokenize_without_stopwords(chorus)

    all_test_songs[song_title] = tokenized_chorus
    test_truth.append(1)  # 1 = positive
t1 = time.time()

# should take 5-7 seconds
print("Time to get positive files:", (t1-t0))

Time to get positive files: 5.936893463134766


In [22]:
# tokenized choruses in dictionary (insertion) order: negative songs first, then positive ones
test_songs_list = [*all_test_songs.values()]

In [23]:
# number of tokens in the longest chorus (0 when there are no songs)
max_test_song_length = max((len(song) for song in test_songs_list), default=0)

print(max_test_song_length)

45


In [24]:
# song title -> score predicted by the model; filled in by the prediction cell below
results_dictionary = {}
# thresholded 0/1 prediction per song, appended in the order the songs are scored
y_pred = list()

### Testing our batch, i.e. all the songs in our song dataset

In [25]:
# Titles in dictionary order. `test_songs_list` was built from the same dictionary, so
# title k belongs to chorus k; this replaces the former reverse lookup by chorus tuple,
# which attributed two songs with identical choruses to the same title.
song_titles = list(all_test_songs)
assert test_songs_list == list(all_test_songs.values()), "test_songs_list is out of date"

# All songs are scored as one batch; nothing to do when no songs were found.
if test_songs_list:
    song_padded_length = max(train_max, max_test_song_length)

    # again, find embeddings for batch
    # <COGINST>
    arr = np.ones((len(test_songs_list), 200, song_padded_length)) / 1000000
    for i, sent in enumerate(test_songs_list):
        for j, word in enumerate(sent):
            try:
                arr[i,:,j] = glove_model.get_vector(word.lower())

            except KeyError:
                # the word has no GloVe embedding: keep the placeholder column
                continue
    # </COGINST>

    # perform forward pass only (no backprop)
    # <COGINST>
    pred = model(arr)
    ratings = pred.data[:, 0]  # (N, 1) model output -> one score per song

    for song_title, rating in zip(song_titles, ratings):
        results_dictionary[song_title] = rating
        # scores of at least 0.5 count as positive
        y_pred.append(1 if rating >= 0.5 else 0)
    # </COGINST>

In [26]:
# raw dump of the title -> score mapping (the next cell prints it one song per line)
print(results_dictionary)

{'boulevard_of_broken_dreams': 0.14658554015935324, 'fireflies': 0.28790143190196366, 'good_4_u': 0.12991522636689745, 'hold_on': 0.49959040077143185, 'how_to_save_a_life': 0.14667782342307714, 'impossible': 0.5534536766073173, 'leave_before_you_love_me': 0.17176937889162314, 'let_her_go': 0.07049778260739106, 'let_me_down_slowly': 0.06440645467956202, 'lose_somebody': 0.10108339619576212, 'love_story': 0.2792851031232263, 'love_yourself': 0.436887707928041, 'sad_song': 0.019730743111728365, 'say_something': 0.2611522817905001, 'silence': 0.6170726494164878, 'someone_like_you': 0.5492538631652352, 'someone_you_loved': 0.3191532408933549, 'stitches': 0.34754042013033054, 'supermarket_flowers': 0.8898542007719811, 'too_good_at_goodbyes': 0.14847005820436204, 'train_wreck': 0.3162959863271057, 'viva_la_vida': 0.20458104586359796, 'waving_through_a_window': 0.7511672292463996, 'we_dont_talk_anymore': 0.25068336687269727, 'you_broke_me_first': 0.012002878422743404, '22': 0.3220256925627986,

In [27]:
# One "title : score" line per song, which is easier to read than the raw dictionary
for name, score in results_dictionary.items():
    print(name, ":", score)

boulevard_of_broken_dreams : 0.14658554015935324
fireflies : 0.28790143190196366
good_4_u : 0.12991522636689745
hold_on : 0.49959040077143185
how_to_save_a_life : 0.14667782342307714
impossible : 0.5534536766073173
leave_before_you_love_me : 0.17176937889162314
let_her_go : 0.07049778260739106
let_me_down_slowly : 0.06440645467956202
lose_somebody : 0.10108339619576212
love_story : 0.2792851031232263
love_yourself : 0.436887707928041
sad_song : 0.019730743111728365
say_something : 0.2611522817905001
silence : 0.6170726494164878
someone_like_you : 0.5492538631652352
someone_you_loved : 0.3191532408933549
stitches : 0.34754042013033054
supermarket_flowers : 0.8898542007719811
too_good_at_goodbyes : 0.14847005820436204
train_wreck : 0.3162959863271057
viva_la_vida : 0.20458104586359796
waving_through_a_window : 0.7511672292463996
we_dont_talk_anymore : 0.25068336687269727
you_broke_me_first : 0.012002878422743404
22 : 0.3220256925627986
all_of_me : 0.9362327841677637
a_thousand_years : 0.

## Confusion Matrix

In [34]:
from sklearn.metrics import confusion_matrix
# 2x2 confusion matrix of the song predictions (labels: 0 = negative, 1 = positive).
# The next cell labels its flattened entries as TN, FP, FN, TP, i.e. each row is a true
# label and each column a predicted label.
cf_matrix = confusion_matrix(test_truth, y_pred)
print(cf_matrix)

[[20  5]
 [ 9 16]]


In [35]:
# `sns` is used below but seaborn is not imported anywhere else in this notebook,
# so the cell failed with a NameError on a fresh kernel.
import seaborn as sns

# Each heatmap cell is annotated with its name, its count and its share of all songs.
group_names = ["True Negative","False Positive","False Negative","True Positive"]
group_counts = [f"{value:0.0f}" for value in cf_matrix.flatten()]
group_percentages = [f"{value:.2%}" for value in
                     cf_matrix.flatten()/np.sum(cf_matrix)]
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in
          zip(group_names,group_counts,group_percentages)]
# same 2x2 layout as cf_matrix so that every annotation lands on its own cell
labels = np.asarray(labels).reshape(2,2)
# fmt="" because the annotations are ready-made strings rather than numbers to format
sns.heatmap(cf_matrix, annot=labels, fmt="", cmap='Blues')

<IPython.core.display.Javascript object>

<AxesSubplot:>