# CommentNet-V3 

Welcome to version 3 of CommentNet!

First let's import the relevant packages.


In [35]:
import csv
import string
import numpy as np
from sklearn.utils import shuffle
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation, Bidirectional, Concatenate, Dot, RepeatVector
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
import keras.backend as K
import coremltools

Next, let's process the data set.

We will process the data set and create a new data set by removing punctuation and trimming the lines in the data set.

In [36]:
def process_data(file_list_with_labels, output='../data/data_set.txt', output_label = '../data/labels.txt'):
    """
    Merge several labelled text files into one cleaned data file and one label file.

    Each input line is stripped and has all ASCII punctuation removed. Lines that are
    empty after cleaning are skipped; every remaining line is written to `output` and
    the label of its source file is written to the same row of `output_label`.

    Arguments:
    file_list_with_labels -- iterable of (filename, label) pairs; label must be a string
    output -- path of the combined data file (overwritten)
    output_label -- path of the combined label file (overwritten)

    Returns:
    None
    """
    # Build the punctuation-removal table once instead of once per input line.
    table = str.maketrans('', '', string.punctuation)

    with open(output, 'w') as output_file, open(output_label, 'w') as label_file:
        for filename, label in file_list_with_labels:
            print('Processing file: ', filename)
            with open(filename) as f:
                for line in f:
                    line = line.strip().translate(table)
                    # A line made only of punctuation is blank after cleaning. Writing it
                    # would emit a label for a row that the reader later drops, shifting
                    # every following label by one.
                    if not line.strip():
                        continue
                    output_file.write(line + '\r\n')
                    label_file.write(label + '\r\n')
    

Our data set files are in 3 separate sets of files.

- troll data set and its labels
- constructive data set and its labels
- positive data set and its labels

We will process these 3 separate data sets and create one combined dataset and label set.

In [37]:
# Pair every source file with the label written for each of its lines:
# '1' for the troll file, '0' for the constructive and positive files.
file_list_with_labels = [
    ('../data/troll.txt', '1'),
    ('../data/constructive.txt', '0'),
    ('../data/positive.txt', '0'),
]
process_data(file_list_with_labels)

Processing file:  ../data/troll.txt
Processing file:  ../data/constructive.txt
Processing file:  ../data/positive.txt


Next, it's time to load the GloVe word embedding files.

In [38]:
def read_glove_vecs(glove_file, index_path='../data/glove_word_index.txt'):
    """
    Load GloVe vectors from a text file and build word <-> index lookup tables.

    Each non-blank line of `glove_file` is split on whitespace: the first token is the
    word, the remaining tokens are parsed as float64 vector components. Words are
    numbered from 1 in sorted order, which leaves index 0 free.

    Arguments:
    glove_file -- path of the GloVe text file (read as UTF-8)
    index_path -- path where one "word index" line per word is written; pass None to
                  skip writing the file

    Returns:
    words_to_index -- dict mapping each word to its 1-based index
    index_to_words -- dict mapping each index back to its word
    word_to_vec_map -- dict mapping each word to its numpy vector
    next_index -- first unused index (number of words + 1)
    """
    word_to_vec_map = {}
    # Explicit encoding so the result does not depend on the platform default.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            if not parts:
                # Tolerate blank lines (e.g. a trailing newline) instead of failing on parts[0].
                continue
            word_to_vec_map[parts[0]] = np.array(parts[1:], dtype=np.float64)

    ordered_words = sorted(word_to_vec_map)
    words_to_index = {}
    index_to_words = {}
    for i, w in enumerate(ordered_words, start=1):
        words_to_index[w] = i
        index_to_words[i] = w

    # Written only after the vectors parsed successfully, so a failed read
    # no longer leaves a truncated index file behind.
    if index_path is not None:
        with open(index_path, 'w', encoding='utf-8') as word_index_file:
            for w in ordered_words:
                word_index_file.write(w + ' ' + str(words_to_index[w]) + '\n')

    return words_to_index, index_to_words, word_to_vec_map, len(ordered_words) + 1

In [39]:
# Load the GloVe vectors. The fourth return value is the first index not assigned to
# any GloVe word; the notebook uses it as the index of the "unknown word" token.
word_to_index, index_to_word, word_to_vec_map, unknown_word_index = read_glove_vecs('../../../CommentNetData/glove.6B/glove.6B.50d.txt')

Next, we will also read in a custom word embedding to be used for unknown word tokens.

In [40]:
def read_unknown_vecs(unknown_file):
    """
    Read the embedding vector used for out-of-vocabulary words.

    The file holds whitespace-separated floats. If it contains several non-blank lines,
    the last one wins. Blank lines are ignored, so a trailing newline cannot replace the
    vector with an empty array.

    Arguments:
    unknown_file -- path of the text file holding the vector

    Returns:
    unknown_vector -- 1-D float64 numpy array

    Raises:
    ValueError -- if the file contains no vector at all
    """
    unknown_vector = None

    with open(unknown_file, 'r') as f:
        for line in f:
            values = line.strip().split()
            if not values:
                continue
            unknown_vector = np.array(values, dtype=np.float64)

    if unknown_vector is None:
        # Fail with a clear message instead of an AttributeError on None.shape below.
        raise ValueError('No vector found in ' + str(unknown_file))

    print('Unknown word vector is', unknown_vector.shape)

    return unknown_vector

In [41]:
# Vector stored in the embedding row for out-of-vocabulary words.
# NOTE: the variable name is spelled "unkown" and is used under that spelling by later cells.
unkown_word_vector = read_unknown_vecs('../data/unknown_word_vector.txt')

Unknown word vector is (50,)


In [42]:
# Sanity-check the loaded embeddings: look up one word and one index, show the index
# reserved for unknown words, and compare two word vectors.
word = "ea"
index = 18
print("the index of", word, "in the vocabulary is", word_to_index[word])
print("the", str(index) + "th word in the vocabulary is", index_to_word[index])
print(unknown_word_index)
# reshape(1, -1) turns the 1-D vector into a single-row matrix, as cosine_similarity expects 2-D input.
print(word_to_vec_map[word].reshape(1, -1).shape)
cosine_similarity(word_to_vec_map["bethesda"].reshape(1, -1), word_to_vec_map["ea"].reshape(1, -1))

the index of ea in the vocabulary is 132265
the 18th word in the vocabulary is #cccccc
400001
(1, 50)


array([[0.21020451]])

In [43]:
def sentences_to_indices(X, word_to_index, max_len, unknown_word_index):
    """
    Converts an array of sentences (strings) into an array of word indices that can be
    fed to an `Embedding()` layer.

    Each sentence is lower-cased and split on whitespace. Words missing from
    `word_to_index` are mapped to `unknown_word_index`. Sentences shorter than `max_len`
    are right-padded with 0; sentences longer than `max_len` are truncated to their
    first `max_len` words instead of raising an IndexError.

    Arguments:
    X -- numpy array of sentences (strings), of shape (m,)
    word_to_index -- a dictionary mapping each known word to its index
    max_len -- number of index slots per sentence
    unknown_word_index -- index stored for words that are not in word_to_index

    Returns:
    X_indices -- float array of indices, of shape (m, max_len)
    """
    m = X.shape[0]                                   # number of sentences

    # Zero-initialised so unused trailing positions act as padding.
    X_indices = np.zeros((m, max_len))

    for i in range(m):
        sentence_words = X[i].lower().split()

        # The slice guards against sentences longer than max_len.
        for j, w in enumerate(sentence_words[:max_len]):
            X_indices[i, j] = word_to_index.get(w, unknown_word_index)

    return X_indices

In [44]:
def read_processed_data(data_set = '../data/data_set.txt', labels = '../data/labels.txt'):
    """
    Load the combined sentence file and its label file.

    Blank lines are ignored in both files. Because sentence i is paired with label i
    purely by position, the two files must contain the same number of non-blank lines.

    Arguments:
    data_set -- path of the sentence file, one sentence per line
    labels -- path of the label file, one integer label per line

    Returns:
    X -- numpy array of sentences (strings)
    Y -- numpy integer array of labels, same length as X

    Raises:
    ValueError -- if the number of sentences and labels differ
    """
    with open(data_set) as data_file:
        stripped = (line.strip() for line in data_file)
        X = [line for line in stripped if line]

    with open(labels) as label_file:
        stripped = (line.strip() for line in label_file)
        Y = [line for line in stripped if line]

    if len(X) != len(Y):
        # A silent mismatch would pair sentences with the wrong labels downstream.
        raise ValueError('Data/label count mismatch: ' + str(len(X)) + ' sentences vs '
                         + str(len(Y)) + ' labels')

    X = np.asarray(X)
    Y = np.asarray(Y, dtype=int)

    return X, Y
    

In [45]:
# Load the combined sentences (X) and integer labels (Y) from the default paths.
X, Y = read_processed_data()

In [46]:
# The two counts must match: sentence i is paired with label i.
print('X : ', len(X))
print('Y : ', len(Y))

X :  150
Y :  150


In [47]:
# Shuffle sentences and labels together; the fixed random_state makes the order repeatable.
X, Y = shuffle(X, Y, random_state=0)
# Spot-check that the last sentence still carries its label.
print(X[len(X) - 1], Y[len(Y) - 1])

Sean Murray and Hello Games this silence is deafening 1


In [48]:
def find_max_len(X):
    """
    Return the word count of the longest sentence in X.

    Sentences are lower-cased and split on whitespace. The first sentence that reaches
    the maximum length is the one reported. The length, the original sentence and its
    token list are printed before returning.

    Arguments:
    X -- iterable of sentences (strings)

    Returns:
    Word count of the longest sentence, 0 if X is empty.
    """
    longest_sentence = None
    longest_tokens = []

    for sentence in X:
        tokens = sentence.lower().split()
        # Strict comparison: on a tie the earlier sentence is kept.
        if len(tokens) <= len(longest_tokens):
            continue
        longest_sentence, longest_tokens = sentence, tokens

    longest = len(longest_tokens)

    print ('Max length is ', longest)
    print(longest_sentence)
    print(longest_tokens)

    return longest

In [49]:
# Longest sentence length in the data set; used as the model's input length.
max_len = find_max_len(X)
# Tx is read as a notebook-level global by the attention layers (RepeatVector(Tx))
# and by the attention loop inside CommentNet.
Tx = max_len

Max length is  54
Look guys Anthem is in a bad place right now but remember to be civil towards the devs They are people too and probably under a bit of stress right now Voice your feedback but be considerate and most of all remember there is an actual living person on the other side of it
['look', 'guys', 'anthem', 'is', 'in', 'a', 'bad', 'place', 'right', 'now', 'but', 'remember', 'to', 'be', 'civil', 'towards', 'the', 'devs', 'they', 'are', 'people', 'too', 'and', 'probably', 'under', 'a', 'bit', 'of', 'stress', 'right', 'now', 'voice', 'your', 'feedback', 'but', 'be', 'considerate', 'and', 'most', 'of', 'all', 'remember', 'there', 'is', 'an', 'actual', 'living', 'person', 'on', 'the', 'other', 'side', 'of', 'it']


In [50]:
def create_train_dev_test(X, Y, split, random_state=None):
    """
    Shuffle the data set and cut it into train/test or train/dev/test parts.

    Arguments:
    X -- array of sentences
    Y -- array of labels, same length as X
    split -- [mode, sizes]. With mode 'tt', sizes is (train_size, test_size);
             with any other mode, sizes is (train_size, dev_size, test_size).
             The test part always receives every example left after the train (and dev)
             parts, so test_size is informational only.
    random_state -- optional seed forwarded to sklearn's shuffle for a reproducible
                    split; None (the default) keeps the original unseeded behaviour.

    Returns:
    (X_train, Y_train, X_test, Y_test) for mode 'tt', otherwise
    (X_train, Y_train, X_dev, Y_dev, X_test, Y_test).
    """
    X, Y = shuffle(X, Y, random_state=random_state)

    mode, sizes = split[0], split[1]

    if mode == 'tt':
        train_size, _ = sizes

        X_train, Y_train = X[:train_size], Y[:train_size]
        X_test, Y_test = X[train_size:], Y[train_size:]

        # The original printed the train length under "test" and vice versa.
        print('Size of train set : ', len(X_train))
        print('Size of test set : ', len(X_test))

        return (X_train, Y_train, X_test, Y_test)

    train_size, dev_size, _ = sizes
    dev_end = train_size + dev_size

    X_train, Y_train = X[:train_size], Y[:train_size]
    X_dev, Y_dev = X[train_size:dev_end], Y[train_size:dev_end]
    X_test, Y_test = X[dev_end:], Y[dev_end:]

    print('Size of train set : ', len(X_train))
    print('Size of dev set : ', len(X_dev))
    print('Size of test set : ', len(X_test))

    return (X_train, Y_train, X_dev, Y_dev, X_test, Y_test)
    

In [51]:
# Three-way split: 100 training, 25 dev, and the remaining examples for test.
X_train, Y_train, X_dev, Y_dev, X_test, Y_test = create_train_dev_test(X, Y, ['tvt', (100, 25, 25)])

Size of test set :  100
Size of dev set :  25
Size of train set :  25


In [52]:
# Spot-check the last training sentence together with its label.
print(X_train[len(X_train) - 1], Y_train[len(Y_train) - 1])

Seeing a lot of posts about players coming back to the game and now loving it Raise your hand if you never left because youve always loved the game 0


In [53]:
# Two-way split used for the rest of the notebook. This rebinds X_train/Y_train/X_test/Y_test
# from a fresh shuffle; X_dev/Y_dev are not reassigned here and keep their previous values.
X_train, Y_train, X_test, Y_test = create_train_dev_test(X, Y, ['tt', (100, 50)])

Size of test set :  100
Size of train set :  50


In [54]:
# Show the shape of the training label array.
print(Y_train.shape)

(100,)


In [55]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index, unkown_word_vector, unknown_word_index):
    """
    Creates a non-trainable Keras Embedding() layer pre-loaded with the given word vectors.

    Row `index` of the embedding matrix holds the vector of the word whose index is
    `index`; row `unknown_word_index` holds `unkown_word_vector`. Rows that no word maps
    to (such as row 0) stay all-zero.

    Arguments:
    word_to_vec_map -- dictionary mapping words to their vector representation
    word_to_index -- dictionary mapping words to their row index in the embedding matrix
    unkown_word_vector -- vector stored for out-of-vocabulary words
    unknown_word_index -- row index reserved for out-of-vocabulary words; must be the
                          largest index in use, since the matrix has unknown_word_index + 1 rows

    Returns:
    embedding_layer -- pretrained Keras Embedding layer instance
    """
    vocab_len = unknown_word_index + 1                  # rows 0 .. unknown_word_index

    # Take the dimensionality from any stored vector rather than from a hard-coded word,
    # so vocabularies that lack that particular word still work.
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]

    emb_matrix = np.zeros((vocab_len, emb_dim))

    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    emb_matrix[unknown_word_index, :] = unkown_word_vector

    # trainable=False keeps the pretrained vectors fixed during training.
    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)

    # The layer must be built before its weights can be set.
    embedding_layer.build((None,))

    # Load the pretrained matrix into the layer.
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

Define custom softmax function to be used in the Attention mechanism's alpha calculation step.

In [56]:
def softmax(x, axis=1):
    """Softmax activation with a selectable normalisation axis.
    # Arguments
        x : Tensor.
        axis: Integer, axis along which the softmax normalization is applied
            (only used for tensors with more than 2 dimensions).
    # Returns
        Tensor, output of softmax transformation.
    # Raises
        ValueError: In case `dim(x) == 1`.
    """
    rank = K.ndim(x)
    if rank < 2:
        raise ValueError('Cannot apply softmax to a tensor that is 1D')
    if rank == 2:
        # 2-D input is delegated to the backend implementation.
        return K.softmax(x)
    # Subtract the per-axis maximum before exponentiating for numerical stability.
    shifted = x - K.max(x, axis=axis, keepdims=True)
    exps = K.exp(shifted)
    return exps / K.sum(exps, axis=axis, keepdims=True)

Define global variables for the attention step

In [88]:
# Shared layers defined as globals so every attention step reuses the same layer objects.
# Run this cell after Tx has been set: RepeatVector is constructed with its current value.
repeator = RepeatVector(Tx)
concatenator = Concatenate(axis=-1)
densor1 = Dense(10, activation = "tanh")
densor2 = Dense(1, activation = "relu")
activator = Activation(softmax, name='attention_weights') # We are using the custom softmax(axis = 1) defined in this notebook
dotor = Dot(axes = 1)

The function to calculate the context for each step of the post-attention LSTM layer

In [89]:
def one_step_attention(a, s_prev):
    """
    Performs one step of attention: builds a context vector as the attention-weighted
    combination of the Bi-LSTM hidden states "a".

    Uses the notebook-level shared layers repeator, concatenator, densor1, densor2,
    activator and dotor.

    Arguments:
    a -- hidden state output of the Bi-LSTM, of shape (m, Tx, n_a)
    s_prev -- previous hidden state of the (post-attention) LSTM, of shape (m, n_s)

    Returns:
    context -- context vector, input of the next (post-attention) LSTM cell
    """
    # Repeat s_prev along the time axis so it can be joined with every hidden state.
    repeated_state = repeator(s_prev)
    joined = concatenator([a, repeated_state])

    # Two small dense layers turn each (hidden state, s_prev) pair into a scalar energy.
    energies = densor2(densor1(joined))

    # Normalise the energies into attention weights.
    attention_weights = activator(energies)

    # Weighted sum of the hidden states over the time axis.
    return dotor([attention_weights, a])

In [90]:
def CommentNet(input_shape, word_to_vec_map, word_to_index, unkown_word_vector, unknown_word_index):
    """
    Builds the CommentNet model graph: pretrained embeddings -> Bi-LSTM -> attention
    steps feeding a post-attention LSTM -> dropout -> single sigmoid output.

    Relies on the notebook-level global Tx and on the shared attention layers used by
    one_step_attention.

    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in the vocabulary to its vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary
    unkown_word_vector -- vector used for out-of-vocabulary words
    unknown_word_index -- embedding row index reserved for out-of-vocabulary words

    Returns:
    model -- a Keras Model taking [sentence_indices, s0, c0] and producing one sigmoid output
    """
    
    # Input of the graph: a batch of word-index sequences (dtype 'int32' as it contains indices).
    sentence_indices = Input(shape = input_shape, dtype = 'int32')
    
    # Embedding layer pretrained with the GloVe vectors plus the unknown-word vector.
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index, unkown_word_vector, unknown_word_index)
    
    # Look up the embedding of every index in the input sequences.
    embeddings = embedding_layer(sentence_indices)   
    
    # Bi-LSTM with 128 units per direction returning the full sequence of hidden states.
    # merge_mode = 'ave' averages the forward and backward outputs, so the feature size stays 128.
    a = Bidirectional(LSTM(128, return_sequences = True), merge_mode = 'ave')(embeddings)
    
    # Initial hidden and cell state of the post-attention LSTM are extra model inputs.
    s0 = Input(shape=(128,), name='s0')
    c0 = Input(shape=(128,), name='c0')
    s = s0
    c = c0
    
    # One LSTM layer object, reused at every attention step so the weights are shared.
    post_attention_Bi_LSTM_cell = LSTM(128, return_state = True)
    
    # NOTE(review): Tx is the notebook-level global, not derived from input_shape; it has to
    # agree with the sequence length because the shared RepeatVector was built with Tx.
    for t in range(Tx):
        
        # Attention over all Bi-LSTM states, conditioned on the current state s.
        context = one_step_attention(a, s)
        
        # With return_state = True the layer returns (output, hidden state, cell state);
        # the output becomes the new s and the cell state the new c.
        s, _, c = post_attention_Bi_LSTM_cell(context, initial_state = [s, c])
        
    
    # Only the state after the last attention step is used for classification.
    
    # Dropout with a probability of 0.5 on that final state.
    X = Dropout(rate = 0.5)(s)
    # Single sigmoid unit: one probability per input sentence.
    X = Dense(1, activation='sigmoid')(X)
    
    # Model instance mapping [sentence_indices, s0, c0] to the sigmoid output.
    model = Model(inputs = [sentence_indices, s0, c0], outputs = X)
    
    return model

In [91]:
# Build the model for sequences of max_len word indices.
model = CommentNet((max_len,), word_to_vec_map, word_to_index, unkown_word_vector, unknown_word_index)

In [92]:
# Print the layer-by-layer summary of the model.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 54)           0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 54, 50)       20000100    input_4[0][0]                    
__________________________________________________________________________________________________
s0 (InputLayer)                 (None, 128)          0                                            
__________________________________________________________________________________________________
bidirectional_4 (Bidirectional) (None, 54, 128)      183296      embedding_4[0][0]                
__________________________________________________________________________________________________
repeat_vec

In [94]:
# Binary cross-entropy pairs with the model's single sigmoid output unit.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [95]:
# Encode the training sentences as rows of max_len word indices; unused positions stay 0.
X_train_indices = sentences_to_indices(X_train, word_to_index, max_len, unknown_word_index)

In [96]:
# Inspect one training example: the raw sentence next to its encoded index row.
train_index = 60
print(X_train.shape[0])
print(X_train[train_index])
print(X_train_indices[train_index])

100
BioWare replace the current loading screens with pages from the Codex
[ 76699. 305876. 357266. 113946. 224439. 322184. 388711. 275169. 154323.
 357266. 104258.      0.      0.      0.      0.      0.      0.      0.
      0.      0.      0.      0.      0.      0.      0.      0.      0.
      0.      0.      0.      0.      0.      0.      0.      0.      0.
      0.      0.      0.      0.      0.      0.      0.      0.      0.
      0.      0.      0.      0.      0.      0.      0.      0.      0.]


In [97]:
# Zero initial hidden (s0) and cell (c0) states for the model's 's0' and 'c0' inputs:
# one row of 128 values per training example.
s0 = np.zeros((X_train.shape[0], 128))
c0 = np.zeros((X_train.shape[0], 128))

In [98]:
# Train on the encoded sentences plus the zero initial states.
# NOTE(review): the saved output of this cell lists 40 epochs, so it was produced before
# epochs was set to 50; re-run the cell to refresh it.
model.fit([X_train_indices, s0, c0], Y_train, epochs = 50, batch_size = 32, shuffle=True)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x16e958390>

In [99]:
# Encode the test sentences and build matching zero initial states.
# This rebinds s0/c0: from here on they have one row per test example, not per training example.
X_test_indices = sentences_to_indices(X_test, word_to_index, max_len, unknown_word_index)
s0 = np.zeros((X_test_indices.shape[0], 128))
c0 = np.zeros((X_test_indices.shape[0], 128))

In [100]:
# evaluate returns the loss followed by the accuracy metric requested in model.compile.
loss, acc = model.evaluate([X_test_indices, s0, c0], Y_test)
print()
print("Test accuracy = ", acc)


Test accuracy =  0.5600000071525574


In [101]:
# Evaluate on the test set: list every misclassified comment, then report F1 for the
# troll class (label 1) and for the inverted, non-troll class (label 0).
X_test_indices = sentences_to_indices(X_test, word_to_index, max_len, unknown_word_index)
pred = model.predict([X_test_indices, s0, c0])

threshold = 0.5
# The model has a single output unit, so predictions come back as a column of shape (n, 1).
# Flatten it so predictions line up element-wise with Y_test, which has shape (n,).
Y_pred = pred.ravel() > threshold

error_count = 0
for i in range(len(X_test)):
    if Y_pred[i] != Y_test[i]:
        error_count = error_count + 1
        print(X_test[i])
        print('Expected label:'+ str(Y_test[i]) + ' Prediction: ' + ' ' + str( 1 if Y_pred[i] else 0))
print('Test count : ' + str(len(X_test)))
print('Error count : ' + str(error_count))

print('F1 Score for Trolling: ', f1_score(Y_test, Y_pred))

# Invert labels and predictions so that the non-troll class becomes the positive class.
Y_test_inv = np.invert(Y_test > 0)
Y_pred_inv = np.invert(Y_pred)

print('Constructive feedback count : ', np.sum(Y_test_inv))
print('Predicted Constructive feedback count : ', np.sum(Y_pred_inv))
print('F1 Score Constructive Feedback: ', f1_score(Y_test_inv, Y_pred_inv))

Heres why I think people arent getting legendaries and a potential bug
Expected label:0 Prediction:  1
Idea don’t just increase the size of the stash but make it upgradable with caps and rare resources
Expected label:0 Prediction:  1
Bethesda should extend the beta time tonight
Expected label:1 Prediction:  0
My project which can automatically build maps of every resource in the game world
Expected label:0 Prediction:  1
Bioware has acknowledged that Ranger feels underperforming heres why its a mess
Expected label:0 Prediction:  1
I would happily wait another month with no new content to have a REAL quality and welltested stability and QoLonly patch
Expected label:0 Prediction:  1
PSA The Level 1 Defender Rifle is the best weapon in the gamealso damage numbers are pointless and dont mean anything
Expected label:0 Prediction:  1
Share your support  thanks to Hello Games a company that could have just given up but instead they just keep giving
Expected label:0 Prediction:  1
PowerScaling

In [86]:
# List the correctly classified test comments together with their encoded index rows.
correct_count = 0
for i in range(len(X_test)):
    if Y_pred[i] == Y_test[i]:
        print(X_test_indices[i])
        correct_count = correct_count + 1
        print(X_test[i])
        # Print the prediction as 0/1, matching the format used for the misclassified comments.
        print('Expected label:'+ str(Y_test[i]) + ' Prediction: ' + ' ' + str(1 if Y_pred[i] else 0))
print('Test count : ' + str(len(X_test)))
# This cell counts correct predictions, so label the total accordingly.
print('Correct count : ' + str(correct_count))


[357266. 302292. 302476. 151349. 225709.  66984.  54718. 343872. 193642.
 188481.  56280.      0.      0.      0.      0.      0.      0.      0.
      0.      0.      0.      0.      0.      0.      0.      0.      0.
      0.      0.      0.      0.      0.      0.      0.      0.      0.
      0.      0.      0.      0.      0.      0.      0.      0.      0.
      0.      0.      0.      0.      0.      0.      0.      0.      0.]
The real reason for LOOT BALANCE and STORE issues in Anthem
Expected label:1 Prediction:  [ True]
[223704. 141960. 357810. 319691. 357810. 149340. 192973.  84723.      0.
      0.      0.      0.      0.      0.      0.      0.      0.      0.
      0.      0.      0.      0.      0.      0.      0.      0.      0.
      0.      0.      0.      0.      0.      0.      0.      0.      0.
      0.      0.      0.      0.      0.      0.      0.      0.      0.
      0.      0.      0.      0.      0.      0.      0.      0.      0.]
Literally eveything they

In [32]:
# Serialize the model architecture to JSON.
model_json = model.to_json()
with open("../models/model.json", "w") as json_file:
    json_file.write(model_json)
# Serialize the trained weights to HDF5.
model.save_weights("../models/model.h5")
print("Saved model to disk")

works
Saved model to disk


In [34]:
# Convert the Keras model to Core ML.
# NOTE(review): the saved output of this cell lists two Bidirectional layers, which does not
# match the CommentNet graph defined in this notebook; the cell was last run against an
# older model and should be re-run after training.
coreml_model = coremltools.converters.keras.convert(model)

0 : input_1, <keras.engine.input_layer.InputLayer object at 0x13943ee48>
1 : embedding_1, <keras.layers.embeddings.Embedding object at 0x13946e080>
2 : bidirectional_1, <keras.layers.wrappers.Bidirectional object at 0x13946e358>
3 : bidirectional_2, <keras.layers.wrappers.Bidirectional object at 0x139d6b860>
4 : dense_1, <keras.layers.core.Dense object at 0x13946ecc0>
5 : dense_1__activation__, <keras.layers.core.Activation object at 0x13a464978>


In [35]:
# NOTE(review): the file name says V2 although this notebook is CommentNet-V3 — confirm the intended name.
coreml_model.save('../models/CommentNetV2.mlmodel')

In [36]:
# Print the converted model's description (its input specifications appear in the output).
print(coreml_model)

input {
  name: "input1"
  type {
    multiArrayType {
      shape: 1
      dataType: DOUBLE
    }
  }
}
input {
  name: "bidirectional_1_h_in"
  type {
    multiArrayType {
      shape: 128
      dataType: DOUBLE
    }
    isOptional: true
  }
}
input {
  name: "bidirectional_1_c_in"
  type {
    multiArrayType {
      shape: 128
      dataType: DOUBLE
    }
    isOptional: true
  }
}
input {
  name: "bidirectional_1_h_in_rev"
  type {
    multiArrayType {
      shape: 128
      dataType: DOUBLE
    }
    isOptional: true
  }
}
input {
  name: "bidirectional_1_c_in_rev"
  type {
    multiArrayType {
      shape: 128
      dataType: DOUBLE
    }
    isOptional: true
  }
}
input {
  name: "bidirectional_2_h_in"
  type {
    multiArrayType {
      shape: 128
      dataType: DOUBLE
    }
    isOptional: true
  }
}
input {
  name: "bidirectional_2_c_in"
  type {
    multiArrayType {
      shape: 128
      dataType: DOUBLE
    }
    isOptional: true
  }
}
input {
  name: "bidirectional_2_h_

In [None]:
# Sisira Dabare