In [1]:
import csv
import string
import numpy as np
from sklearn.utils import shuffle
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation, Bidirectional
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
import coremltools

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
W0805 11:11:29.173964 4559558080 __init__.py:71] TensorFlow version 1.14.0 detected. Last version known to be fully compatible is 1.13.1 .


In [2]:
def process_data(file_list_with_labels, output='../data/data_set.txt', output_label = '../data/labels.txt'):
    """
    Strip punctuation from labelled text files and write parallel sentence/label files.

    Arguments:
    file_list_with_labels -- iterable of (filename, label) pairs; every usable line of
                             `filename` becomes one example tagged with `label` (a string)
    output -- path of the file that receives one cleaned sentence per line
    output_label -- path of the file that receives the label of each sentence, line for line

    Returns:
    None -- the results are written to `output` and `output_label`
    """
    # The table only depends on string.punctuation, so build it once instead of per line.
    punctuation_table = str.maketrans('', '', string.punctuation)

    with open(output, 'w') as output_file, open(output_label, 'w') as label_file:
        for filename, label in file_list_with_labels:
            print('Processing file: ', filename)
            with open(filename) as f:
                for line in f:
                    # Remove punctuation BEFORE testing for emptiness: a line made only of
                    # punctuation would otherwise produce a blank sentence with a label,
                    # and the two output files would no longer line up once blank lines
                    # are dropped on reading.
                    line = line.translate(punctuation_table).strip()
                    if not line:
                        continue
                    output_file.write(line + '\r\n')
                    label_file.write(label + '\r\n')
    

In [3]:
def read_unknown_vecs(unknown_file):
    """
    Load the embedding vector used for out-of-vocabulary words.

    Arguments:
    unknown_file -- path to a text file of whitespace-separated floats; if the file holds
                    several non-blank lines, the last one is returned

    Returns:
    unknown_vector -- 1-D numpy array of dtype float64

    Raises:
    ValueError -- if the file contains no non-blank line
    """
    unknown_vector = None

    with open(unknown_file, 'r') as f:
        for line in f:
            values = line.split()
            if not values:
                # A blank (e.g. trailing) line must not replace the vector with an empty array.
                continue
            unknown_vector = np.array(values, dtype=np.float64)

    if unknown_vector is None:
        raise ValueError('No vector found in ' + str(unknown_file))

    print('Unknown word vector is', unknown_vector.shape)

    return unknown_vector

In [4]:
def read_glove_vecs(glove_file, word_index_file_path='../data/glove_word_index.txt'):
    """
    Parse a GloVe text file and build the vocabulary lookup tables.

    Each non-blank line of `glove_file` is "<word> <v1> <v2> ...". Words are sorted and
    numbered from 1 (index 0 is never assigned to a word), and the "<word> <index>" pairs
    are also written to `word_index_file_path`, one per line.

    Arguments:
    glove_file -- path to the GloVe vectors file (read as UTF-8)
    word_index_file_path -- where the word/index listing is written; defaults to the
                            location the notebook has always used

    Returns:
    words_to_index -- dict mapping word -> index (1-based, in sorted word order)
    index_to_words -- dict mapping index -> word
    word_to_vec_map -- dict mapping word -> numpy float64 vector
    i -- the first unused index (number of words + 1); the notebook uses it as the
         index of the unknown-word token
    """
    word_to_vec_map = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # tolerate blank lines instead of failing on fields[0]
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    words_to_index = {}
    index_to_words = {}
    i = 1
    for w in sorted(word_to_vec_map):
        words_to_index[w] = i
        index_to_words[i] = w
        i = i + 1

    # Written only after parsing succeeded, so a bad input does not truncate an existing listing.
    with open(word_index_file_path, 'w', encoding='utf-8') as word_index_file:
        for index in range(1, i):
            word_index_file.write(index_to_words[index] + ' ' + str(index) + '\n')

    return words_to_index, index_to_words, word_to_vec_map, i

In [5]:
# Load the GloVe vectors and build the word <-> index lookup tables.
# The fourth return value is one past the last vocabulary index; the rest of the
# notebook uses it as the index of the "unknown word" token.
word_to_index, index_to_word, word_to_vec_map, unknown_word_index = read_glove_vecs('../../../CommentNetData/glove.6B/glove.6B.50d.txt')

In [6]:
# Vector substituted for words that are not in the GloVe vocabulary.
# NOTE(review): the name is misspelled ("unkown"); later cells use the same spelling,
# so a rename has to be applied everywhere at once.
unkown_word_vector = read_unknown_vecs('../data/unknown_word_vector.txt')

Unknown word vector is (50,)


In [7]:
# Sanity-check the lookup tables: word -> index, index -> word, the unknown-word index
# and the shape of a single embedding reshaped to a row vector.
word = "ea"
index = 18
print("the index of", word, "in the vocabulary is", word_to_index[word])
print("the", str(index) + "th word in the vocabulary is", index_to_word[index])
print(unknown_word_index)
print(word_to_vec_map[word].reshape(1, -1).shape)
# cosine_similarity expects 2-D inputs, hence the reshape to (1, n_features).
cosine_similarity(word_to_vec_map["bethesda"].reshape(1, -1), word_to_vec_map["ea"].reshape(1, -1))

the index of ea in the vocabulary is 132265
the 18th word in the vocabulary is #cccccc
400001
(1, 50)


array([[0.21020451]])

In [8]:
def sentences_to_indices(X, word_to_index, max_len, unknown_word_index):
    """
    Converts an array of sentences (strings) into a zero-padded matrix of vocabulary indices
    that can be fed to an `Embedding()` layer.

    Arguments:
    X -- numpy array of sentences (strings), of shape (m,)
    word_to_index -- a dictionary mapping each known word to its index
    max_len -- number of columns of the result; sentences with more words are truncated
               to their first max_len words
    unknown_word_index -- index written for every word that is missing from word_to_index

    Returns:
    X_indices -- float array of shape (m, max_len); row i holds the indices of the words of
                 sentence i (lower-cased, split on whitespace), padded on the right with 0
    """

    m = X.shape[0]                                   # number of sentences

    # Zero doubles as the padding value for positions past the end of a sentence.
    X_indices = np.zeros((m, max_len))

    for i in range(m):
        sentence_words = X[i].lower().split()

        # Slicing guards against sentences longer than max_len, which would otherwise
        # index past the last column and raise IndexError.
        for j, w in enumerate(sentence_words[:max_len]):
            X_indices[i, j] = word_to_index.get(w, unknown_word_index)

    return X_indices

In [9]:
# (source file, label) pairs: label '1' for trolling comments, '0' for everything else.
file_list_with_labels = [
    ('../data/troll.txt', '1'),
    ('../data/constructive.txt', '0'),
    ('../data/positive.txt', '0'),
]
process_data(file_list_with_labels)

Processing file:  ../data/troll.txt
Processing file:  ../data/constructive.txt
Processing file:  ../data/positive.txt


In [10]:
def read_processed_data(data_set = '../data/data_set.txt', labels = '../data/labels.txt'):
    """
    Load the sentence file and the label file into two numpy arrays.

    Arguments:
    data_set -- path to the file holding one sentence per line
    labels -- path to the file holding one integer label per line

    Returns:
    X -- numpy array with the stripped, non-blank lines of `data_set`
    Y -- numpy int array with the stripped, non-blank lines of `labels`
    """

    def non_blank_lines(path):
        # Whitespace-only lines are dropped; the remaining lines are returned stripped.
        with open(path) as handle:
            stripped = (raw.strip() for raw in handle)
            return [text for text in stripped if text]

    X = np.asarray(non_blank_lines(data_set))
    Y = np.asarray(non_blank_lines(labels), dtype=int)

    return X, Y
    

In [11]:
# Load the cleaned sentences (X) and their integer labels (Y) from the default paths.
X, Y = read_processed_data()

In [12]:
# The two counts must match: sentences and labels are read from separate files.
print('X : ', len(X))
print('Y : ', len(Y))

X :  150
Y :  150


In [13]:
# Shuffle sentences and labels together (fixed seed), then show the last pair as a spot check.
X, Y = shuffle(X, Y, random_state=0)
print(X[-1], Y[-1])

Sean Murray and Hello Games this silence is deafening 1


In [14]:
def find_max_len(X):
    """
    Report and return the word count of the longest sentence in X.

    Sentences are lower-cased and split on whitespace. The first sentence that reaches
    the maximum is the one that gets printed, together with its word list.

    Arguments:
    X -- iterable of sentences (strings)

    Returns:
    max_len -- number of words in the longest sentence (0 when no sentence has any word)
    """
    longest_sentence = None
    longest_tokens = []

    for sentence in X:
        tokens = sentence.lower().split()
        if len(tokens) <= len(longest_tokens):
            # not strictly longer than the current record holder
            continue
        longest_sentence, longest_tokens = sentence, tokens

    max_len = len(longest_tokens)

    print ('Max length is ', max_len)
    print(longest_sentence)
    print(longest_tokens)

    return max_len

In [15]:
# Longest sentence length over the whole data set; later cells use it as the fixed
# input width of the model and of the index matrices.
max_len = find_max_len(X)

Max length is  54
Look guys Anthem is in a bad place right now but remember to be civil towards the devs They are people too and probably under a bit of stress right now Voice your feedback but be considerate and most of all remember there is an actual living person on the other side of it
['look', 'guys', 'anthem', 'is', 'in', 'a', 'bad', 'place', 'right', 'now', 'but', 'remember', 'to', 'be', 'civil', 'towards', 'the', 'devs', 'they', 'are', 'people', 'too', 'and', 'probably', 'under', 'a', 'bit', 'of', 'stress', 'right', 'now', 'voice', 'your', 'feedback', 'but', 'be', 'considerate', 'and', 'most', 'of', 'all', 'remember', 'there', 'is', 'an', 'actual', 'living', 'person', 'on', 'the', 'other', 'side', 'of', 'it']


In [16]:
def create_train_dev_test(X, Y, split):
    """
    Shuffle the data set and cut it into train/test or train/dev/test partitions.

    Arguments:
    X -- array of examples
    Y -- array of labels aligned with X
    split -- two-element sequence (mode, sizes).
             mode 'tt': sizes is (train_size, test_size) and a train/test split is made.
             any other mode: sizes is (train_size, dev_size, test_size) and a
             train/dev/test split is made.
             The last size is unpacked but not used for slicing: the test partition is
             whatever remains after the preceding partitions.

    Returns:
    (X_train, Y_train, X_test, Y_test) for mode 'tt', otherwise
    (X_train, Y_train, X_dev, Y_dev, X_test, Y_test)
    """

    # No random_state is passed, so every call produces a different partition.
    X, Y = shuffle(X, Y)

    mode = split[0]

    if mode == 'tt':
        train_size, test_size = split[1]

        X_train = X[:train_size]
        Y_train = Y[:train_size]

        X_test = X[train_size:]
        Y_test = Y[train_size:]

        # Each label now reports the partition it names (they used to be swapped).
        print('Size of train set : ', len(X_train))
        print('Size of test set : ', len(X_test))

        result = (X_train, Y_train, X_test, Y_test)

    else:

        train_size, dev_size, test_size = split[1]
        dev_end = train_size + dev_size

        X_train = X[:train_size]
        Y_train = Y[:train_size]

        X_dev = X[train_size:dev_end]
        Y_dev = Y[train_size:dev_end]

        X_test = X[dev_end:]
        Y_test = Y[dev_end:]

        print('Size of train set : ', len(X_train))
        print('Size of dev set : ', len(X_dev))
        print('Size of test set : ', len(X_test))

        result = (X_train, Y_train, X_dev, Y_dev, X_test, Y_test)

    return result
    

In [17]:
# Three-way split: 100 training, 25 dev and the remaining examples for testing.
X_train, Y_train, X_dev, Y_dev, X_test, Y_test = create_train_dev_test(X, Y, ['tvt', (100, 25, 25)])

Size of test set :  100
Size of dev set :  25
Size of train set :  25


In [18]:
# Spot-check the last training sentence together with its label.
print(X_train[-1], Y_train[-1])

Anthem reviews are seemingly harsher than other games because it failed at a time when gamers are just fed up with being overpromised and under delivered 1


In [19]:
# Two-way split: 100 training examples, the rest for testing.
# NOTE(review): this reshuffles the data and overwrites X_train/Y_train/X_test/Y_test
# from the three-way split above; X_dev/Y_dev still come from the earlier shuffle and
# may therefore overlap with the new train and test sets.
X_train, Y_train, X_test, Y_test = create_train_dev_test(X, Y, ['tt', (100, 50)])

Size of test set :  100
Size of train set :  50


In [20]:
# Labels are a flat vector with one integer per training sentence.
print(Y_train.shape)

(100,)


In [21]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index, unkown_word_vector, unknown_word_index):
    """
    Creates a non-trainable Keras Embedding() layer initialised with pre-trained word vectors.

    Arguments:
    word_to_vec_map -- dictionary mapping words to their vector representation
    word_to_index -- dictionary mapping words to their row index in the embedding matrix
    unkown_word_vector -- vector stored in the row reserved for out-of-vocabulary words
    unknown_word_index -- row index of the out-of-vocabulary token; it is also the largest
                          index, so the matrix has unknown_word_index + 1 rows

    Returns:
    embedding_layer -- built Keras Embedding instance whose weights are the embedding matrix
    """

    vocab_len = unknown_word_index + 1                  # rows 0..unknown_word_index inclusive

    # Read the dimensionality from whichever vector comes first instead of relying on a
    # particular word (previously "cucumber") being present in the vocabulary.
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]

    # Rows that are never assigned below (row 0 when word indices start at 1) stay all-zero.
    emb_matrix = np.zeros((vocab_len, emb_dim))

    # Row "index" holds the vector of the word mapped to that index.
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    emb_matrix[unknown_word_index, :] = unkown_word_vector

    # trainable=False keeps the pre-trained vectors frozen during training.
    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)

    # The layer has to be built before its weights can be set.
    embedding_layer.build((None,))

    # Load the pre-trained matrix into the layer.
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

In [22]:
def CommentNet(input_shape, word_to_vec_map, word_to_index, unkown_word_vector, unknown_word_index):
    """
    Builds the CommentNet graph: pre-trained embeddings, two stacked bidirectional LSTM
    blocks with dropout, and a single sigmoid output unit.

    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in the vocabulary to its vector
    word_to_index -- dictionary mapping words to their indices in the vocabulary
    unkown_word_vector -- vector used for out-of-vocabulary words
    unknown_word_index -- index that stands for an out-of-vocabulary word

    Returns:
    model -- a Keras Model mapping a batch of index sequences to one sigmoid activation each
    """

    # The network consumes integer word indices.
    sentence_indices = Input(shape = input_shape, dtype = 'int32')

    # Turn the indices into word vectors with the pre-trained embedding layer.
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index, unkown_word_vector, unknown_word_index)
    activations = embedding_layer(sentence_indices)

    # Block 1: bidirectional LSTM (128 units, forward/backward outputs averaged) that
    # returns the whole sequence so it can feed another recurrent layer, then dropout.
    activations = Bidirectional(LSTM(128, return_sequences = True), merge_mode = 'ave')(activations)
    activations = Dropout(rate = 0.5)(activations)

    # Block 2: same layer type, but only the final state is returned; then dropout.
    activations = Bidirectional(LSTM(128), merge_mode = 'ave')(activations)
    activations = Dropout(rate = 0.5)(activations)

    # One sigmoid unit for the binary decision.
    probability = Dense(1, activation='sigmoid')(activations)

    return Model(inputs = sentence_indices, outputs = probability)

In [23]:
# Build the network for sequences of exactly max_len word indices.
model = CommentNet((max_len,), word_to_vec_map, word_to_index, unkown_word_vector, unknown_word_index)

W0805 11:15:11.368124 4559558080 deprecation_wrapper.py:119] From /Users/dulithadabare/anaconda3/envs/comment-net-nomkl/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0805 11:15:11.415276 4559558080 deprecation_wrapper.py:119] From /Users/dulithadabare/anaconda3/envs/comment-net-nomkl/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0805 11:15:12.491998 4559558080 deprecation_wrapper.py:119] From /Users/dulithadabare/anaconda3/envs/comment-net-nomkl/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0805 11:15:12.515288 4559558080 deprecation_wrapper.py:119] From /Users/dulithadabare/anaconda3/envs/comment-net-nomkl/lib/python3.7/site-packages/keras/backend/tensorfl

In [24]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 54)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 54, 50)            20000100  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 54, 128)           183296    
_________________________________________________________________
dropout_1 (Dropout)          (None, 54, 128)           0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 128)               263168    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total para

In [25]:
# Binary cross-entropy pairs with the single sigmoid output unit of the model.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

W0805 11:15:40.841098 4559558080 deprecation_wrapper.py:119] From /Users/dulithadabare/anaconda3/envs/comment-net-nomkl/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0805 11:15:40.873406 4559558080 deprecation.py:323] From /Users/dulithadabare/anaconda3/envs/comment-net-nomkl/lib/python3.7/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [26]:
# Encode the training sentences as zero-padded rows of vocabulary indices.
X_train_indices = sentences_to_indices(X_train, word_to_index, max_len, unknown_word_index)

In [27]:
# Show one training sentence next to its encoded form (zeros are padding).
train_index = 60
print(X_train[train_index])
print(X_train_indices[train_index])

You disgustingly have mentally broken your staff to the point of severe burn out and exhaustion
[394475. 125259. 174642. 241083.  84723. 394565. 341141. 360915. 357266.
 287479. 268046. 326150.  87324. 272930.  54718. 142571.      0.      0.
      0.      0.      0.      0.      0.      0.      0.      0.      0.
      0.      0.      0.      0.      0.      0.      0.      0.      0.
      0.      0.      0.      0.      0.      0.      0.      0.      0.
      0.      0.      0.      0.      0.      0.      0.      0.      0.]


In [28]:
# Train on the encoded training set.
# NOTE(review): no validation_data is passed, so nothing is monitored on held-out data
# during training.
model.fit(X_train_indices, Y_train, epochs = 50, batch_size = 32, shuffle=True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x13bdda6a0>

In [29]:
# Encode the test sentences the same way as the training sentences.
X_test_indices = sentences_to_indices(X_test, word_to_index, max_len, unknown_word_index)

In [30]:
# evaluate() returns the loss followed by the metrics given to compile() (accuracy here).
loss, acc = model.evaluate(X_test_indices, Y_test)
print()
print("Test accuracy = ", acc)


Test accuracy =  0.7600000047683716


In [31]:
# Score the test sentences and turn the sigmoid outputs into boolean predictions.
X_test_indices = sentences_to_indices(X_test, word_to_index, max_len, unknown_word_index)
pred = model.predict(X_test_indices)

threshold = 0.5
Y_pred = pred > threshold

# Count mismatches in one vectorised comparison. Y_pred carries a trailing axis of
# length 1 (one output unit) while Y_test is flat, so it is flattened first; comparing
# the raw arrays would broadcast to a square matrix instead of comparing element-wise.
error_count = int(np.sum(Y_pred.ravel() != Y_test))
print('Test count : ' + str(len(X_test)))
print('Error count : ' + str(error_count))

print('F1 Score for Trolling: ', f1_score(Y_test, Y_pred))

# Invert labels and predictions so that class 0 becomes the positive class and gets
# its own F1 score.
Y_test_inv = np.invert(Y_test > 0).reshape(-1,1)
Y_pred_inv = np.invert(Y_pred).reshape(-1,1)

print('Constructive feedback count : ', np.sum(Y_test_inv))
print('Predicted Constructive feedback count : ', np.sum(Y_pred_inv))
print('F1 Score Constructive Feedback: ', f1_score(Y_test_inv, Y_pred_inv))

Test count : 50
Error count : 12
F1 Score for Trolling:  0.8421052631578947
Constructive feedback count :  16
Predicted Constructive feedback count :  8
F1 Score Constructive Feedback:  0.5


In [42]:
# List every test sentence the model classified correctly, with its encoded form.
correct_count = 0
for i in range(len(X_test)):
    if Y_pred[i] == Y_test[i]:
        print(X_test_indices[i])
        correct_count = correct_count + 1
        print(X_test[i])
        print('Expected label:'+ str(Y_test[i]) + ' Prediction: ' + ' ' + str(Y_pred[i]))
print('Test count : ' + str(len(X_test)))
# This is the number of correct predictions; it used to be printed as "Error count".
print('Correct count : ' + str(correct_count))


[ 51582.  26559. 196161. 146344. 106329.  66985. 264550. 269953. 268046.
 357388. 323431. 360915.  71090. 270434. 185457. 357970. 358160. 192973.
 337302. 357226. 163745. 360915.  71090. 371152.  57891. 385218. 127491.
  76699.      0.      0.      0.      0.      0.      0.      0.      0.
      0.      0.      0.      0.      0.      0.      0.      0.      0.
      0.      0.      0.      0.      0.      0.      0.      0.      0.]
All 4 Javelins feel completely balanced Not one of them seems to be OP I think this is something thats going to be under appreciated Well done Bioware
Expected label:0 Prediction:  [False]
[386315. 173063. 388711. 357761. 357965. 200035.  43010. 147468. 142285.
 268046. 386307. 386112. 323432. 292067. 357226.  44764. 154323. 357266.
 157049. 384374. 164934.      0.      0.      0.      0.      0.      0.
      0.      0.      0.      0.      0.      0.      0.      0.      0.
      0.      0.      0.      0.      0.      0.      0.      0.      0.
      0

In [32]:
# Serialize the architecture to JSON (the leftover debug print was removed).
model_json = model.to_json()
with open("../models/model.json", "w") as json_file:
    json_file.write(model_json)
# Serialize the weights to HDF5; architecture and weights are stored separately.
model.save_weights("../models/model.h5")
print("Saved model to disk")

works
Saved model to disk


In [34]:
# Convert the trained Keras model with coremltools' Keras converter.
coreml_model = coremltools.converters.keras.convert(model)

0 : input_1, <keras.engine.input_layer.InputLayer object at 0x13943ee48>
1 : embedding_1, <keras.layers.embeddings.Embedding object at 0x13946e080>
2 : bidirectional_1, <keras.layers.wrappers.Bidirectional object at 0x13946e358>
3 : bidirectional_2, <keras.layers.wrappers.Bidirectional object at 0x139d6b860>
4 : dense_1, <keras.layers.core.Dense object at 0x13946ecc0>
5 : dense_1__activation__, <keras.layers.core.Activation object at 0x13a464978>


In [35]:
# Write the converted model to disk next to the Keras artifacts.
coreml_model.save('../models/CommentNetV2.mlmodel')

In [36]:
# Print the converted model's description to inspect its declared inputs.
print(coreml_model)

input {
  name: "input1"
  type {
    multiArrayType {
      shape: 1
      dataType: DOUBLE
    }
  }
}
input {
  name: "bidirectional_1_h_in"
  type {
    multiArrayType {
      shape: 128
      dataType: DOUBLE
    }
    isOptional: true
  }
}
input {
  name: "bidirectional_1_c_in"
  type {
    multiArrayType {
      shape: 128
      dataType: DOUBLE
    }
    isOptional: true
  }
}
input {
  name: "bidirectional_1_h_in_rev"
  type {
    multiArrayType {
      shape: 128
      dataType: DOUBLE
    }
    isOptional: true
  }
}
input {
  name: "bidirectional_1_c_in_rev"
  type {
    multiArrayType {
      shape: 128
      dataType: DOUBLE
    }
    isOptional: true
  }
}
input {
  name: "bidirectional_2_h_in"
  type {
    multiArrayType {
      shape: 128
      dataType: DOUBLE
    }
    isOptional: true
  }
}
input {
  name: "bidirectional_2_c_in"
  type {
    multiArrayType {
      shape: 128
      dataType: DOUBLE
    }
    isOptional: true
  }
}
input {
  name: "bidirectional_2_h_