In [2]:
import csv
import numpy as np
from sklearn.utils import shuffle
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform

In [3]:
def process_data(file_list_with_labels, output='../data/data_set.txt', output_label = '../data/labels.txt'):
    phrase = []
    emoji = []

    with open (output, 'w') as output_file, open(output_label, 'w') as label_file:
        
        for filename, label in file_list_with_labels:
            print('Processing file: ', filename)
            with open (filename) as f:
                  for line in f:
                    line = line.strip()
                    if len(line)>0:
                        line = line.replace(",","")
                        output_file.write(line + '\r\n')
                        label_file.write(label + '\r\n')
                        phrase.append(0)
    

In [4]:
def read_glove_vecs(glove_file):
    with open(glove_file, 'r') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
        
        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map

In [5]:
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('../data/glove.6B/glove.6B.50d.txt')

In [6]:
word = "guys"
index = 289846
print("the index of", word, "in the vocabulary is", word_to_index[word])
print("the", str(index) + "th word in the vocabulary is", index_to_word[index])
print(len(word_to_index))

the index of guys in the vocabulary is 169754
the 289846th word in the vocabulary is potatos
400000


In [7]:
def sentences_to_indices(X, word_to_index, max_len):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
    The output shape should be such that it can be given to `Embedding()` (described in Figure 4). 
    
    Arguments:
    X -- array of sentences (strings), of shape (m, 1)
    word_to_index -- a dictionary containing the each word mapped to its index
    max_len -- maximum number of words in a sentence. You can assume every sentence in X is no longer than this. 
    
    Returns:
    X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)
    """
    
    m = X.shape[0]                                   # number of training examples
    
    ### START CODE HERE ###
    # Initialize X_indices as a numpy matrix of zeros and the correct shape (≈ 1 line)
    X_indices = np.zeros((m, max_len))
    
    for i in range(m):                               # loop over training examples
        
        # Convert the ith training sentence in lower case and split is into words. You should get a list of words.
        sentence_words =X[i].lower().split()
        
        # Initialize j to 0
        j = 0
        
        # Loop over the words of sentence_words
        for w in sentence_words:
            # Set the (i,j)th entry of X_indices to the index of the correct word.
            if w in word_to_index:
                X_indices[i, j] = word_to_index[w]
                # Increment j to j + 1
                j = j + 1
            
    ### END CODE HERE ###
    
    return X_indices

In [8]:
file_list_with_labels = []
file_list_with_labels.append(('../data/troll.txt', '1'))
file_list_with_labels.append(('../data/constructive.txt', '0'))
file_list_with_labels.append(('../data/positive.txt', '0'))
process_data(file_list_with_labels)

Processing file:  ../data/troll.txt
Processing file:  ../data/constructive.txt
Processing file:  ../data/positive.txt


In [9]:
def read_processed_data(data_set = '../data/data_set.txt', labels = '../data/labels.txt'):
    
    X = []
    Y = []
    
    with open(data_set) as data_file:
        for line in data_file:
            line = line.strip()
            if len(line) > 0:
                X.append(line)
    
    with open(labels) as label_file:
        for line in label_file:
            line = line.strip()
            if len(line) > 0:
                Y.append(line)
                
    X = np.asarray(X)
    Y = np.asarray(Y, dtype=int)
    
    return X, Y
    

In [10]:
X, Y = read_processed_data()

In [11]:
print('X : ', len(X))
print('Y : ', len(Y))

X :  150
Y :  150


In [12]:
X, Y = shuffle(X, Y, random_state=0)
print(X[len(X) - 1], Y[len(Y) - 1])

Sean Murray and Hello Games this silence is deafening. 1


In [13]:
def find_max_len(X):
    
    max_len = 0
    max_line = None
    max_array = []
    
    
    for line in X:
        sentence_words =line.lower().split()
        if len(sentence_words) > max_len:
            max_len = len(sentence_words)
            max_line = line 
            max_array = sentence_words
    
    print ('Max length is ', max_len)
    print(max_line)
    print(max_array)
    
    return max_len

In [14]:
max_len = find_max_len(X)

Max length is  54
Look guys. Anthem is in a bad place right now but remember to be civil towards the devs. They are people too and probably under a bit of stress right now. Voice your feedback but be considerate and most of all remember there is an actual living person on the other side of it.
['look', 'guys.', 'anthem', 'is', 'in', 'a', 'bad', 'place', 'right', 'now', 'but', 'remember', 'to', 'be', 'civil', 'towards', 'the', 'devs.', 'they', 'are', 'people', 'too', 'and', 'probably', 'under', 'a', 'bit', 'of', 'stress', 'right', 'now.', 'voice', 'your', 'feedback', 'but', 'be', 'considerate', 'and', 'most', 'of', 'all', 'remember', 'there', 'is', 'an', 'actual', 'living', 'person', 'on', 'the', 'other', 'side', 'of', 'it.']


In [15]:
def create_train_dev_test(X, Y, split):
    
    X, Y = shuffle(X, Y)
    
    mode = split[0]
    
    if mode == 'tt':
        train_size, test_size = split[1]
        
        X_train = X[:train_size]
        Y_train = Y[:train_size]
    
        X_test = X[train_size:]
        Y_test = Y[train_size:]
        
        print('Size of test set : ', len(X_train))
        print('Size of train set : ', len(X_test))
    
        result = (X_train, Y_train, X_test, Y_test)
        
    else:
        
        train_size, dev_size, test_size = split[1]
        
        X_train = X[:train_size]
        Y_train = Y[:train_size]
    
        X_dev = X[train_size:train_size + dev_size]
        Y_dev = Y[train_size:train_size + dev_size]
    
        X_test = X[train_size + dev_size:]
        Y_test = Y[train_size + dev_size:]
        
        print('Size of test set : ', len(X_train))
        print('Size of dev set : ', len(X_dev))
        print('Size of train set : ', len(X_test))
    
        result = (X_train, Y_train, X_dev, Y_dev, X_test, Y_test)

    
    return result
    

In [16]:
X_train, Y_train, X_dev, Y_dev, X_test, Y_test = create_train_dev_test(X, Y, ['tvt', (100, 25, 25)])

Size of test set :  100
Size of dev set :  25
Size of train set :  25


In [17]:
print(X_train[len(X_train) - 1], Y_train[len(Y_train) - 1])

To prevent EA from astroturfing/planting questions in the upcoming AMA the mods of this subreddit should create a thread for what questions we want answered post that list when the AMA goes up then delete any other comment thats not it forcing EA to either ditch the AMA or answer the questions. 1


In [18]:
X_train, Y_train, X_test, Y_test = create_train_dev_test(X, Y, ['tt', (100, 50)])

Size of test set :  100
Size of train set :  50


In [19]:
print(Y_train.shape)

(100,)


In [20]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a Keras Embedding() layer and loads in pre-trained GloVe 50-dimensional vectors.
    
    Arguments:
    word_to_vec_map -- dictionary mapping words to their GloVe vector representation.
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

    Returns:
    embedding_layer -- pretrained layer Keras instance
    """
    
    vocab_len = len(word_to_index) + 1                  # adding 1 to fit Keras embedding (requirement)
    emb_dim = word_to_vec_map["cucumber"].shape[0]      # define dimensionality of your GloVe word vectors (= 50)
    
    ### START CODE HERE ###
    # Initialize the embedding matrix as a numpy array of zeros of shape (vocab_len, dimensions of word vectors = emb_dim)
    emb_matrix = np.zeros((vocab_len, emb_dim))
    
    # Set each row "index" of the embedding matrix to be the word vector representation of the "index"th word of the vocabulary
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # Define Keras embedding layer with the correct output/input sizes, make it non-trainable. Use Embedding(...). Make sure to set trainable=False. 
    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)
    ### END CODE HERE ###

    # Build the embedding layer, it is required before setting the weights of the embedding layer. Do not modify the "None".
    embedding_layer.build((None,))
    
    # Set the weights of the embedding layer to the embedding matrix. Your layer is now pretrained.
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer

In [21]:
def CommentNet(input_shape, word_to_vec_map, word_to_index):
    """
    Function creating the Emojify-v2 model's graph.
    
    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its 50-dimensional vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

    Returns:
    model -- a model instance in Keras
    """
    
    ### START CODE HERE ###
    # Define sentence_indices as the input of the graph, it should be of shape input_shape and dtype 'int32' (as it contains indices).
    sentence_indices = Input(shape = input_shape, dtype = 'int32')
    
    # Create the embedding layer pretrained with GloVe Vectors (≈1 line)
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    
    # Propagate sentence_indices through your embedding layer, you get back the embeddings
    embeddings = embedding_layer(sentence_indices)   
    
    # Propagate the embeddings through an LSTM layer with 128-dimensional hidden state
    # Be careful, the returned output should be a batch of sequences.
    X = LSTM(128, return_sequences = True)(embeddings)
    # Add dropout with a probability of 0.5
    X = Dropout(rate = 0.5)(X)
    # Propagate X trough another LSTM layer with 128-dimensional hidden state
    # Be careful, the returned output should be a single hidden state, not a batch of sequences.
    X = LSTM(128)(X)
    # Add dropout with a probability of 0.5
    X = Dropout(rate = 0.5)(X)
    # Propagate X through a Dense layer with softmax activation to get back a batch of 5-dimensional vectors.
    X = Dense(1, activation='sigmoid')(X)
    
    # Create Model instance which converts sentence_indices into X.
    model = Model(inputs = sentence_indices, outputs = X)
    
    ### END CODE HERE ###
    
    return model

In [22]:
model = CommentNet((max_len,), word_to_vec_map, word_to_index)

W0727 16:44:01.449267 4445447616 deprecation_wrapper.py:119] From /Users/dulithadabare/anaconda3/envs/comment-net-nomkl/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0727 16:44:01.487322 4445447616 deprecation_wrapper.py:119] From /Users/dulithadabare/anaconda3/envs/comment-net-nomkl/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0727 16:44:02.333242 4445447616 deprecation_wrapper.py:119] From /Users/dulithadabare/anaconda3/envs/comment-net-nomkl/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0727 16:44:02.353927 4445447616 deprecation_wrapper.py:119] From /Users/dulithadabare/anaconda3/envs/comment-net-nomkl/lib/python3.7/site-packages/keras/backend/tensorfl

In [23]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 54)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 54, 50)            20000050  
_________________________________________________________________
lstm_1 (LSTM)                (None, 54, 128)           91648     
_________________________________________________________________
dropout_1 (Dropout)          (None, 54, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total para

In [24]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

W0727 16:44:03.359050 4445447616 deprecation_wrapper.py:119] From /Users/dulithadabare/anaconda3/envs/comment-net-nomkl/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0727 16:44:03.385862 4445447616 deprecation.py:323] From /Users/dulithadabare/anaconda3/envs/comment-net-nomkl/lib/python3.7/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [25]:
X_train_indices = sentences_to_indices(X_train, word_to_index, max_len)

In [34]:
model.fit(X_train_indices, Y_train, epochs = 50, batch_size = 32, shuffle=True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x13070f4e0>

In [39]:
X_test_indices = sentences_to_indices(X_test, word_to_index, max_len = max_len)

In [43]:
loss, acc = model.evaluate(X_test_indices, Y_test)
print()
print("Test accuracy = ", acc)


Test accuracy =  0.6999999952316284
[0 1 0 1 0 1 0 0 1 1 0 0 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 0 1 0 0 0 1]


In [41]:
X_test_indices = sentences_to_indices(X_test, word_to_index, max_len)
pred = model.predict(X_test_indices)
error_count = 0;
for i in range(len(X_test)):
    x = X_test_indices
    num = np.argmax(pred[i])
    if(num != Y_test[i]):
        error_count = error_count + 1
        print('Expected label:'+ str(Y_test[i]) + ' prediction: '+ X_test[i] + ' ' + str(num))
print('Test count : ' + str(len(X_test)))
print('Error count : ' + str(error_count))

Expected label:1 prediction: Just reverse the last update and let us at least enjoy the game pre patch 11. Then come back again after testing 0
Expected label:1 prediction: I’m sorry but this game needs a solo mode. 0
Expected label:1 prediction: You manifested a game littered with game-breaking errors 0
Expected label:1 prediction: Bethesda should extend the beta time tonight 0
Expected label:1 prediction: The real reason for LOOT BALANCE and STORE issues in Anthem 0
Expected label:1 prediction: Let's be perfectly clear here EA is 100% at fault for the state of the game not gamers. 0
Expected label:1 prediction: Enough with the false advertising Bethesda!! 0
Expected label:1 prediction: Bethesda quit testing the water with the shitty little kits to see how far you can push gameplay influencing shop items. 0
Expected label:1 prediction: Time for a Change of Leadership in Fallout 76 Development 0
Expected label:1 prediction: The "Biggest disappointment" award on Steam 0
Expected label:1

In [42]:
correct_count = 0;
for i in range(len(X_test)):
    x = X_test_indices
    num = np.argmax(pred[i])
    if(num == Y_test[i]):
        error_count = error_count + 1
        print('Expected label:'+ str(Y_test[i]) + ' prediction: '+ X_test[i] + ' ' + str(num))
print('Test count : ' + str(len(X_test)))
print('Error count : ' + str(correct_count))

Expected label:0 prediction: BioWare - Can Colossus PLEASE get Shield Customization as our 4th armor Piece! 0
Expected label:0 prediction: [PSA] The Level 1 Defender Rifle is the best weapon in the game(also damage numbers are pointless and don't mean anything) 0
Expected label:0 prediction: Remember there are real people who poured their heart and soul into this game. 0
Expected label:0 prediction: Bioware has acknowledged that Ranger feels underperforming: here's why it's a mess 0
Expected label:0 prediction: Bethesda... Bethesda never changes. 0
Expected label:0 prediction: Here's why I think people aren't getting legendaries and a potential bug 0
Expected label:0 prediction: More Objective Radar variations based on the original design 0
Expected label:0 prediction: I am a day 1 player of NMS and I wanted to show my love for the game developers and the community by making a piece of art for NEXT. I hope you enjoy! 0
Expected label:0 prediction: All 4 Javelins feel completely balance