In [1]:
import re
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

Using TensorFlow backend.


In [2]:
# Define documents
docs = ['Well done!', 'Good work', 'Great effort', 'nice work', 'Excellent!',
        'Weak', 'Poor effort!', 'not good', 'poor work', 'Could have done better.']

# Define class labels
labels = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]

In [3]:
own_embedding_vocab_size = 10
encoded_docs_oe = [one_hot(d, own_embedding_vocab_size) for d in docs]
print(encoded_docs_oe)

[[3, 5], [9, 7], [4, 4], [1, 7], [6], [6], [8, 4], [8, 9], [8, 7], [1, 1, 5, 9]]


In [5]:
maxlen = 5
padded_docs_oe = pad_sequences(encoded_docs_oe, maxlen=maxlen, padding='post')
print(padded_docs_oe)

[[3 5 0 0 0]
 [9 7 0 0 0]
 [4 4 0 0 0]
 [1 7 0 0 0]
 [6 0 0 0 0]
 [6 0 0 0 0]
 [8 4 0 0 0]
 [8 9 0 0 0]
 [8 7 0 0 0]
 [1 1 5 9 0]]


In [7]:
model = Sequential()
model.add(Embedding(input_dim=own_embedding_vocab_size,
                    output_dim=32, 
                    input_length=maxlen))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

In [8]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])  # Compile the model
print(model.summary())  # Summarize the model
model.fit(padded_docs_oe, labels, epochs=50, verbose=0)  # Fit the model
loss, accuracy = model.evaluate(padded_docs_oe, labels, verbose=0)  # Evaluate the model
print('Accuracy: %0.3f' % accuracy)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 5, 32)             320       
_________________________________________________________________
flatten_1 (Flatten)          (None, 160)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 161       
Total params: 481
Trainable params: 481
Non-trainable params: 0
_________________________________________________________________
None
Accuracy: 0.800


# aggressive dataset

In [11]:
import pandas as pd
train_df = pd.read_csv('../aggression-identification/agr_en_train.csv', header=None)
test_df = pd.read_csv('../aggression-identification/agr_en_dev.csv', header=None)

In [66]:
x_train_msg = list(train_df[1])
y_train_msg = list(train_df[2])

In [80]:
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer()
y_train_msg_encoded = encoder.fit_transform(y_train_msg)
print(y_train_msg_encoded)

[[0 0 1]
 [0 1 0]
 [0 0 1]
 ..., 
 [0 0 1]
 [0 0 1]
 [0 1 0]]


In [74]:
own_embedding_vocab_size = 21766
encoded_docs_oe = [one_hot(d, own_embedding_vocab_size) for d in x_train_msg]

In [75]:
maxlen = 1126
padded_docs_oe = pad_sequences(encoded_docs_oe, maxlen=maxlen, padding='post')
print(padded_docs_oe)

[[17649 14312 19274 ...,     0     0     0]
 [21709 17180  1802 ...,     0     0     0]
 [15389  2027 12448 ...,     0     0     0]
 ..., 
 [16671 18608     0 ...,     0     0     0]
 [ 6721  1252  2415 ...,     0     0     0]
 [12756 13013   850 ...,     0     0     0]]


In [82]:
model = Sequential()
model.add(Embedding(input_dim=own_embedding_vocab_size,
                    output_dim=32, 
                    input_length=maxlen))
model.add(Flatten())
model.add(Dense(3, activation='sigmoid'))

In [85]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])  # Compile the model
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 1126, 32)          696512    
_________________________________________________________________
flatten_3 (Flatten)          (None, 36032)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 108099    
Total params: 804,611
Trainable params: 804,611
Non-trainable params: 0
_________________________________________________________________


In [90]:
history = model.fit(padded_docs_oe, y_train_msg_encoded, epochs=10, verbose=1)  # Fit the model

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [91]:
loss, accuracy = model.evaluate(padded_docs_oe, y_train_msg_encoded, verbose=1)    # Evaluate the model
print('Accuracy: %0.3f' % accuracy)



### with GloVe Embeddings

In [94]:
def load_glove_embeddings(fp, embedding_dim, include_empty_char=True):
    """
    Loads pre-trained word embeddings (GloVe embeddings)
        Inputs: - fp: filepath of pre-trained glove embeddings
                - embedding_dim: dimension of each vector embedding
                - generate_matrix: whether to generate an embedding matrix
        Outputs:
                - word2coefs: Dictionary. Word to its corresponding coefficients
                - word2index: Dictionary. Word to word-index
                - embedding_matrix: Embedding matrix for Keras Embedding layer
    """
    
    # First, build the "word2coefs" and "word2index"
    word2coefs = {} # word to its corresponding coefficients
    word2index = {} # word to word-index
    with open(fp) as f:
        for idx, line in enumerate(f):
            try:
                data = [x.strip().lower() for x in line.split()]
                word = data[0]
                coefs = np.asarray(data[1:embedding_dim+1], dtype='float32')
                word2coefs[word] = coefs
                if word not in word2index:
                    word2index[word] = len(word2index)
            except Exception as e:
                print('Exception occurred in `load_glove_embeddings`:', e)
                continue
        # End of for loop.
    # End of with open
    
    if include_empty_char:
        word2index[''] = len(word2index)
    
    # Second, build the "embedding_matrix"
    
    # Words not found in embedding index will be all-zeros. Hence, the "+1".
    vocab_size = len(word2coefs)+1 if include_empty_char else len(word2coefs)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, idx in word2index.items():
        embedding_vec = word2coefs.get(word)
        if embedding_vec is not None and embedding_vec.shape[0]==embedding_dim:
            embedding_matrix[idx] = np.asarray(embedding_vec)
    
    # return word2coefs, word2index, embedding_matrix
    return word2index, np.asarray(embedding_matrix)

In [97]:
GLOVE_DIR = "/Users/dsbatista/resources/glove.6B/"
file = 'glove.6B.50d.txt'

word2index, embedding_matrix = load_glove_embeddings(GLOVE_DIR+file, embedding_dim=50)

In [None]:
def custom_tokenize(docs):
    output_matrix = []
    for d in docs:
        indices = []
        for w in d.split():
            word = re.sub(r'[^\w\s]','',w).lower()
            if word in word2index:
                w_index = word2index[word]
            else:
                w_index = 400000
            indices.append(w_index)
            output_matrix.append(indices)
    return output_matrix
    
# Encode docs with our special "custom_tokenize" function
encoded_docs_ge = custom_tokenize(x_train_msg)
print(encoded_docs_ge)