In [2]:
import json
import numpy as np
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
import os
import tensorflow as tf
from tensorflow import keras


os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [3]:
def read_glove_vecs(glove_file):
    with open(glove_file, 'r') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
        
        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map

In [4]:
def convert_to_one_hot(Y, C):
    Y = np.eye(C)[Y.reshape(-1)]
    return Y

In [5]:
def sentences_to_indices(X, word_to_index, max_len):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
    The output shape should be such that it can be given to `Embedding()` (described in Figure 4). 
    
    Arguments:
    X -- array of sentences (strings), of shape (m, 1)
    word_to_index -- a dictionary containing the each word mapped to its index
    max_len -- maximum number of words in a sentence. You can assume every sentence in X is no longer than this. 
    
    Returns:
    X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)
    """
    
    m = X.shape[0]                                   # number of training examples
    
    ### START CODE HERE ###
    # Initialize X_indices as a numpy matrix of zeros and the correct shape (≈ 1 line)
    X_indices = np.zeros((m,max_len))
    
    for i in range(m):                               # loop over training examples
        
        # Convert the ith training sentence in lower case and split is into words. You should get a list of words.
        sentence_words =X[i].lower().split()
        
        # Initialize j to 0
        j = 0
        
        # Loop over the words of sentence_words
        for w in sentence_words:
            # Set the (i,j)th entry of X_indices to the index of the correct word.
            if w in word_to_index:
                X_indices[i, j] = word_to_index[w]
            else:
                X_indices[i, j] = word_to_index["unk"]
            # Increment j to j + 1
            j = j + 1
            
    ### END CODE HERE ###
    
    return X_indices

In [6]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a Keras Embedding() layer and loads in pre-trained GloVe 50-dimensional vectors.
    
    Arguments:
    word_to_vec_map -- dictionary mapping words to their GloVe vector representation.
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

    Returns:
    embedding_layer -- pretrained layer Keras instance
    """
    
    vocab_len = len(word_to_index) + 1                  # adding 1 to fit Keras embedding (requirement)
    emb_dim = word_to_vec_map["cucumber"].shape[0]      # define dimensionality of your GloVe word vectors (= 50)
    
    ### START CODE HERE ###
    # Initialize the embedding matrix as a numpy array of zeros of shape (vocab_len, dimensions of word vectors = emb_dim)
    emb_matrix = np.zeros((vocab_len,emb_dim))
    
    # Set each row "index" of the embedding matrix to be the word vector representation of the "index"th word of the vocabulary
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # Define Keras embedding layer with the correct output/input sizes, make it non-trainable. Use Embedding(...). Make sure to set trainable=False. 
    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)
    ### END CODE HERE ###

    # Build the embedding layer, it is required before setting the weights of the embedding layer. Do not modify the "None".
    embedding_layer.build((None,))
    
    # Set the weights of the embedding layer to the embedding matrix. Your layer is now pretrained.
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer

In [7]:
    def TextClassifier(input_shape, word_to_vec_map, word_to_index):
        """
        Function creating the Emojify-v2 model's graph.

        Arguments:
        input_shape -- shape of the input, usually (max_len,)
        word_to_vec_map -- dictionary mapping every word in a vocabulary into its 50-dimensional vector representation
        word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

        Returns:
        model -- a model instance in Keras
        """

        ### START CODE HERE ###
        # Define sentence_indices as the input of the graph, it should be of shape input_shape and dtype 'int32' (as it contains indices).
        sentence_indices = Input(input_shape, dtype='int32')

        # Create the embedding layer pretrained with GloVe Vectors (≈1 line)
        embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)

        # Propagate sentence_indices through your embedding layer, you get back the embeddings
        embeddings = embedding_layer(sentence_indices)   

        # Propagate the embeddings through an LSTM layer with 128-dimensional hidden state
        # Be careful, the returned output should be a batch of sequences.
        X = LSTM(128, return_sequences=True)(embeddings)
        # Add dropout with a probability of 0.5
        X = Dropout(0.5)(X)
        # Propagate X trough another LSTM layer with 128-dimensional hidden state
        # Be careful, the returned output should be a single hidden state, not a batch of sequences.
        X = LSTM(128, return_sequences=False)(X)
        # Add dropout with a probability of 0.5
        X = Dropout(0.5)(X)
        # Propagate X through a Dense layer with softmax activation to get back a batch of 5-dimensional vectors.
        X = Dense(2)(X)
        # Add a softmax activation
        X = Activation('softmax')(X)

        # Create Model instance which converts sentence_indices into X.
        model = Model(inputs=sentence_indices, outputs=X)

        ### END CODE HERE ###

        return model

In [10]:
def load_training_data(filename = 'data/SARCASM.json',maxLength = 200):
    X = []
    Y = []
    fhand = open(filename,encoding='utf8')
    for line in fhand:
        data = json.loads(line)
        fullTweet_ = data['context'] +  list(data['response'].split(" "))
        #print(len(fullTweet_))
        #print(type(fullTweet_[-maxLength:]))
        #fullTweet = ''.join(data['context'])+  data['response']
        #if len(fullTweet) > maxLength:
        #    fullTweet = fullTweet[-maxLength:]
        #fullTweet = fullTweet.replace("@USER","").replace("isn't","")
        #X.append(fullTweet)
        x_ = ''.join(fullTweet_[-maxLength:])
        #if len(x_) > maxLength:
        #    print("Len x_ :",len(x_))
        X.append(x_)
        if data['label'] == 'SARCASM':
            Y.append(1)
        else:
            Y.append(0)
            
    maxLen = len(max(X, key=len).split())
    minLen = len(min(X,key=len).split())
    #print(minLen)
    
    seed = 1337
    rng = np.random.RandomState(seed)
    rng.shuffle(X)
    rng = np.random.RandomState(seed)
    rng.shuffle(Y)
    
    validation_split = 0.2
    num_validation_samples = int(validation_split * len(X))
    train_samples = X[:-num_validation_samples]
    val_samples = X[-num_validation_samples:]
    train_labels = Y[:-num_validation_samples]
    val_labels = Y[-num_validation_samples:]
    
    #print(len(train_samples))
    #print(len(val_samples))
    #print(len(train_labels))
    #print(len(val_labels))
    
    #X_train = np.asarray(X[0:3000])
    #Y_train = np.asarray(Y[0:3000], dtype=int)
    #X_test = np.asarray(X[3000:5000])
    #Y_test = np.asarray(Y[3000:5000], dtype=int)
    X_train = np.asarray(train_samples)
    Y_train = np.asarray(train_labels, dtype=int)
    X_test = np.asarray(val_samples)
    Y_test = np.asarray(val_labels, dtype=int)
    return X_train,Y_train,X_test,Y_test,maxLen

In [11]:
X_train,Y_train,X_test,Y_test,maxLen = load_training_data('data/train.jsonl',35)
#X_train,Y_train,X_test,Y_test,maxLen = load_training_data()
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('data/glove.6B.50d.txt')
print(maxLen)

969


In [12]:
#print(maxLen)
#maxLen=1213
X_train_indices = sentences_to_indices(X_train, word_to_index, maxLen)
X_test_indices = sentences_to_indices(X_test, word_to_index, maxLen)

In [13]:
Y_train_oh = convert_to_one_hot(Y_train, C = 2)
Y_test_oh = convert_to_one_hot(Y_test, C = 2)

In [14]:
print(maxLen)
print(len(X_train))
print(len(X_test))
print(len(Y_train))          
print(len(Y_test))
Y_train_oh

969
4000
1000
4000
1000


array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [0., 1.],
       [1., 0.],
       [1., 0.]])

In [15]:
model = TextClassifier((maxLen,), word_to_vec_map, word_to_index)
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 969)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 969, 50)           20000050  
_________________________________________________________________
lstm_1 (LSTM)                (None, 969, 128)          91648     
_________________________________________________________________
dropout_1 (Dropout)          (None, 969, 128)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 258 

In [16]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model.fit(X_train_indices, Y_train_oh, epochs = 7, batch_size = 32, shuffle=True)


Epoch 1/7
Epoch 2/7

In [None]:
model.save('FirstModel_Oct19')

In [None]:
modeltrained = keras.models.load_model('FirstModel')

In [None]:
score,acc = modeltrained.evaluate(X_test_indices, Y_test, verbose = 2, batch_size = 16)

In [None]:
print("Test accuracy = ", acc)

In [None]:
score,acc = modeltrained.evaluate(X_train_indices, Y_train_oh)

In [None]:
score,acc = modeltrained.evaluate(X_test_indices, Y_test_oh)

In [None]:
X_test[2]

In [None]:
np.count_nonzero(Y_test == 1)