In [1]:
import numpy as np
import matplotlib.pyplot as plt
import emoji
import pandas as pd

In [2]:
import csv
def read_csv(filename = 'data/emojify_data.csv'):
    phrase = []
    emoji = []

    with open (filename) as csvDataFile:
        csvReader = csv.reader(csvDataFile)

        for row in csvReader:
            phrase.append(row[0])
            emoji.append(row[1])

    X = np.asarray(phrase)
    Y = np.asarray(emoji, dtype=int)

    return X, Y

In [3]:
X_train,Y_train=read_csv('data/train_emoji.csv')
X_test,Y_test=read_csv('data/tesss.csv')

In [4]:
len(X_train), len(Y_train)

(132, 132)

In [5]:
len(X_test), len(Y_test)

(56, 56)

In [6]:
maxLen=len(max(X_train, key=len).split())

In [7]:
maxLen

10

In [8]:
emoji_dictionary = {"0": "\u2764\uFE0F",    # :heart: prints a black instead of red heart depending on the font
                    "1": ":baseball:",
                    "2": ":smile:",
                    "3": ":disappointed:",
                    "4": ":fork_and_knife:"}

def label_to_emoji(label):
    return emoji.emojize(emoji_dictionary[str(label)], use_aliases=True)


In [14]:
index=3
print(X_train[index], end=" ")
print(label_to_emoji(Y_train[index]))

Miss you so much ❤️


In [15]:
index=100
print(X_train[index], end=" ")
print(label_to_emoji(Y_train[index]))

I love my dad ❤️


In [16]:
index=101
print(X_train[index], end=" ")
print(label_to_emoji(Y_train[index]))

this guy was such a joke 😄


In [17]:
def convert_to_one_hot(X,C):
    return np.eye(C)[X.reshape(-1)]

In [18]:
Y_oh_train=convert_to_one_hot(Y_train,C=5)
Y_oh_test=convert_to_one_hot(Y_test,C=5)

In [19]:
Y_oh_train.shape, Y_oh_test.shape

((132, 5), (56, 5))

In [20]:
def read_glove_vecs(glove_file):
    with open(glove_file, 'r') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
        
        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map

# words=List of word in the vocabulary
# words_to_index, index_to_words= dictionary for index and word
# word_to_vec_map= vector representation for the word

In [21]:
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('data/glove.6B.50d.txt')

In [22]:
word='cucumber'
index=285565
print('The index of the word "{}" is {}.'.format(word,word_to_index[word]))
print('The word having index {} is {}'.format(index,index_to_word[index]))

The index of the word "cucumber" is 113317.
The word having index 285565 is pisz


In [24]:
# Model=> All words of the sentence are converted into vectors and then average is taken. Softmax is applied on 
# that average and Class is classified.

def average_of_sentence(sentence,word_to_vec_map):
    
    words=list(sentence.split())
    words=[x.lower() for x in words]
    
    word1=list(word_to_vec_map.keys())[0]
    avg=np.zeros(word_to_vec_map[word1].shape)
    
    for w in words:
        avg+=word_to_vec_map[w]
    avg=avg/len(words)
    
    return avg

In [25]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

In [26]:
def predict(X, Y, W, b, word_to_vec_map):
    """
    Given X (sentences) and Y (emoji indices), predict emojis and compute the accuracy of your model over the given set.
    
    Arguments:
    X -- input data containing sentences, numpy array of shape (m, None)
    Y -- labels, containing index of the label emoji, numpy array of shape (m, 1)
    
    Returns:
    pred -- numpy array of shape (m, 1) with your predictions
    """
    m = X.shape[0]
    pred = np.zeros((m, 1))
    
    for j in range(m):                       # Loop over training examples
        
        # Split jth test example (sentence) into list of lower case words
        words = X[j].lower().split()
        
        # Average words' vectors
        avg = np.zeros((50,))
        for w in words:
            avg += word_to_vec_map[w]
        avg = avg/len(words)

        # Forward propagation
        Z = np.dot(W, avg) + b
        A = softmax(Z)
        pred[j] = np.argmax(A)
        
    print("Accuracy: "  + str(np.mean((pred[:] == Y.reshape(Y.shape[0],1)[:]))))
    
    return pred

In [31]:
# z=W*avg+b
# a=softmax(z)
# loss=sum(Y_oh*log(a))

def model(X, Y, word_to_vec_map, learning_rate = 0.01, num_iterations = 400):
    np.random.seed(1)
    m=Y.shape[0]     # Number of training examples
    n_y=5            # Number of classes
    n_h=50           # Dimensions of Glove vectors
    
    W=np.random.randn(n_y,n_h)/np.sqrt(n_h)
    b=np.zeros(n_y,)
    
    Y_oh=convert_to_one_hot(Y,C=5)
    
    # Optimization function
    for t in range(num_iterations):
        cost=0
        for i in range(m):
            avg=average_of_sentence(X[i],word_to_vec_map)
            
            # Forward Propagation
            z=np.dot(W,avg)+b
            a=softmax(z)
            
            # Loss calculations
            cost+=-np.sum(np.multiply(Y_oh[i],np.log(a)))
            
            # Backpropagation
            dz=a-Y_oh[i]
            dW=np.dot(dz.reshape(n_y,1), avg.reshape(1, n_h))
            db = dz
            
            # Updating Parameters
            W=W - learning_rate*dW
            b=b - learning_rate*db
            
        cost=cost/m
        
        if t%100==0:
            print('Cost after iterations {} is {}'.format(t,cost))
            pred= predict(X, Y, W, b, word_to_vec_map)
    
    return pred,W,b
        
    

In [32]:
pred,W,b=model(X_train, Y_train, word_to_vec_map)

Cost after iterations 0 is 1.6461697922094756
Accuracy: 0.3484848484848485
Cost after iterations 100 is 0.3967869591827299
Accuracy: 0.9318181818181818
Cost after iterations 200 is 0.287864684355277
Accuracy: 0.9545454545454546
Cost after iterations 300 is 0.2372234531131005
Accuracy: 0.9696969696969697


In [33]:
print("Training set:")
pred_train = predict(X_train, Y_train, W, b, word_to_vec_map)
print('Test set:')
pred_test = predict(X_test, Y_test, W, b, word_to_vec_map)

Training set:
Accuracy: 0.9772727272727273
Test set:
Accuracy: 0.8571428571428571
