# Sentiment Analysis usando Deep Learning para espaÃ±ol en textos cortos

# 1. Data Cleaning 

We work on a dataset of tweets labelled as positive or negative, more information about the dataset 
can be found on http://www.sepln.org/workshops/tass/. 

# 2. Vocabulary

## 2.2 Getting train, validation and test subsets

In [1]:
import lib.xmlreader as xml
import lib.utils as ut
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import gensim.models.word2vec



In [2]:
train_docs = xml.readXML("../database/TASS/TASS2018/task1-Training.xml",[0,1,2,3])
val_docs   = xml.readXML("../database/TASS/TASS2018/task1-Development.xml",[0,1,2,3])

In [3]:
train_tweets = []
train_labels = []
for doc in train_docs:
    # train_tweets.append(ut.tokenize(doc.content, 0)['clean'])
    train_tweets.append(doc.content)
    train_labels.append(doc.polarity)

val_tweets = []
val_labels = []
for doc in val_docs:
    # test_tweets.append(ut.tokenize(doc.content, 0)['clean'])
    val_tweets.append(doc.content)
    val_labels.append(doc.polarity)

In [4]:
len(train_tweets), len(val_tweets)

(1000, 500)

In [5]:
POSI_train_docs = [train_docs[i] for i in range(len(train_labels)) if train_labels[i] == 0]
NEGA_train_docs = [train_docs[i] for i in range(len(train_labels)) if train_labels[i] == 1]
NEUT_train_docs = [train_docs[i] for i in range(len(train_labels)) if train_labels[i] == 2]
NONE_train_docs = [train_docs[i] for i in range(len(train_labels)) if train_labels[i] == 3]

level_train_docs = [POSI_train_docs,NEGA_train_docs,NEUT_train_docs,NONE_train_docs]

fmt = """Positive Sentences = {:d}
       \rNegative Sentences = {:d}
       \rNeutral  Sentences = {:d}
       \rNone Values        = {:d}"""

print(fmt.format(len(POSI_train_docs),
                 len(NEGA_train_docs),
                 len(NEUT_train_docs),
                 len(NONE_train_docs)))

Positive Sentences = 242
Negative Sentences = 231
Neutral  Sentences = 166
None Values        = 361


In [6]:
minSentLvl = min(len(POSI_train_docs),len(NEGA_train_docs),len(NEUT_train_docs),len(NONE_train_docs))

print('Minimum number of sentences per level : ', minSentLvl)

Minimum number of sentences per level :  166


In [7]:
maxSentLvl = max(len(POSI_train_docs),len(NEGA_train_docs),len(NEUT_train_docs),len(NONE_train_docs))

print('Maximum number of sentences per level : ', maxSentLvl)

Maximum number of sentences per level :  361


In [8]:
maxSentLvl = max(len(POSI_train_docs), len(NEGA_train_docs),)

In [9]:
import random

new_train_docs = []
for i in range(len(level_train_docs)):
    level_per = random.sample(level_train_docs[i],len(level_train_docs[i]))
    
    if (maxSentLvl > len(level_train_docs[i])):
        new_train_docs.append(level_train_docs[i] + level_per[: (maxSentLvl -len(level_train_docs[i]))])
    else:
        new_train_docs.append(level_per[:maxSentLvl])

In [10]:
print("New size of sentences:\n")
fmt = """Positive Sentences = {:d}
       \rNegative Sentences = {:d}
       \rNeutral  Sentences = {:d}
       \rNone Values        = {:d}"""

print(fmt.format(len(new_train_docs[0]),
                 len(new_train_docs[1]),
                 len(new_train_docs[2]),
                 len(new_train_docs[3])))

New size of sentences:

Positive Sentences = 242
Negative Sentences = 242
Neutral  Sentences = 242
None Values        = 242


In [11]:
#flat_train_docs = [item for sublist in new_train_docs for item in sublist]
#shuf_train_docs = random.sample(flat_train_docs,len(flat_train_docs))

#assert (len(shuf_train_docs) == 4 * minSentLvl)
#print("shuf_train_docs size = ", len(shuf_train_docs))

In [12]:
flat_train_docs = [item for sublist in new_train_docs for item in sublist]
shuf_train_docs = random.sample(flat_train_docs,len(flat_train_docs))

assert (len(shuf_train_docs) == 4 * maxSentLvl)
print("shuf_train_docs size = ", len(shuf_train_docs))

shuf_train_docs size =  968


In [13]:
corpus = []
for doc in shuf_train_docs + val_docs:
    corpus.append(doc.content)

In [14]:
print("Sentences = ", (len(shuf_train_docs + val_docs)))

Sentences =  1468


In [15]:
shuf_train_labels = []
for doc in shuf_train_docs:
    shuf_train_labels.append(doc.polarity)
    
assert (len(shuf_train_labels) == len(shuf_train_docs))

# 3. Preparing Data

In [16]:
from gensim.models import KeyedVectors

def gensim_load_vec(path="../database/embeddings/cc.es.300.bin"):
    #use gensim_emb.wv.index2word if used this way to load vectors
    #gensim_emb = gensim.models.word2vec.Word2Vec.load(path)
    gensim_emb =  gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)
    vocab = gensim_emb.index2word
    vec = gensim_emb.syn0
    shape = gensim_emb.syn0.shape
    return gensim_emb, vec, shape, vocab

from gensim.models.wrappers import FastText

def gensim_load_fasttext(path="../database/embeddings/cc.es.300.bin"):
    gensim_emb = FastText.load_fasttext_format(path)
    return gensim_emb

In [19]:
# gensim_emb, vec, shape, vocab = gensim_load_vec()
gensim_emb = gensim_load_fasttext()

In [20]:
counter = CountVectorizer(tokenizer=ut.tokenizer)

In [21]:
X = counter.fit_transform(corpus)
print(X.shape)

(1468, 5213)


In [23]:
VOCAB_SIZE = X.shape[1]
embedding_matrix = np.ones((VOCAB_SIZE, 300))
for word in list(counter.vocabulary_.keys()):
    try:
        i = counter.vocabulary_[word]
        embedding_matrix[i] = gensim_emb[word]
    except KeyError:
        pass
print(embedding_matrix.shape)

(5213, 300)


In [24]:
np.array_equal(embedding_matrix[counter.vocabulary_['hola']], gensim_emb['hola'])

True

In [25]:
test_docs   = xml.readXMLTest("../database/TASS/TASS2018/task1-Test.xml")

In [26]:
test_tweets = []
for doc in test_docs:
    test_tweets.append(doc.content)

In [27]:
assert (len(test_tweets) == 1428)

In [28]:
sequences = []
maxlen    = []

for tweet in corpus + test_tweets:
    sentence = []
    for word in ut.tokenizer(tweet):
        try:
            i = counter.vocabulary_[word]
            sentence.append(i)
        except KeyError:
            pass
    maxlen.append(len(sentence))
    sequences.append(sentence)

In [29]:
print("maxlen : ", max(maxlen))
print("minlen : ", min(maxlen))

maxlen :  30
minlen :  2


In [30]:
assert (len(sequences) == (len(shuf_train_docs) + len(val_docs) + len(test_tweets)))

In [31]:
from keras.preprocessing.sequence import pad_sequences
x_train_seq = pad_sequences(sequences[:len(shuf_train_docs)], maxlen=30)
x_val_seq   = pad_sequences(sequences[len(shuf_train_docs):len(shuf_train_docs)+len(val_docs)], maxlen=30)
x_test_seq  = pad_sequences(sequences[(len(shuf_train_docs)+len(val_docs)):], maxlen=30)
print('Shape of data train tensor:', x_train_seq.shape)
print('Shape of data val  tensor:', x_val_seq.shape)
print('Shape of data test  tensor:', x_test_seq.shape)

Using TensorFlow backend.


Shape of data train tensor: (968, 30)
Shape of data val  tensor: (500, 30)
Shape of data test  tensor: (1428, 30)


# Model: Convolutional Neural Network

In [32]:
import tensorflow as tf
import keras 

from keras import backend as K
print(K.tensorflow_backend._get_available_gpus())


config = tf.ConfigProto(intra_op_parallelism_threads=4, \
                        inter_op_parallelism_threads=4, \
                        allow_soft_placement=True,\
                        device_count = {'CPU' : 1, 'GPU' : 0})

sess = tf.Session(config=config)
K.set_session(sess)
    
seed = 0

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

from keras.layers import Input, Dense, concatenate, Activation
from keras.models import Model
from keras.layers import Conv1D, GlobalMaxPooling1D

tweet_input = Input(shape=(30,), dtype='int32')

tweet_encoder   = Embedding(VOCAB_SIZE, 300, weights=[embedding_matrix], input_length=30, trainable=True)(tweet_input)
bigram_branch   = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(tweet_encoder)
bigram_branch   = GlobalMaxPooling1D()(bigram_branch)
trigram_branch  = Conv1D(filters=100, kernel_size=3, padding='valid', activation='relu', strides=1)(tweet_encoder)
trigram_branch  = GlobalMaxPooling1D()(trigram_branch)
fourgram_branch = Conv1D(filters=100, kernel_size=4, padding='valid', activation='relu', strides=1)(tweet_encoder)
fourgram_branch = GlobalMaxPooling1D()(fourgram_branch)
#merged = concatenate([bigram_branch, trigram_branch, fourgram_branch], axis=1)

merged = concatenate([bigram_branch, trigram_branch], axis=1)
merged = Dense(64, activation='relu')(merged)
merged = Dropout(0.5)(merged)

merged = Dense(4)(merged)
output = Activation('softmax')(merged)
model = Model(inputs=[tweet_input], outputs=[output])
model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
model.summary()

['/job:localhost/replica:0/task:0/device:GPU:0']
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 30)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 30, 300)      1563900     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 29, 100)      60100       embedding_1[0][0]                
_______________________________________________________________________

In [33]:
import random

POSI_val_docs = [val_docs[i] for i in range(len(val_labels)) if val_labels[i] == 0]
NEGA_val_docs = [val_docs[i] for i in range(len(val_labels)) if val_labels[i] == 1]
NEUT_val_docs = [val_docs[i] for i in range(len(val_labels)) if val_labels[i] == 2]
NONE_val_docs = [val_docs[i] for i in range(len(val_labels)) if val_labels[i] == 3]

level_val_docs = [POSI_val_docs,NEGA_val_docs,NEUT_val_docs,NONE_val_docs]

fmt = """Positive Sentences = {:d}
       \rNegative Sentences = {:d}
       \rNeutral  Sentences = {:d}
       \rNone Values        = {:d}"""

print(fmt.format(len(POSI_val_docs),
                 len(NEGA_val_docs),
                 len(NEUT_val_docs),
                 len(NONE_val_docs)))



Positive Sentences = 106
Negative Sentences = 95
Neutral  Sentences = 61
None Values        = 238


In [34]:
minSentLvl = min(len(POSI_val_docs), len(NEGA_val_docs),len(NEUT_val_docs),len(NONE_val_docs))

In [35]:
new_val_docs = []
for i in range(len(level_val_docs)):
    level_per = random.sample(level_val_docs[i],len(level_val_docs[i]))
    
    new_val_docs.append(level_per[:minSentLvl])

In [36]:
print("New size of sentences:\n")
fmt = """Positive Sentences = {:d}
       \rNegative Sentences = {:d}
       \rNeutral  Sentences = {:d}
       \rNone Values        = {:d}"""

print(fmt.format(len(new_val_docs[0]),
                 len(new_val_docs[1]),
                 len(new_val_docs[2]),
                 len(new_val_docs[3])))

New size of sentences:

Positive Sentences = 61
Negative Sentences = 61
Neutral  Sentences = 61
None Values        = 61


In [37]:
flat_val_docs = [item for sublist in new_val_docs for item in sublist]

In [38]:
val_corpus  = []
new_val_lab = []
for doc in flat_val_docs:
    val_corpus.append(doc.content)
    new_val_lab.append(doc.polarity)

In [39]:
seq_val = []

for tweet in val_corpus:
    sentence = []
    for word in ut.tokenizer(tweet):
        try:
            i = counter.vocabulary_[word]
            sentence.append(i)
        except KeyError:
            pass
    seq_val.append(sentence)

In [40]:
# seq_val

In [41]:
x_eq_val_seq   = pad_sequences(seq_val, maxlen=30)

In [42]:
from keras.callbacks import ModelCheckpoint
from keras.utils import to_categorical

filepath="model/CNN_2018_best_weights.{epoch:02d}-{val_acc:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

model.fit(x_train_seq, to_categorical(shuf_train_labels), batch_size=64, epochs=50,
                     validation_data=(x_val_seq, to_categorical(val_labels)), shuffle=True, callbacks = [checkpoint])

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 968 samples, validate on 500 samples
Epoch 1/50

Epoch 00001: val_acc improved from -inf to 0.75000, saving model to model/CNN_2018_best_weights.01-0.7500.hdf5
Epoch 2/50

Epoch 00002: val_acc did not improve from 0.75000
Epoch 3/50

Epoch 00003: val_acc did not improve from 0.75000
Epoch 4/50

Epoch 00004: val_acc did not improve from 0.75000
Epoch 5/50

Epoch 00005: val_acc did not improve from 0.75000
Epoch 6/50

Epoch 00006: val_acc did not improve from 0.75000
Epoch 7/50

Epoch 00007: val_acc did not improve from 0.75000
Epoch 8/50

Epoch 00008: val_acc did not improve from 0.75000
Epoch 9/50

Epoch 00009: val_acc did not improve from 0.75000
Epoch 10/50

Epoch 00010: val_acc did not improve from 0.75000
Epoch 11/50

Epoch 00011: val_acc did not improve from 0.75000
Epoch 12/50

Epoch 00012: val_acc did not improve from 0.75000
Epoch 13/50

Epoch 0

<keras.callbacks.History at 0x1d5d8c790b8>

In [43]:
#np.argmax(model.predict(x_val_seq),axis=1)

In [44]:
from sklearn.metrics import f1_score

f1_score(np.argmax(model.predict(x_eq_val_seq), axis=1), new_val_lab, average='macro')

0.36501750693973667

In [45]:
model.evaluate(x=x_eq_val_seq, y=to_categorical(new_val_lab))



[0.9904576481365767, 0.702868851481891]

In [47]:
from keras.models import load_model

loaded_CNN_model = load_model('model/CNN_2018_best_weights.01-0.7500.hdf5')
loaded_CNN_model.evaluate(x=x_eq_val_seq, y=to_categorical(new_val_lab))



[0.5616409974020036, 0.75]

In [48]:
from sklearn.metrics import f1_score

f1_score(np.argmax(loaded_CNN_model.predict(x_eq_val_seq), axis=1), new_val_lab, average='macro')

  'recall', 'true', average, warn_for)


0.16721401204159825

In [None]:
#from keras.models import load_model

#best_model = load_model('model/CNN_best_weights.01-0.7500.hdf5')

In [56]:
test_values = np.argmax(loaded_CNN_model.predict(x_test_seq), axis = 1)

In [54]:
def getLabel(num):
    if num == 0:
        return 'N'
    elif num == 1:
        return 'P'
    elif num == 2:
        return 'NEU'
    elif num == 3:
        return 'NONE'

In [57]:
import xml.etree.ElementTree as ET

def putTestValue(xmlFIle, out):
    tree = ET.parse(xmlFIle)
    root = tree.getroot()

    tweets = []
    file = open(out,"w") 
    print(len(test_values))
    for i,tweet in enumerate(root.iter('tweet')): 
        #print(i)
        val = getLabel(test_values[i])
        #print(val, test_values[i])
        ID = tweet.find('tweetid').text
        file.write(ID + "\t" + val + "\n")
    file.close() 

In [58]:
#test_values = np.argmax(best_model.predict(x_test_set), axis = 1)

In [59]:
putTestValue("../database/TASS/TASS2018/task1-Test.xml", "output_cnn_alldata_20180408-2.txt")

1428
