In [1]:
import lib.xmlreader as xml
import lib.utils as ut

import numpy as np
import random
import gensim
from gensim.models.wrappers import FastText

from sklearn.feature_extraction.text import CountVectorizer

import matplotlib.pyplot as plt
%matplotlib inline



In [2]:
train_docs  = xml.readXML("../database/TASS/TASS2018/task1-Training.xml"   ,[0,1,2,3])
dev_docs    = xml.readXML("../database/TASS/TASS2018/task1-Development.xml",[0,1,2,3])
test_docs   = xml.readXML("../database/TASS/TASS2018/task1-Test.xml"       ,[0,1,2,3])

In [3]:
#assert (len(train_docs) == 1008)
#assert (len(dev_docs)   ==  506)
#assert (len(test_docs)  == 1899)
print(len(test_docs))
assert (len(train_docs) == 1000)
assert (len(dev_docs)   ==  500)
assert (len(test_docs)  == 1428)

1428


In [4]:
test_docs[200].polarity

In [5]:
train_labels = []
for train_doc in train_docs:
    train_labels.append(train_doc.polarity)
    
dev_labels   = []
for dev_doc in dev_docs:
    dev_labels.append(dev_doc.polarity)

In [6]:
POSI_train_docs = [train_docs[i] for i in range(len(train_labels)) if train_labels[i] == 0]
NEGA_train_docs = [train_docs[i] for i in range(len(train_labels)) if train_labels[i] == 1]
NEUT_train_docs = [train_docs[i] for i in range(len(train_labels)) if train_labels[i] == 2]
NONE_train_docs = [train_docs[i] for i in range(len(train_labels)) if train_labels[i] == 3]

level_train_docs = [POSI_train_docs,NEGA_train_docs,NEUT_train_docs,NONE_train_docs]

In [7]:
fmt = """Positive Sentences = {:d}
       \rNegative Sentences = {:d}
       \rNeutral  Sentences = {:d}
       \rNone Values        = {:d}"""

print(fmt.format(len(POSI_train_docs),
                 len(NEGA_train_docs),
                 len(NEUT_train_docs),
                 len(NONE_train_docs)))

Positive Sentences = 242
Negative Sentences = 231
Neutral  Sentences = 166
None Values        = 361


In [8]:
minSentLvl = min(len(POSI_train_docs),len(NEGA_train_docs),len(NEUT_train_docs),len(NONE_train_docs))

print('Minimum number of sentences per level : ', minSentLvl)

Minimum number of sentences per level :  166


In [9]:
new_train_docs = []
for i in range(len(level_train_docs)):
    level_per = random.sample(level_train_docs[i],len(level_train_docs[i]))
    new_train_docs.append(level_per[:minSentLvl])

In [10]:
print("New size of sentences:\n")
fmt = """Positive Sentences = {:d}
       \rNegative Sentences = {:d}
       \rNeutral  Sentences = {:d}
       \rNone Values        = {:d}"""

print(fmt.format(len(new_train_docs[0]),
                 len(new_train_docs[1]),
                 len(new_train_docs[2]),
                 len(new_train_docs[3])))

New size of sentences:

Positive Sentences = 166
Negative Sentences = 166
Neutral  Sentences = 166
None Values        = 166


In [11]:
flat_train_docs = [item for sublist in new_train_docs for item in sublist]
shuf_train_docs = random.sample(flat_train_docs,len(flat_train_docs))

assert (len(shuf_train_docs) == 4 * minSentLvl)
print("shuf_train_docs size = ", len(shuf_train_docs))

shuf_train_docs size =  664


In [12]:
corpus = []
for doc in shuf_train_docs + dev_docs + test_docs:
    corpus.append(doc.content)

In [13]:
print("Sentences = ", (len(test_docs + dev_docs + shuf_train_docs)))

Sentences =  2592


In [14]:
shuf_train_labels = []
for doc in shuf_train_docs:
    shuf_train_labels.append(doc.polarity)
    
assert (len(shuf_train_labels) == len(shuf_train_docs))

In [15]:
def gensim_load_vec(path="../database/embeddings/cc.es.300.bin"):
    gensim_emb = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)
    vocab = gensim_emb.index2word
    vec = gensim_emb.syn0
    shape = gensim_emb.syn0.shape
    return gensim_emb, vec, shape, vocab

def gensim_load_fasttext(path="../database/embeddings/cc.es.300.bin"):
    gensim_emb = FastText.load_fasttext_format(path)
    return gensim_emb

In [16]:
# gensim_emb, vec, shape, vocab = gensim_load_vec()
gensim_emb = gensim_load_fasttext()

In [17]:
counter = CountVectorizer(tokenizer=ut.tokenizer)

In [18]:
X = counter.fit_transform(corpus)
print(X.shape)

(2592, 8059)


In [19]:
VOCAB_SIZE = X.shape[1]

caption_texts = corpus
Xc = counter.fit_transform(caption_texts).todense().astype("float")
print(Xc.shape)

(2592, 8059)


In [20]:
sent_lens = np.sum(Xc, axis=1).astype("float")
sent_lens[sent_lens == 0] = 1e-14
print(sent_lens.shape)

(2592, 1)


In [21]:
embedding_matrix = np.zeros((VOCAB_SIZE, 300), np.float)

for word in list(counter.vocabulary_.keys()):
    try:
        i = counter.vocabulary_[word]
        embedding_matrix[i] = gensim_emb[word]
    except KeyError:
        pass

In [22]:
np.array_equal(embedding_matrix[counter.vocabulary_['hola']], gensim_emb['hola'])

True

In [36]:
gensim_emb['hola'].shape

(300,)

In [23]:
Xb = np.divide(np.dot(Xc, embedding_matrix), sent_lens)
print(Xb.shape)

(2592, 300)


In [24]:
train_tweets = []
train_labels = shuf_train_labels
for doc in shuf_train_docs:
    train_tweets.append(doc.content)

dev_tweets = []
for doc in dev_docs:
    dev_tweets.append(doc.content)

test_tweets = []
for doc in test_docs:
    test_tweets.append(doc.content)

In [25]:
Xtrain = Xb[0:len(train_tweets)]
Xdev   = Xb[ len(train_tweets):len(train_tweets) + len(dev_tweets)]
Xtest  = Xb[-len(test_tweets):]

ytrain = np.array(train_labels)
ydev   = np.array(dev_labels)

print('Shape of data train tensor:', Xtrain.shape)
print('Shape of data val  tensor :', Xdev.shape)
print('Shape of data test  tensor:', Xtest.shape)

print('Shape of data train labels:', len(train_labels))
print('Shape of data val   labels:', len(dev_labels))

Shape of data train tensor: (664, 300)
Shape of data val  tensor : (500, 300)
Shape of data test  tensor: (1428, 300)
Shape of data train labels: 664
Shape of data val   labels: 500


In [26]:
import tensorflow as tf
import keras 

from keras.utils import to_categorical
from keras import backend as K

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

from keras.layers import Input, Dense, concatenate, Activation
from keras.models import Model
from keras.layers import Conv1D, GlobalMaxPooling1D

from keras.callbacks import ModelCheckpoint

print(K.tensorflow_backend._get_available_gpus())


config = tf.ConfigProto(intra_op_parallelism_threads=4, \
                        inter_op_parallelism_threads=4, \
                        allow_soft_placement=True,\
                        device_count = {'CPU' : 1, 'GPU' : 0})

sess = tf.Session(config=config)
K.set_session(sess)
    
seed = 7

Using TensorFlow backend.


['/job:localhost/replica:0/task:0/device:GPU:0']


In [27]:
tweet_encoder   = Input(shape=(300,), dtype='float32')

merged = Dense(256)(tweet_encoder)
merged = Dropout(0.5)(merged)

merged = Dense(128)(tweet_encoder)
merged = Dropout(0.5)(merged)

merged = Dense(64)(tweet_encoder)
merged = Dropout(0.2)(merged)

merged = Dense(4)(merged)
output = Activation('softmax')(merged)
model  = Model(inputs=[tweet_encoder], outputs=[output])

model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 300)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)                19264     
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 4)                 260       
_________________________________________________________________
activation_1 (Activation)    (None, 4)                 0         
Total params: 19,524
Trainable params: 19,524
Non-trainable params: 0
_________________

In [29]:
filepath="model/best_weights_SIF.{epoch:02d}-{val_acc:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

model.fit(np.concatenate((Xtrain,Xdev)), to_categorical(np.concatenate((ytrain,ydev))), batch_size=64, epochs=50,
                     validation_data=(Xdev, to_categorical(ydev)), callbacks = [checkpoint], verbose=1)

Train on 1164 samples, validate on 500 samples
Epoch 1/50

Epoch 00001: val_acc improved from -inf to 0.48000, saving model to model/best_weights_SIF.01-0.4800.hdf5
Epoch 2/50

Epoch 00002: val_acc improved from 0.48000 to 0.50400, saving model to model/best_weights_SIF.02-0.5040.hdf5
Epoch 3/50

Epoch 00003: val_acc improved from 0.50400 to 0.51000, saving model to model/best_weights_SIF.03-0.5100.hdf5
Epoch 4/50

Epoch 00004: val_acc improved from 0.51000 to 0.51000, saving model to model/best_weights_SIF.04-0.5100.hdf5
Epoch 5/50

Epoch 00005: val_acc improved from 0.51000 to 0.52000, saving model to model/best_weights_SIF.05-0.5200.hdf5
Epoch 6/50

Epoch 00006: val_acc improved from 0.52000 to 0.53000, saving model to model/best_weights_SIF.06-0.5300.hdf5
Epoch 7/50

Epoch 00007: val_acc improved from 0.53000 to 0.53400, saving model to model/best_weights_SIF.07-0.5340.hdf5
Epoch 8/50

Epoch 00008: val_acc improved from 0.53400 to 0.54400, saving model to model/best_weights_SIF.08-

<keras.callbacks.History at 0x215a6534e48>

In [30]:
from keras.models import load_model

best_model = load_model('model/best_weights_SIF.42-0.6220.hdf5')

In [31]:
test_values = np.argmax(best_model.predict(Xtest), axis = 1)

In [32]:
len(test_values)

1428

In [33]:
def getLabel(num):
    if num == 0:
        return 'N'
    elif num == 1:
        return 'P'
    elif num == 2:
        return 'NEU'
    elif num == 3:
        return 'NONE'

In [34]:
import xml.etree.ElementTree as ET

def putTestValue(xmlFIle, out):
    tree = ET.parse(xmlFIle)
    root = tree.getroot()

    tweets = []
    file = open(out,"w") 
    print(len(test_values))
    for i,tweet in enumerate(root.iter('tweet')): 
        #print(i)
        val = getLabel(test_values[i])
        ID = tweet.find('tweetid').text
        file.write(ID + "\t" + val + "\n")
    file.close() 

In [35]:
putTestValue("../database/TASS/TASS2018/task1-Test.xml", "output20180408-1.txt")

1428
