In [48]:
import lib.xmlreader as xml
import lib.utils as ut

import numpy as np
import random
import gensim

from sklearn.feature_extraction.text import CountVectorizer

import matplotlib.pyplot as plt
%matplotlib inline

In [130]:
# Parse the three TASS 2018 task-1 XML splits with the project's reader.
# NOTE(review): the second argument is forwarded to readXML unchanged; it looks
# like the list of numeric polarity codes -- confirm against lib.xmlreader.
train_docs, dev_docs, test_docs = [
    xml.readXML(split_path, [0, 1, 2, 3])
    for split_path in (
        "../database/TASS/TASS2018/task1-Training.xml",
        "../database/TASS/TASS2018/task1-Development.xml",
        "../database/TASS/TASS2018/task1-Test.xml",
    )
]

In [131]:
# Sanity check on the number of documents loaded for each split.
# Previously asserted sizes, kept for reference: train 1008, dev 506, test 1899
# (presumably a different corpus variant -- TODO confirm).
assert (len(train_docs), len(dev_docs), len(test_docs)) == (1000, 500, 1428)

In [132]:
# Polarity label of every training / development document, in corpus order.
train_labels = [document.polarity for document in train_docs]
dev_labels   = [document.polarity for document in dev_docs]

In [133]:
# Split the training documents by polarity code (0, 1, 2, 3), keeping corpus order.
# NOTE(review): label 0 is named "POSI" and label 1 "NEGA" here, while getLabel()
# further down maps 0 -> 'N' and 1 -> 'P'. The balancing below treats all classes
# alike, so only the names/printed captions are affected -- confirm which is right.
POSI_train_docs, NEGA_train_docs, NEUT_train_docs, NONE_train_docs = (
    [doc for doc, label in zip(train_docs, train_labels) if label == code]
    for code in (0, 1, 2, 3)
)

level_train_docs = [POSI_train_docs, NEGA_train_docs, NEUT_train_docs, NONE_train_docs]

In [134]:
# Report how many training documents fall into each polarity class.
fmt = """Positive Sentences = {:d}
       \rNegative Sentences = {:d}
       \rNeutral  Sentences = {:d}
       \rNone Values        = {:d}"""

print(fmt.format(*map(len, (POSI_train_docs,
                            NEGA_train_docs,
                            NEUT_train_docs,
                            NONE_train_docs))))

Positive Sentences = 242
       Negative Sentences = 231
       Neutral  Sentences = 166
       None Values        = 361


In [135]:
# Size of the smallest class; used below as the per-class quota when balancing.
minSentLvl = min(map(len, (POSI_train_docs, NEGA_train_docs, NEUT_train_docs, NONE_train_docs)))

print('Minimum number of sentences per level : ', minSentLvl)

Minimum number of sentences per level :  166


In [136]:
# Balance the classes: take a shuffled copy of each class (random.sample over the
# full length) and keep only its first minSentLvl documents.
# No seed is set before this point in the visible cells, so the selected subset
# differs from run to run.
new_train_docs = [
    random.sample(class_docs, len(class_docs))[:minSentLvl]
    for class_docs in level_train_docs
]

In [137]:
# Report the per-class sizes after balancing (one entry per class in new_train_docs).
print("New size of sentences:\n")
fmt = """Positive Sentences = {:d}
       \rNegative Sentences = {:d}
       \rNeutral  Sentences = {:d}
       \rNone Values        = {:d}"""

print(fmt.format(*map(len, new_train_docs)))

New size of sentences:

Positive Sentences = 166
       Negative Sentences = 166
       Neutral  Sentences = 166
       None Values        = 166


In [138]:
# Concatenate the balanced per-class lists and shuffle them together so the
# classes are interleaved in the final training set.
flat_train_docs = sum(new_train_docs, [])
shuf_train_docs = random.sample(flat_train_docs, len(flat_train_docs))

# Four classes, minSentLvl documents each.
assert len(shuf_train_docs) == 4 * minSentLvl
print("shuf_train_docs size = ", len(shuf_train_docs))

shuf_train_docs size =  664


In [139]:
# Raw text of every document, ordered train -> dev -> test.
corpus = [document.content for document in shuf_train_docs + dev_docs + test_docs]

In [140]:
# Total number of documents across the three splits.
print("Sentences = ", len(test_docs) + len(dev_docs) + len(shuf_train_docs))

Sentences =  2592


In [141]:
# Labels aligned one-to-one with the shuffled, balanced training documents.
shuf_train_labels = [document.polarity for document in shuf_train_docs]

assert len(shuf_train_docs) == len(shuf_train_labels)

In [142]:
def gensim_load_vec(path="../database/embeddings/SBW-vectors-300-min5.bin"):
    """Load binary word2vec embeddings and expose their main components.

    Parameters
    ----------
    path : str
        Location of a word2vec file in binary format.

    Returns
    -------
    tuple
        ``(gensim_emb, vec, shape, vocab)``: the loaded KeyedVectors object,
        its embedding matrix, that matrix's shape, and the index-ordered list
        of words.
    """
    gensim_emb = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)

    # `syn0` and `index2word` are legacy attribute names: deprecated in gensim 3
    # and removed in gensim 4. Prefer the current names, fall back for old versions.
    if hasattr(gensim_emb, "vectors"):
        vec = gensim_emb.vectors
    else:
        vec = gensim_emb.syn0

    if hasattr(gensim_emb, "index_to_key"):
        vocab = gensim_emb.index_to_key
    else:
        vocab = gensim_emb.index2word

    return gensim_emb, vec, vec.shape, vocab

In [38]:
# Load the pretrained word vectors from gensim_load_vec's default path.
gensim_emb, vec, shape, vocab = gensim_load_vec()

  after removing the cwd from sys.path.
  """


In [143]:
# Count vectorizer that delegates tokenization to the project's own tokenizer.
counter = CountVectorizer(tokenizer=ut.tokenizer)

In [144]:
# Fit the vocabulary on the whole corpus (train + dev + test texts) and build the
# document-term matrix; its second dimension is the vocabulary size.
X = counter.fit_transform(corpus)
print(X.shape)

(2592, 8026)


In [None]:

# NOTE(review): this cell carries no execution count, re-fits `counter` on the
# same corpus, and `Xc` is not used by any other visible cell -- it looks like a
# leftover and is a candidate for removal.
VOCAB_SIZE = X.shape[1]

caption_texts = corpus
# Dense float copy of the document-term count matrix.
Xc = counter.fit_transform(caption_texts).todense().astype("float")
print(Xc.shape)

In [145]:
# Number of distinct tokens in the fitted vocabulary.
VOCAB_SIZE = X.shape[1]

# One 300-dimensional row per vocabulary entry. `np.float` (an alias of the
# builtin float) was removed from NumPy, so the dtype is spelled explicitly.
embedding_matrix = np.zeros((VOCAB_SIZE, 300), dtype=np.float64)

for word, i in counter.vocabulary_.items():
    try:
        embedding_matrix[i] = gensim_emb[word]
    except KeyError:
        # Word has no pretrained vector: its row stays all zeros.
        pass

In [146]:
# Spot check: the row stored for 'hola' must equal its pretrained vector.
# Raises KeyError if 'hola' is missing from either vocabulary.
np.array_equal(embedding_matrix[counter.vocabulary_['hola']], gensim_emb['hola'])

True

In [147]:
# From here on `train_labels` refers to the balanced, shuffled training set.
train_labels = shuf_train_labels

# Raw tweet text per split, in document order.
train_tweets = [document.content for document in shuf_train_docs]
dev_tweets   = [document.content for document in dev_docs]
test_tweets  = [document.content for document in test_docs]

In [148]:
# Encode every tweet (train, then dev, then test) as the list of vocabulary
# indices of its tokens; tokens missing from the vocabulary are dropped.
# NOTE(review): vocabulary indices start at 0 and pad_sequences pads with 0 by
# default, so padding and the token with index 0 share an id downstream --
# confirm whether an index offset is wanted.
sequences = [
    [counter.vocabulary_[token]
     for token in ut.tokenizer(tweet)
     if token in counter.vocabulary_]
    for tweet in train_tweets + dev_tweets + test_tweets
]
# Encoded length of each tweet, parallel to `sequences`.
ls = [len(encoded) for encoded in sequences]

MAXLEN = max(ls)
print(MAXLEN)

30


In [149]:
# Hard-coded sequence length; matches the maximum printed by the previous cell
# in the recorded run (30). It overrides the computed value.
MAXLEN = 30

In [150]:
from keras.preprocessing.sequence import pad_sequences

# `sequences` holds the train, dev and test tweets back to back, in that order.
# Every split is recovered with explicit start/end offsets. The previous
# negative slice for the test split (`sequences[-len(test_tweets):]`) would
# return *all* sequences when the test set is empty.
x_train_seq = pad_sequences(sequences[:len(train_tweets)], maxlen=MAXLEN)
x_dev_seq   = pad_sequences(sequences[len(train_tweets):len(train_tweets) + len(dev_tweets)], maxlen=MAXLEN)
x_test_seq  = pad_sequences(sequences[len(train_tweets) + len(dev_tweets):], maxlen=MAXLEN)

print('Shape of data train tensor:', x_train_seq.shape)
print('Shape of data val  tensor :', x_dev_seq.shape)
print('Shape of data test  tensor:', x_test_seq.shape)

print('Shape of data train labels:', len(train_labels))
print('Shape of data val   labels:', len(dev_labels))

Shape of data train tensor: (664, 30)
Shape of data val  tensor : (500, 30)
Shape of data test  tensor: (1428, 30)
Shape of data train labels: 664
Shape of data val   labels: 500


In [151]:
import tensorflow as tf
import keras 

from keras.utils import to_categorical
from keras import backend as K

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

from keras.layers import Input, Dense, concatenate, Activation
from keras.models import Model
from keras.layers import Conv1D, GlobalMaxPooling1D

from keras.callbacks import ModelCheckpoint

print(K.tensorflow_backend._get_available_gpus())

# Apply the seed before any graph/session is built. Previously `seed` was
# assigned but never used, so model initialisation differed on every run.
# The random.sample calls that balance/shuffle the data run earlier in the
# notebook and are not affected by this.
seed = 7
np.random.seed(seed)
tf.set_random_seed(seed)

# CPU-only session ('GPU': 0) with 4 intra-op and 4 inter-op threads.
config = tf.ConfigProto(intra_op_parallelism_threads=4,
                        inter_op_parallelism_threads=4,
                        allow_soft_placement=True,
                        device_count={'CPU': 1, 'GPU': 0})

sess = tf.Session(config=config)
K.set_session(sess)

['/job:localhost/replica:0/task:0/device:GPU:0']


In [154]:
# Multi-branch 1-D CNN over pretrained word embeddings.
# Input: one row of MAXLEN integer token ids per tweet.
tweet_input     = Input(shape=(MAXLEN,), dtype='int32')

# Embedding layer initialised from embedding_matrix and frozen (trainable=False).
tweet_encoder   = Embedding(VOCAB_SIZE, 300, weights=[embedding_matrix], input_length=MAXLEN, trainable=False)(tweet_input)

# Three parallel branches over the same embeddings, with kernel sizes 2, 3 and 4;
# each has 100 filters followed by global max pooling over the sequence axis.
bigram_branch   = Conv1D(filters=100,kernel_size=2, padding='valid', activation='relu', strides=1)(tweet_encoder)
bigram_branch   = GlobalMaxPooling1D()(bigram_branch)

trigram_branch  = Conv1D(filters=100, kernel_size=3, padding='valid', activation='relu', strides=1)(tweet_encoder)
trigram_branch  = GlobalMaxPooling1D()(trigram_branch)

fourgram_branch = Conv1D(filters=100, kernel_size=4, padding='valid', activation='relu', strides=1)(tweet_encoder)
fourgram_branch = GlobalMaxPooling1D()(fourgram_branch)

# Concatenate the pooled branch outputs, then a 32-unit ReLU layer with dropout 0.5.
merged = concatenate([bigram_branch, trigram_branch, fourgram_branch], axis=1)
merged = Dense(32, activation='relu')(merged)
merged = Dropout(0.5)(merged)

# 4-way softmax output, one unit per polarity class.
merged = Dense(4)(merged)
output = Activation('softmax')(merged)
model  = Model(inputs=[tweet_input], outputs=[output])

model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            (None, 30)           0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 30, 300)      2407800     input_6[0][0]                    
__________________________________________________________________________________________________
conv1d_16 (Conv1D)              (None, 29, 100)      60100       embedding_6[0][0]                
__________________________________________________________________________________________________
conv1d_17 (Conv1D)              (None, 28, 100)      90100       embedding_6[0][0]                
__________________________________________________________________________________________________
conv1d_18 

In [155]:
# Checkpoint file name embeds the epoch number and the validation accuracy.
# NOTE(review): presumably the "model/" directory must already exist -- confirm.
filepath="model/CNN_best_weights_CNN.{epoch:02d}-{val_acc:.4f}.hdf5"
# Save only when val_acc improves on the best value seen so far.
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

# Train for 10 epochs on the balanced training set, validating on the dev split;
# labels are one-hot encoded to match the categorical cross-entropy loss.
model.fit(x_train_seq, to_categorical(train_labels), batch_size=64, epochs=10,
                     validation_data=(x_dev_seq, to_categorical(dev_labels)), callbacks = [checkpoint], verbose=1)

Train on 664 samples, validate on 500 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.46600, saving model to model/CNN_best_weights_CNN.01-0.4660.hdf5
Epoch 2/10

Epoch 00002: val_acc did not improve from 0.46600
Epoch 3/10

Epoch 00003: val_acc did not improve from 0.46600
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.46600
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.46600
Epoch 6/10

Epoch 00006: val_acc did not improve from 0.46600
Epoch 7/10

Epoch 00007: val_acc did not improve from 0.46600
Epoch 8/10

Epoch 00008: val_acc did not improve from 0.46600
Epoch 9/10

Epoch 00009: val_acc did not improve from 0.46600
Epoch 10/10

Epoch 00010: val_acc did not improve from 0.46600


<keras.callbacks.History at 0x7fc046f705c0>

In [156]:
from keras.models import load_model

# NOTE(review): this file name is the checkpoint written by the recorded run
# (epoch 01, val_acc 0.4660). A re-run may save under a different name, in which
# case this path has to be updated by hand.
best_model = load_model('model/CNN_best_weights_CNN.01-0.4660.hdf5')

In [157]:
# Predicted class index for every test sequence: argmax over the model outputs.
test_values = np.argmax(best_model.predict(x_test_seq), axis = 1)

In [158]:
# Number of predictions produced (displayed as the cell output).
len(test_values)

1428

In [159]:
def getLabel(num):
    """Translate a numeric class index into its polarity tag.

    Mapping: 0 -> 'N', 1 -> 'P', 2 -> 'NEU', 3 -> 'NONE'.
    Any other value yields None.

    NOTE(review): the class-splitting cell names label 0 "POSI" and label 1
    "NEGA", the opposite of this mapping -- confirm which one matches the
    codes assigned by lib.xmlreader.
    """
    for code, tag in enumerate(('N', 'P', 'NEU', 'NONE')):
        if num == code:
            return tag
    return None

In [160]:
import xml.etree.ElementTree as ET

def putTestValue(xmlFIle, out):
    """Write the predicted label of every tweet in an XML file to a text file.

    Each output line is the tweet id and the label, separated by a tab. The
    label of the i-th ``<tweet>`` element (in document order) is taken from the
    module-level ``test_values[i]`` and converted with ``getLabel``.

    Parameters
    ----------
    xmlFIle : str
        Path of the XML file whose ``<tweet>`` elements carry a ``<tweetid>``.
    out : str
        Path of the text file to write (overwritten if it exists).

    Raises
    ------
    IndexError
        If the XML file holds more tweets than there are predictions.
    """
    tree = ET.parse(xmlFIle)
    root = tree.getroot()

    print(len(test_values))
    # The context manager closes the file even if a lookup below raises.
    with open(out, "w") as file:
        for i, tweet in enumerate(root.iter('tweet')):
            val = getLabel(test_values[i])
            ID = tweet.find('tweetid').text
            file.write(ID + "\t" + val + "\n")

In [162]:
# Write one "tweetid<TAB>label" line per test tweet to output2018.txt.
putTestValue("../database/TASS/TASS2018/task1-Test.xml", "output2018.txt")

1428
