In [1]:
import lib.xmlreader as xml
import lib.utils as ut
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import gensim.models.word2vec

In [2]:
# Load the TASS 2017 task-1 training and development sets with the project-local XML reader.
# NOTE(review): the second argument [0,1,2,3] presumably selects which polarity labels to keep
# (the same four values are used to split classes further down) -- confirm in lib/xmlreader.
train_docs = xml.readXML("../database/TASS/TASS2017/task1-Training.xml",[0,1,2,3])
val_docs   = xml.readXML("../database/TASS/TASS2017/task1-Development.xml",[0,1,2,3])

In [3]:
# Unpack each parsed document into two parallel lists: the raw tweet text
# (`content`, left untokenised at this stage) and its polarity label.
train_tweets = [doc.content for doc in train_docs]
train_labels = [doc.polarity for doc in train_docs]

# Same unpacking for the development (validation) split.
val_tweets = [doc.content for doc in val_docs]
val_labels = [doc.polarity for doc in val_docs]

In [4]:
# Sanity check: number of tweets in the training and validation splits.
len(train_tweets), len(val_tweets)

(1008, 506)

In [5]:
# Group the training documents by polarity label. The variable names record the
# label convention used by this notebook: 0 -> POSI, 1 -> NEGA, 2 -> NEUT, 3 -> NONE.
# Documents and labels are walked together instead of being indexed by position.
POSI_train_docs = [doc for doc, label in zip(train_docs, train_labels) if label == 0]
NEGA_train_docs = [doc for doc, label in zip(train_docs, train_labels) if label == 1]
NEUT_train_docs = [doc for doc, label in zip(train_docs, train_labels) if label == 2]
NONE_train_docs = [doc for doc, label in zip(train_docs, train_labels) if label == 3]

# Per-class buckets, in label order (index in this list == polarity label).
level_train_docs = [POSI_train_docs, NEGA_train_docs, NEUT_train_docs, NONE_train_docs]

# One report line per class. The lines are joined with plain newlines: the previous
# triple-quoted template used indentation followed by '\r' to hide the leading spaces,
# which only works where the renderer honours carriage returns and otherwise prints
# the indentation verbatim.
fmt = "\n".join([
    "Positive Sentences = {:d}",
    "Negative Sentences = {:d}",
    "Neutral  Sentences = {:d}",
    "None Values        = {:d}",
])

print(fmt.format(*(len(level_docs) for level_docs in level_train_docs)))

Positive Sentences = 418
       Negative Sentences = 318
       Neutral  Sentences = 133
       None Values        = 139


In [6]:
# Size of the smallest polarity class among the four per-class buckets.
minSentLvl = min(len(level_docs) for level_docs in level_train_docs)

print('Minimum number of sentences per level : ', minSentLvl)

Minimum number of sentences per level :  133


In [7]:
import random

# Seed the RNG so the balanced subset drawn here (and every artefact derived from it
# further down: shuffling order, saved ids and labels) is the same on every
# Restart & Run All. Without a seed each run silently produced a different dataset.
SEED = 42
random.seed(SEED)

# Balance the classes: draw `minSentLvl` documents without replacement from each
# per-class bucket. `minSentLvl` is the size of the smallest bucket, so the sample
# size never exceeds the population.
new_train_docs = [random.sample(level_docs, minSentLvl) for level_docs in level_train_docs]

In [8]:
print("New size of sentences:\n")

# One report line per class, joined with plain newlines. The previous triple-quoted
# template relied on indentation followed by '\r', which prints the indentation
# verbatim wherever carriage returns are not interpreted.
fmt = "\n".join([
    "Positive Sentences = {:d}",
    "Negative Sentences = {:d}",
    "Neutral  Sentences = {:d}",
    "None Values        = {:d}",
])

# new_train_docs holds the four down-sampled buckets in label order.
print(fmt.format(*(len(level_docs) for level_docs in new_train_docs)))

New size of sentences:

Positive Sentences = 133
       Negative Sentences = 133
       Neutral  Sentences = 133
       None Values        = 133


In [9]:
# Concatenate the per-class samples into a single list ...
flat_train_docs = []
for level_docs in new_train_docs:
    flat_train_docs.extend(level_docs)

# ... and shuffle it by drawing a full-length sample without replacement,
# so the classes end up interleaved.
shuf_train_docs = random.sample(flat_train_docs, len(flat_train_docs))

# Four classes, each contributing `minSentLvl` documents.
assert len(shuf_train_docs) == 4 * minSentLvl
print("shuf_train_docs size = ", len(shuf_train_docs))

shuf_train_docs size =  532


In [10]:
# Raw tweet texts of the balanced training set followed by the validation set.
# This ordering (train first, then val) is what later index-based splits rely on.
corpus = [doc.content for doc in shuf_train_docs + val_docs]

In [11]:
# Total number of documents in the train + validation corpus.
print("Sentences = ", len(shuf_train_docs) + len(val_docs))

Sentences =  1038


In [12]:
# Polarity labels aligned element-for-element with the shuffled training documents.
shuf_train_labels = [doc.polarity for doc in shuf_train_docs]

assert len(shuf_train_labels) == len(shuf_train_docs)

In [13]:
# Bag-of-words vectoriser that delegates tokenisation to the project's own tokenizer.
counter = CountVectorizer(tokenizer=ut.tokenizer)

In [14]:
# Fit the vectoriser on the train + validation corpus and print the resulting
# (documents, vocabulary) shape as a quick size check.
# NOTE(review): X is not referenced again in the visible part of the notebook.
X = counter.fit_transform(corpus)
print(X.shape)

(1038, 3976)


In [15]:
# Load the TASS 2017 task-1 test set through the project-local test reader
# (a different reader function than the one used for train/dev).
test_docs   = xml.readXMLTest("../database/TASS/TASS2017/task1-Test.xml")

In [16]:
# Raw tweet text of every test document, in file order.
test_tweets = [doc.content for doc in test_docs]

In [17]:
# Guard against a truncated or wrong test file: this notebook expects exactly 1899 test tweets.
assert (len(test_tweets) == 1899)

In [18]:
# Tokenise every tweet (train + val corpus first, then test) into a list of tokens.
# The resulting order -- train, val, test -- is what later slicing depends on.
sequences = [list(ut.tokenizer(tweet)) for tweet in corpus + test_tweets]

In [19]:
# Exactly one token sequence per train, validation and test document.
assert (len(sequences) == (len(shuf_train_docs) + len(val_docs) + len(test_tweets)))

In [20]:
from collections import Counter

In [21]:
# Token frequencies over all tokenised tweets (train + val + test).
cnt = Counter()
for tokens in sequences:
    cnt.update(tokens)

In [22]:
# Spot check: frequency of one known word, and the number of distinct tokens.
cnt['felicidad'], len(cnt)

(8, 8333)

In [23]:
# Least frequent entry: last element of the full frequency-sorted list.
cnt.most_common()[-1]

('agobio', 1)

In [24]:
# The ten most frequent tokens with their counts.
cnt.most_common(10)

[('que', 1720),
 ('de', 1518),
 ('y', 1042),
 ('a', 986),
 ('no', 914),
 ('la', 904),
 ('el', 823),
 ('me', 811),
 ('en', 793),
 ('es', 743)]

In [25]:
import collections

# Vocabulary limits: no size cap (every distinct token) and no frequency cut-off.
max_vocab = len(cnt)
min_freq = 0

# Index -> token table. The two special tokens come first ('_unk_' at 0, '_pad_' at 1),
# followed by the corpus tokens in descending frequency order.
frequent_tokens = [token for token, freq in cnt.most_common(max_vocab) if freq >= min_freq]
itos = ['_unk_', '_pad_'] + frequent_tokens

# Token -> index table. Tokens missing from the table resolve to 0, i.e. '_unk_'.
# Being a defaultdict, looking up a missing token also inserts it with value 0.
stoi = collections.defaultdict(lambda: 0, {token: index for index, token in enumerate(itos)})
len(itos)

8335

In [26]:
print(len(cnt))

8333


In [27]:
len(stoi)

8335

In [41]:
# NOTE(review): the recorded execution count of this cell (41) is higher than that of the
# cells below which use `pickle` and `TASS_` (34, 35) -- it was last run out of order.
# In file order it still precedes them, so a top-to-bottom run works.
import pickle

# Dataset tag embedded in the file names of every artefact saved below.
TASS_     = '2017'

In [34]:
# Persist the index -> token table. The context manager guarantees the file is
# flushed and closed even if pickling fails; the bare open() used before left the
# handle to be closed whenever the garbage collector got to it.
with open('../database/ulmfit/tmp/itos_'+TASS_+'.pkl', 'wb') as itos_file:
    pickle.dump(itos, itos_file)

In [35]:
# Reload the index -> token table from disk, closing the file deterministically.
# SECURITY: pickle.load executes arbitrary code from the file; only use it on
# files produced by this notebook, never on untrusted input.
with open('../database/ulmfit/tmp/itos_'+TASS_+'.pkl', 'rb') as itos_file:
    itos = pickle.load(itos_file)

In [36]:
# Map every token of every sequence to its vocabulary index
# (tokens missing from `stoi` map to 0, the '_unk_' slot).
data_ids = [[stoi[token] for token in tokens] for tokens in sequences]

In [37]:
# Numericalisation must keep one id-sequence per token sequence.
assert (len(sequences) == len(data_ids))

In [38]:
# Split boundaries. `sequences` (and therefore `data_ids`) was built in the order
# train, val, test, so the splits are recovered by position.
n_train = len(shuf_train_labels)
n_val = len(val_docs)

# Tweets have different token counts, so each split is a 1-D object array whose
# elements are Python lists of ids. dtype=object has to be explicit: building an
# array from ragged nested lists without it was deprecated in NumPy 1.19 and
# raises ValueError from NumPy 1.24 onwards.
x_train_ids = np.array(data_ids[:n_train], dtype=object)
x_val_ids   = np.array(data_ids[n_train:n_train + n_val], dtype=object)
x_test_ids  = np.array(data_ids[n_train + n_val:], dtype=object)

print('Shape of data train tensor:', x_train_ids.shape)
print('Shape of data val  tensor:', x_val_ids.shape)
print('Shape of data test  tensor:', x_test_ids.shape)

Shape of data train tensor: (532,)
Shape of data val  tensor: (506,)
Shape of data test  tensor: (1899,)


In [42]:
# Root directory for the artefacts written by this notebook.
# NOTE(review): the itos pickle above spells out the same '../database/ulmfit/tmp/' prefix
# by hand instead of using SAVE_PATH -- keep the two in sync.
SAVE_PATH = '../database/ulmfit/'

# Labels of the balanced training set and of the validation set, as .npy files.
np.save(SAVE_PATH + "tmp/train_labels_"+TASS_+".npy",shuf_train_labels)
np.save(SAVE_PATH + "tmp/val_labels_"+TASS_+".npy",val_labels)

In [43]:
# Token-id sequences for the three splits. The recorded shapes above are 1-D, i.e. these
# are arrays of variable-length lists; np.save stores such object arrays via pickle, so
# they must be read back with np.load(..., allow_pickle=True).
np.save(SAVE_PATH + "tmp/train_ids_"+TASS_+".npy",x_train_ids)
np.save(SAVE_PATH + "tmp/val_ids_"+TASS_+".npy",x_val_ids)
np.save(SAVE_PATH + "tmp/test_ids_"+TASS_+".npy",x_test_ids)

In [44]:
# Inspect the first tokenised tweet (first document of the shuffled training set).
sequences[0]

['lo',
 'mismo',
 'puedo',
 'decir',
 'de',
 'tu',
 'fanart',
 'y',
 'por',
 'cierto',
 'tu',
 'comic',
 'muy',
 'divertido']

In [45]:
# Inspect the vocabulary table.
# NOTE(review): this displays the whole list (len(itos) was 8335 above); consider
# showing only a slice such as itos[:20] to keep the saved notebook small.
itos

['_unk_',
 '_pad_',
 'que',
 'de',
 'y',
 'a',
 'no',
 'la',
 'el',
 'me',
 'en',
 'es',
 'lo',
 'un',
 'por',
 'pero',
 'se',
 'si',
 'los',
 'mi',
 'con',
 'una',
 'ya',
 'las',
 'para',
 'te',
 'yo',
 'mas',
 'al',
 'como',
 'muy',
 'esta',
 'tengo',
 'estoy',
 'he',
 'del',
 'gracias',
 'tu',
 'ha',
 'dia',
 'porque',
 'cuando',
 'o',
 'os',
 'ver',
 'soy',
 'son',
 'eso',
 'todo',
 'solo',
 'asi',
 'quiero',
 'hay',
 'le',
 'bueno',
 'bien',
 'tan',
 'ahora',
 'hoy',
 'nos',
 'mejor',
 'ma',
 'vez',
 'ser',
 'hacer',
 'q',
 'dias',
 'sin',
 'voy',
 'todos',
 'pues',
 'ana',
 'este',
 'ni',
 'hace',
 'nada',
 'va',
 'menos',
 'mucho',
 'poco',
 'algo',
 'tambien',
 'muchas',
 'vida',
 'feliz',
 'ir',
 'cosas',
 'gente',
 'eres',
 'siempre',
 'tiene',
 'buen',
 'mal',
 'dos',
 'puedo',
 'mismo',
 'tio',
 'tener',
 'buena',
 'aqui',
 'verdad',
 'mis',
 'alguien',
 'ese',
 'da',
 'genial',
 'gran',
 'su',
 'noche',
 'aunque',
 'esa',
 'esto',
 'buenos',
 'espero',
 'creo',
 'estas',
 