In [1]:
import lib.xmlreader as xml
import lib.utils as ut
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import gensim.models.word2vec

In [2]:
train_docs = xml.readXML("../database/TASS/TASS2018/task1-Training.xml",[0,1,2,3])
val_docs   = xml.readXML("../database/TASS/TASS2018/task1-Development.xml",[0,1,2,3])

In [3]:
train_tweets = []
train_labels = []
for doc in train_docs:
    # train_tweets.append(ut.tokenize(doc.content, 0)['clean'])
    train_tweets.append(doc.content)
    train_labels.append(doc.polarity)

val_tweets = []
val_labels = []
for doc in val_docs:
    # test_tweets.append(ut.tokenize(doc.content, 0)['clean'])
    val_tweets.append(doc.content)
    val_labels.append(doc.polarity)

In [4]:
len(train_tweets), len(val_tweets)

(1000, 500)

In [5]:
POSI_train_docs = [train_docs[i] for i in range(len(train_labels)) if train_labels[i] == 0]
NEGA_train_docs = [train_docs[i] for i in range(len(train_labels)) if train_labels[i] == 1]
NEUT_train_docs = [train_docs[i] for i in range(len(train_labels)) if train_labels[i] == 2]
NONE_train_docs = [train_docs[i] for i in range(len(train_labels)) if train_labels[i] == 3]

level_train_docs = [POSI_train_docs,NEGA_train_docs,NEUT_train_docs,NONE_train_docs]

fmt = """Positive Sentences = {:d}
       \rNegative Sentences = {:d}
       \rNeutral  Sentences = {:d}
       \rNone Values        = {:d}"""

print(fmt.format(len(POSI_train_docs),
                 len(NEGA_train_docs),
                 len(NEUT_train_docs),
                 len(NONE_train_docs)))

Positive Sentences = 242
       Negative Sentences = 231
       Neutral  Sentences = 166
       None Values        = 361


In [6]:
minSentLvl = min(len(POSI_train_docs),len(NEGA_train_docs),len(NEUT_train_docs),len(NONE_train_docs))

print('Minimum number of sentences per level : ', minSentLvl)

Minimum number of sentences per level :  166


In [7]:
import random

new_train_docs = []
for i in range(len(level_train_docs)):
    level_per = random.sample(level_train_docs[i],len(level_train_docs[i]))
    new_train_docs.append(level_per[:minSentLvl])

In [8]:
print("New size of sentences:\n")
fmt = """Positive Sentences = {:d}
       \rNegative Sentences = {:d}
       \rNeutral  Sentences = {:d}
       \rNone Values        = {:d}"""

print(fmt.format(len(new_train_docs[0]),
                 len(new_train_docs[1]),
                 len(new_train_docs[2]),
                 len(new_train_docs[3])))

New size of sentences:

Positive Sentences = 166
       Negative Sentences = 166
       Neutral  Sentences = 166
       None Values        = 166


In [9]:
flat_train_docs = [item for sublist in new_train_docs for item in sublist]
shuf_train_docs = random.sample(flat_train_docs,len(flat_train_docs))

assert (len(shuf_train_docs) == 4 * minSentLvl)
print("shuf_train_docs size = ", len(shuf_train_docs))

shuf_train_docs size =  664


In [10]:
corpus = []
for doc in shuf_train_docs + val_docs:
    corpus.append(doc.content)

In [11]:
print("Sentences = ", (len(val_docs + shuf_train_docs)))

Sentences =  1164


In [12]:
shuf_train_labels = []
for doc in shuf_train_docs:
    shuf_train_labels.append(doc.polarity)
    
assert (len(shuf_train_labels) == len(shuf_train_docs))

In [13]:
counter = CountVectorizer(tokenizer=ut.tokenizer)

In [14]:
X = counter.fit_transform(corpus)
print(X.shape)

(1164, 4574)


In [15]:
test_docs   = xml.readXMLTest("../database/TASS/TASS2018/task1-Test.xml")

In [16]:
test_tweets = []
for doc in test_docs:
    test_tweets.append(doc.content)

In [17]:
assert (len(test_tweets) == 1428)

In [59]:
sequences = []
for tweet in corpus + test_tweets:
    sentence = []
    for word in ut.tokenizer(tweet):
        sentence.append(word)
    sequences.append(sentence)

In [60]:
assert (len(sequences) == (len(shuf_train_docs) + len(val_docs) + len(test_tweets)))

In [61]:
from collections import Counter

In [64]:
cnt = Counter(word for doc in sequences for word in doc )

In [65]:
cnt['felicidad'], len(cnt)

(11, 8025)

In [66]:
cnt.most_common()[-1]

('activacion', 1)

In [67]:
cnt.most_common()[:10]

[('que', 1451),
 ('de', 1222),
 ('y', 1034),
 ('a', 913),
 ('no', 855),
 ('la', 802),
 ('me', 730),
 ('el', 722),
 ('en', 661),
 ('es', 565)]

In [68]:
import collections

max_vocab = len(cnt)
min_freq = 0
#max_vocab = len(cnt)
#max_vocab = 4000
#min_freq  = 0


itos = [o for o,c in cnt.most_common(max_vocab) if c >= min_freq]

itos.insert(0, '_pad_')
itos.insert(0, '_unk_')

stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})
len(itos)

8027

In [69]:
print(len(cnt))

8025


In [70]:
len(stoi)

8027

In [71]:
import pickle

In [72]:
pickle.dump(itos, open('../database/ulmfit/tmp/itos_'+'2018'+'.pkl', 'wb'))

In [73]:
itos      = pickle.load(open('../database/ulmfit/tmp/itos_'+'2018'+'.pkl', 'rb'))

In [74]:
data_ids = [[stoi[o] for o in p] for p in sequences]

In [75]:
assert (len(sequences) == len(data_ids))

In [76]:
x_train_ids = np.array(data_ids[:len(shuf_train_labels)])
x_val_ids   = np.array(data_ids[len(shuf_train_labels):len(shuf_train_labels)+len(val_docs)])
x_test_ids  = np.array(data_ids[(len(shuf_train_labels)+len(val_docs)):])

print('Shape of data train tensor:', x_train_ids.shape)
print('Shape of data val  tensor:', x_val_ids.shape)
print('Shape of data test  tensor:', x_test_ids.shape)

Shape of data train tensor: (664,)
Shape of data val  tensor: (500,)
Shape of data test  tensor: (1428,)


In [77]:
SAVE_PATH = '../database/ulmfit/'
TASS_     = '2018'
np.save(SAVE_PATH + "tmp/train_labels_"+TASS_+".npy",shuf_train_labels)
np.save(SAVE_PATH + "tmp/val_labels_"+TASS_+".npy",val_labels)

In [78]:
np.save(SAVE_PATH + "tmp/train_ids_"+TASS_+".npy",x_train_ids)
np.save(SAVE_PATH + "tmp/val_ids_"+TASS_+".npy",x_val_ids)
np.save(SAVE_PATH + "tmp/test_ids_"+TASS_+".npy",x_test_ids)

In [63]:
sequences[0]

['ya',
 'me',
 'imagino',
 'su',
 'calor',
 'que',
 'debe',
 'estar',
 'haciendo',
 'alla',
 'ya',
 'me',
 'imagino',
 'negr']

In [79]:
itos

['_unk_',
 '_pad_',
 'que',
 'de',
 'y',
 'a',
 'no',
 'la',
 'me',
 'el',
 'en',
 'es',
 'mi',
 'lo',
 'un',
 'se',
 'por',
 'con',
 'te',
 'los',
 'pero',
 'ya',
 'para',
 'si',
 'una',
 'yo',
 'las',
 'mas',
 'como',
 'todo',
 'o',
 'al',
 'tu',
 'solo',
 'del',
 'esta',
 'cuando',
 'quiero',
 'dia',
 'porque',
 'mejor',
 'estoy',
 'hoy',
 'muy',
 'tengo',
 'feliz',
 'eso',
 'bueno',
 'q',
 'mis',
 'su',
 'bien',
 'siempre',
 'le',
 'ser',
 'ahora',
 'extra',
 'todos',
 'sin',
 'este',
 'gracias',
 'ver',
 'hay',
 'asi',
 'buen',
 'os',
 'hace',
 'tan',
 'son',
 'vida',
 'fue',
 'ni',
 'soy',
 'ao',
 'nos',
 'ma',
 'hacer',
 'nada',
 'mucho',
 'hasta',
 'eres',
 'ir',
 'dias',
 'he',
 'estar',
 'buena',
 'algo',
 'ana',
 'igual',
 'sea',
 'desde',
 'cosas',
 'tiempo',
 'nuevo',
 'vez',
 'tambien',
 'peru',
 'voy',
 'ese',
 'sus',
 'espero',
 'tus',
 'aun',
 'ti',
 'esa',
 'semana',
 'estas',
 'va',
 'creo',
 'sera',
 'donde',
 'cada',
 'puedo',
 'hola',
 'mal',
 'menos',
 'siento',
