In [1]:
import xmlreader as xml
import re
import pandas as pd
import os
import gensim
import sys
import numpy as np
import pickle
from utils import tokenize
from collections import Counter

DATA_PATH = '../database/'

In [2]:
train_docs = xml.readXML(DATA_PATH + "intertass2018-PE-train-tagged.xml")
test_docs  = xml.readXMLTest(DATA_PATH + "intertass2018-PE-test.xml" )

In [3]:
trn_text   = [t.content  for t in train_docs]
trn_label  = [t.polarity for t in train_docs]

test_text   = [t.content  for t in test_docs]
test_label  = [t.polarity for t in test_docs]

In [4]:
col_names = ['labels','text']

In [5]:
df_train = pd.DataFrame({'text':trn_text, 'labels':trn_label}, columns=col_names)
df_test  = pd.DataFrame({'text':test_text, 'labels':test_label}, columns=col_names)

In [6]:
CSV_PATH = DATA_PATH + 'csv_data/'

if not os.path.exists(CSV_PATH):
    os.makedirs(CSV_PATH)

In [7]:
df_train.to_csv(DATA_PATH + 'csv_data/train.csv', index=False)
df_test.to_csv(DATA_PATH  + 'csv_data/test.csv',  index=False)

In [8]:
df_train = pd.read_csv(DATA_PATH + 'csv_data/train.csv')
df_test = pd.read_csv(DATA_PATH + 'csv_data/test.csv')

In [9]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
labels    1000 non-null int64
text      1000 non-null object
dtypes: int64(1), object(1)
memory usage: 15.7+ KB


In [10]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1428 entries, 0 to 1427
Data columns (total 2 columns):
labels    1428 non-null object
text      1428 non-null object
dtypes: object(2)
memory usage: 22.4+ KB


In [11]:
tokenize(df_train['text'][0], None)

{'original': 'Sin ser fan de Juan Gabriel, siempre supe que era una fuerza de la naturaleza. Hoy escuché "Querida", y me dio una ternura enorme.',
 'words': ['sin',
  'ser',
  'fan',
  'de',
  'juan',
  'gabriel',
  'siempre',
  'supe',
  'que',
  'era',
  'una',
  'fuerza',
  'de',
  'la',
  'naturaleza',
  'hoy',
  'escuche',
  'querida',
  'y',
  'me',
  'dio',
  'una',
  'ternura',
  'enorme'],
 'ratio': 0.05,
 'clean': 'sin ser fan de juan gabriel, siempre supe que era una fuerza de la naturaleza. hoy escuche querida , y me dio una ternura enorme',
 'class': None}

In [12]:
len(df_train['text'])

1000

In [13]:
LOF_train = []
LOF_test  = []

for i in range(len(df_train['text'])):
    LOF_train.append(tokenize(df_train['text'][i], None)['words'])
    
for i in range(len(df_test['text'])):
    LOF_test.append(tokenize(df_test['text'][i], None)['words'])

In [14]:
print("Size of List Of Words (Train) : ", len(LOF_train))
print("Size of List Of Words (Test)  : ", len(LOF_test))

Size of List Of Words (Train) :  1000
Size of List Of Words (Test)  :  1428


In [15]:
LOF_test[0]

['siempre', 'hermosa', 'maria', 'gabriel']

In [16]:
cnt = Counter(word for doc in LOF_train + LOF_test for word in doc )

In [17]:
len(cnt)

7861

In [18]:
cnt.most_common()[:10]

[('que', 1341),
 ('de', 1198),
 ('y', 991),
 ('no', 777),
 ('la', 770),
 ('a', 769),
 ('me', 666),
 ('el', 662),
 ('en', 608),
 ('es', 506)]

In [19]:
max_vocab = 7000
min_freq = 2

itos = [o for o,c in cnt.most_common(max_vocab) if c > min_freq]

In [20]:
itos.insert(0, '_pad_')
itos.insert(0, '_unk_')

In [21]:
len(itos)

1504

In [22]:
import collections

stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})
len(itos)

1504

In [24]:
train_lm = np.array([[stoi[o] for o in p] for p in LOF_train])
test_lm  = np.array([[stoi[o] for o in p] for p in LOF_test])

In [28]:
np.savez(DATA_PATH + 'tmp/train_labels.npz', trn_label=trn_label)
np.savez(DATA_PATH + 'tmp/test_labels.npz', test_label=test_label)

In [25]:
np.savez(DATA_PATH + 'tmp/train_lm.npz', train_lm=train_lm)
np.savez(DATA_PATH + 'tmp/test_lm.npz', test_lm=test_lm)
pickle.dump(itos, open(DATA_PATH + 'tmp/itos.pkl', 'wb'))

In [26]:
train_lm[0]

[64,
 51,
 1056,
 3,
 825,
 0,
 45,
 0,
 2,
 127,
 24,
 0,
 3,
 6,
 579,
 41,
 0,
 1057,
 4,
 8,
 249,
 24,
 0,
 0]

In [91]:
tok_train = np.array(trn_lm)
tok_test  = np.array(val_lm)

#np.save(DATA_PATH + 'tmp/train_ids.npy', tok_train)
#np.save(DATA_PATH + 'tmp/test_ids.npy' , tok_test)

In [54]:
#train_ids = np.load(DATA_PATH + 'tmp/train_ids.npy')
#test_ids  = np.load(DATA_PATH + 'tmp/test_ids.npy')

In [55]:
for idx in tok_train[0]:
    print(vocab[idx], end =' ')

sin ser fan de juan gabriel siempre supe que era una fuerza de la naturaleza hoy escuche querida y me dio una ternura enorme 

In [56]:
df_train['text'][0]

'Sin ser fan de Juan Gabriel, siempre supe que era una fuerza de la naturaleza. Hoy escuché "Querida", y me dio una ternura enorme.'

In [17]:
trn_label[0]

1

In [57]:
for idx in tok_test[0]:
    print(vocab[idx], end =' ')

siempre hermosa maria gabriel 

In [58]:
cntIdx = Counter(idx for doc in list(tok_train) + list(tok_test) for idx in doc ).most_common()

In [59]:
len(cntIdx)

6151

In [60]:
cntIdx[:10]

[(148439, 2180),
 (6, 1341),
 (0, 1198),
 (5, 991),
 (19, 777),
 (2, 770),
 (7, 769),
 (189, 666),
 (4, 662),
 (3, 608)]

In [24]:
itos = [vocab[idx] for idx, _ in cntIdx]

itos.insert(0, '_pad_')
itos.insert(0, '_unk_')
itos.pop(2)

'UNK'

In [26]:
len(itos)

6152

In [27]:
pickle.dump(itos, open(DATA_PATH + 'tmp/itos.pkl', 'wb'))

In [28]:
itos[:10]

['_unk_', '_pad_', 'que', 'de', 'y', 'no', 'la', 'a', 'me', 'el']

In [29]:
itos = pickle.load(open(DATA_PATH + 'tmp/itos.pkl', 'rb'))

In [30]:
itos[:10]

['_unk_', '_pad_', 'que', 'de', 'y', 'no', 'la', 'a', 'me', 'el']

In [31]:
import collections

stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})
len(itos)

6152

In [32]:
train_ids = np.array([[stoi[o] for o in p] for p in train_ids])
test_ids  = np.array([[stoi[o] for o in p] for p in test_ids])

In [85]:
np.savez(DATA_PATH + 'tmp/train_ids.npz', train_ids)
np.savez(DATA_PATH + 'tmp/test_ids.npz' , train_ids)

In [35]:
train_ids[0]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [100]:
train_ids = np.load(DATA_PATH + 'tmp/train_lm.npz')
test_ids  = np.load(DATA_PATH + 'tmp/test_lm.npz')
itos = pickle.load(open(DATA_PATH + 'tmp/itos.pkl', 'rb'))

In [102]:
train_ids['train_lm'][0]

[64,
 51,
 1056,
 3,
 825,
 0,
 45,
 0,
 2,
 127,
 24,
 0,
 3,
 6,
 579,
 41,
 0,
 1057,
 4,
 8,
 249,
 24,
 0,
 0]