# Sentiment Analysis usando Deep Learning para español en textos cortos

# 1. Data Cleaning 

We work on a dataset of tweets labelled as positive or negative, more information about the dataset 
can be found on http://www.sepln.org/workshops/tass/. 

The dataset that we used can download here:

- http://www.sepln.org/workshops/tass/2017/#datasets
- http://www.sepln.org/workshops/tass/2018/#datasets

Some description about this datasets:

1. ----
2. ----
3. ----

# 2. Vocabulary

## 2.2 Getting train, validation and test subsets

In [23]:
import lib.xmlreader as xml
import lib.utils as ut
import numpy as np
import pandas as pd

In [7]:
train_docs= xml.readXML("../database/TASS/TASS2018/task1-Training.xml")
test_docs= xml.readXML("../database/TASS/TASS2018/task1-Development.xml")

In [8]:
train_tweets = []
train_labels = []
for doc in train_docs:
    train_tweets.append(ut.tokenize(doc.content, 0)['clean'])
    train_labels.append(doc.polarity)

test_tweets = []
test_labels = []
for doc in test_docs:
    test_tweets.append(ut.tokenize(doc.content, 0)['clean'])
    test_labels.append(doc.polarity)

In [44]:
from sklearn.model_selection import train_test_split

SEED = 2000
x_train, x_validation_and_test, y_train, y_validation_and_test = (pd.Series(train_tweets), pd.Series(test_tweets), pd.Series(train_labels), pd.Series(test_labels))
#x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(x, y, test_size=.02, random_state=SEED)
x_validation, x_test, y_validation, y_test = train_test_split(x_validation_and_test, y_validation_and_test, test_size=.5, random_state=SEED)

In [45]:
fmt1_ = "Train set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive"
print(fmt1_.format(len(x_train),
      (len(x_train[y_train == 0]) / (len(x_train)*1.))*100,
      (len(x_train[y_train == 1]) / (len(x_train)*1.))*100))
      
fmt2_ = "Validation set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive"
print(fmt2_.format(len(x_validation),
      (len(x_validation[y_validation == 0]) / (len(x_validation)*1.))*100,
      (len(x_validation[y_validation == 1]) / (len(x_validation)*1.))*100))

fmt3_ = "Test set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive"
print(fmt3_.format(len(x_test),
      (len(x_test[y_test == 0]) / (len(x_test)*1.))*100,
      (len(x_test[y_test == 1]) / (len(x_test)*1.))*100))

Train set has total 473 entries with 51.16% negative, 48.84% positive
Validation set has total 100 entries with 54.00% negative, 46.00% positive
Test set has total 101 entries with 51.49% negative, 48.51% positive


## 2.3 Sentence as a list of words

In [46]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import gensim
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import TaggedDocument
import multiprocessing
from sklearn import utils

Function that replace sentences as a list of words.

In [47]:
def labelize_tweets(tweets,label):
    result = []
    prefix = label
    for i, t in zip(tweets.index, tweets):
        result.append(TaggedDocument(t.split(), [prefix + '_%s' % i]))
    return result

Concat train, validation and test subsets.

In [49]:
all_x = pd.concat([x_train,x_validation,x_test])
all_x_w2v = labelize_tweets(all_x, 'all')

## 2.4 Building Vocabulary

### 2.4.1 Using a Continuous Bag Of Words

In [50]:
cores = multiprocessing.cpu_count()
model_cbow = Word2Vec(sg=0, size=100, negative=5, window=2, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model_cbow.build_vocab([x.words for x in tqdm(all_x_w2v)])

100%|██████████| 674/674 [00:00<?, ?it/s]


In [51]:
%%time
for epoch in range(30):
    model_cbow.train(utils.shuffle([x.words for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
    model_cbow.alpha -= 0.002
    model_cbow.min_alpha = model_ug_cbow.alpha

100%|██████████| 674/674 [00:00<?, ?it/s]


NameError: name 'model_ug_cbow' is not defined

## 2.4.2 Using a Skip-gram

In [52]:
model_sg = Word2Vec(sg=1, size=100, negative=5, window=2, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model_sg.build_vocab([x.words for x in tqdm(all_x_w2v)])

100%|██████████| 674/674 [00:00<?, ?it/s]


In [53]:
%%time
for epoch in range(30):
    model_sg.train(utils.shuffle([x.words for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
    model_sg.alpha -= 0.002
    model_sg.min_alpha = model_ug_sg.alpha

100%|██████████| 674/674 [00:00<?, ?it/s]


NameError: name 'model_ug_sg' is not defined

In [54]:
model_cbow.save('data/vocab/w2v_model_cbow.word2vec')
model_sg.save('data/vocab/w2v_model_sg.word2vec')

FileNotFoundError: [Errno 2] No such file or directory: 'data/vocab/w2v_model_cbow.word2vec'

# 3. Preparing Data

## 3.1 Word to Number

In [None]:
from gensim.models import KeyedVectors

model_cbow = KeyedVectors.load('data/vocab/w2v_model_cbow.word2vec')
model_sg   = KeyedVectors.load('data/vocab/w2v_model_sg.word2vec')

In [None]:
len(model_cbow.wv.vocab.keys())

We **concatenate vectors of the two models**, build a only vector of 200 dimension, 100 of each model.

In [None]:
embeddings_index = {}
for w in model_cbow.wv.vocab.keys():
    embeddings_index[w] = np.append(model_cbow.wv[w],model_sg.wv[w])
print('Found %s word vectors.' % len(embeddings_index))

Each word is represented as a number.

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=100000)
tokenizer.fit_on_texts(x_train)
sequences = tokenizer.texts_to_sequences(x_train)

In [None]:
len(tokenizer.word_index)

In [None]:
for x in x_train[:5]:
    print(x)

In [None]:
sequences[:5]

In [None]:
length = []
for x in x_train:
    length.append(len(x.split()))

In [None]:
max(length)

Setting all the data to have the same length of 45

In [None]:
x_train_seq = pad_sequences(sequences, maxlen=45)
print('Shape of data tensor:', x_train_seq.shape)

By default, if a sentence length is shorter than the maximum length the function replace by zero value at the beginning.

In [None]:
x_train_seq[:5]

In [None]:
sequences_val = tokenizer.texts_to_sequences(x_validation)
x_val_seq     = pad_sequences(sequences_val, maxlen=45)

When we transform sentences to sequence we set a number to each sentence, these numbers are positions of where word embeddings are stored in the index word of the tokenizer. We build a matrix where the number will be the index of its embedding vector of word.

In [None]:
num_words = 100000
embedding_matrix = np.zeros((num_words, 200))
for word, i in tokenizer.word_index.items():
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
np.array_equal(embedding_matrix[8] ,embeddings_index.get('you'))

# Model: Convolutional Neural Network

In [None]:
seed = 7

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

from keras.layers import Input, Dense, concatenate, Activation
from keras.models import Model
from keras.layers import Conv1D, GlobalMaxPooling1D

tweet_input = Input(shape=(45,), dtype='int32')

tweet_encoder = Embedding(100000, 200, weights=[embedding_matrix], input_length=45, trainable=True)(tweet_input)
bigram_branch = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(tweet_encoder)
bigram_branch = GlobalMaxPooling1D()(bigram_branch)
trigram_branch = Conv1D(filters=100, kernel_size=3, padding='valid', activation='relu', strides=1)(tweet_encoder)
trigram_branch = GlobalMaxPooling1D()(trigram_branch)
fourgram_branch = Conv1D(filters=100, kernel_size=4, padding='valid', activation='relu', strides=1)(tweet_encoder)
fourgram_branch = GlobalMaxPooling1D()(fourgram_branch)
merged = concatenate([bigram_branch, trigram_branch, fourgram_branch], axis=1)

merged = Dense(256, activation='relu')(merged)
merged = Dropout(0.2)(merged)
merged = Dense(1)(merged)
output = Activation('sigmoid')(merged)
model = Model(inputs=[tweet_input], outputs=[output])
model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
model.summary()

In [None]:
from keras.callbacks import ModelCheckpoint

filepath="model/CNN_best_weights.{epoch:02d}-{val_acc:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

model.fit(x_train_seq, y_train, batch_size=32, epochs=5,
                     validation_data=(x_val_seq, y_validation), callbacks = [checkpoint])

In [None]:
from keras.models import load_model

loaded_CNN_model = load_model('model/CNN_best_weights.02-0.8333.hdf5')
loaded_CNN_model.evaluate(x=x_val_seq, y=y_validation)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tvec = TfidfVectorizer(max_features=100000,ngram_range=(1, 3))
tvec.fit(x_train)

In [None]:
x_train_tfidf = tvec.transform(x_train)
x_test_tfidf = tvec.transform(x_test)

In [None]:
lr_with_tfidf = LogisticRegression()
lr_with_tfidf.fit(x_train_tfidf,y_train)

In [None]:
lr_with_tfidf.score(x_test_tfidf,y_test)

In [None]:
yhat_lr = lr_with_tfidf.predict_proba(x_test_tfidf)

In [None]:
sequences_test = tokenizer.texts_to_sequences(x_test)
x_test_seq = pad_sequences(sequences_test, maxlen=45)

In [None]:
loaded_CNN_model.evaluate(x=x_test_seq, y=y_test)

In [None]:
yhat_cnn = loaded_CNN_model.predict(x_test_seq)

In [None]:
from sklearn.metrics import roc_curve, auc

fpr, tpr, threshold = roc_curve(y_test, yhat_lr[:,1])
roc_auc = auc(fpr, tpr)
fpr_cnn, tpr_cnn, threshold = roc_curve(y_test, yhat_cnn)
roc_auc_nn = auc(fpr_cnn, tpr_cnn)
plt.figure(figsize=(8,7))
plt.plot(fpr, tpr, label='tfidf-logit (area = %0.3f)' % roc_auc, linewidth=2)
plt.plot(fpr_cnn, tpr_cnn, label='w2v-CNN (area = %0.3f)' % roc_auc_nn, linewidth=2)

plt.plot([0, 1], [0, 1], 'k--', linewidth=2)
plt.xlim([-0.05, 1.0])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive Rate', fontsize=18)
plt.title('Receiver operating characteristic: is positive', fontsize=18)
plt.legend(loc="lower right")
plt.show()