# Comparison Benchmarks
We will compare the previous results to:  
(1) Training on the original data  
(2) Evaluating the translations  

## Generate translations for test data

In [3]:
import requests

In [14]:
params = {
    'key':'trnsl.1.1.20180425T012837Z.34aff5ff1fbca7f9.449ba9cd74e4c4e829e77e60507b69d30b425d56',
    'text':['hello world', 'this is great', 'one more'],
    'lang':'es'
}
r = requests.post('https://translate.yandex.net/api/v1.5/tr.json/translate', data=params)

In [15]:
r.json()['text']

['hola mundo', 'esta es una gran', 'uno de los más']

In [16]:
import csv
import numpy as np

In [72]:
# Loading french data 
french_data = "../../datasets/sentiment/french/French-Sentiment-Analysis-Dataset/tweets.csv"
labels = []
tweets = []
with open(french_data) as handle:
    i = 0
    for line in handle.readlines():
        i += 1
        if i == 1:
            continue
        try:
            label = int(line[0])
        except:
            continue
        text = line[2:]
        tweets.append(text)
        labels.append(label)

In [73]:
# Downsample 500 from each class of 0, 2 and 4
test_tweets = []
test_labels = []
zero = 0
two = 0
four = 0
max_count = 500
for l, t in zip(labels, tweets):
    if zero < max_count and l == 0:
        zero += 1
        test_tweets.append(t)
        test_labels.append(l)
    if two < max_count and l == 2:
        two += 1
        test_tweets.append(t)
        test_labels.append(l)
    if four < max_count and l == 4:
        four += 1
        test_tweets.append(t)
        test_labels.append(l)
        

In [27]:
np.mean([len(t) for t in test_tweets])

80.839

In [23]:
# Loading italian data 
italian_data = "../../datasets/sentiment/italian/test_set_sentipolc16_gold2000.csv"
test_labels = []
tweets = []
with open(italian_data) as handle:
    i = 0
    reader = csv.reader(handle)
    for row in reader:
        i += 1
        text = row[8]
        label_p = int(row[2])
        label_n = int(row[3])
        label = 1
        if label_p == 1:
            label = 2
        if label_n == 1:
            label = 0
        if label_p + label_n == 2:
            continue
        tweets.append(text)
        test_labels.append(label)

In [26]:
np.mean([len(t) for t in tweets])

109.70438328236493

## Download and store all translations

In [43]:
def get_translation(text_list, lang):
    params = {
        'key':'trnsl.1.1.20180425T012837Z.34aff5ff1fbca7f9.449ba9cd74e4c4e829e77e60507b69d30b425d56',
        'text':text_list,
        'lang':lang
    }
    r = requests.post('https://translate.yandex.net/api/v1.5/tr.json/translate', data=params)
    return r.json()['text']

In [44]:
# req_size = 50
# i = 0 
# french_translations = []
# while i < len(test_tweets):
#     end = i + req_size
#     if end > len(test_tweets):
#         end = len(test_tweets)
#     trans = get_translation(test_tweets[i:end], 'fr-en')
#     french_translations += trans
#     i += req_size


# req_size = 50
# i = 0 
# italian_translations = []
# while i < len(tweets):
#     end = i + req_size
#     if end > len(tweets):
#         end = len(tweets)
#     trans = get_translation(tweets[i:end], 'it-en')
#     italian_translations += trans
#     i += req_size

In [45]:
# import pickle
# with open("french_trans.pkl", 'wb') as handle:
#     pickle.dump(french_translations, handle)
    
# with open("it_trans.pkl", 'wb') as handle:
#     pickle.dump(italian_translations, handle)

In [46]:
len(french_translations)

1000

In [47]:
len(italian_translations)

1962

In [48]:
italian_translations[-1]

'You do not see the hour that is here, at least, I will miss you more often and I will finally be able to be with him, in the face of the one who has friendzonato.'

# French translation eval

In [53]:
# Tokenize text
import spacy
nlp = spacy.load('en')
test_tweets = [nlp(t) for t in french_translations]

In [64]:
# Load embeddings
#fr_file = '../../embeddings/wiki.multi.fr.vec'
#it_file = '../../embeddings/wiki.multi.it.vec'
en_file = '../../embeddings/wiki.multi.en.vec'
lang_files = [en_file]

embeddings = {}
for lang_f in lang_files:
    lang = lang_f[-6:-4]
    embeddings[lang] = {}
    with open(lang_f, 'r') as handle:
        csv_file = csv.reader(handle, delimiter=' ', quotechar="|")
        i = 0
        for row in csv_file:
            if len(row) != 301:
                continue
            word = row[0]
            vec = np.array(row[1:]).astype(np.float)
            embeddings[lang][word] = vec
            i += 1
            if i % 50000 == 0:
                print(i/2000., "% complete ...")

25.0 % complete ...
50.0 % complete ...
75.0 % complete ...
100.0 % complete ...


In [55]:
# Unfortunately we only have embeddings for 76% of the tokens
found = []
for t in test_tweets:
    for tok in t:
        if tok.text in embeddings['en']:
            found.append(1)
        else:
            found.append(0)
print (np.mean(found))

0.7635722812802


In [57]:
import sys
sys.path.append("../")

In [61]:
import lstm_bilstm
import os

In [62]:
name = "semeval"
base_dir = '../models/bilstm/'+ name +'/run1'
best_weights = "weights.006-0.6337.hdf5"
clf = lstm_bilstm.load_model(os.path.join(base_dir, best_weights))

In [65]:
# create embedding weights with vocab and zeros
vocab = set()
for sent in test_tweets:
    for word in sent:
        if word.text in embeddings['en']:
            vocab.add(word.text)
        
# replace embedding in model
en_embeddings = np.zeros_like(clf.layers[0].get_weights()[0])
word_2_index = {}
for i, word in enumerate(vocab):
    word_2_index[word] = i+1
    en_embeddings[i+1] = embeddings['en'][word]

    
# encode sentences with new index
clf_fr = lstm_bilstm.load_model(os.path.join(base_dir, best_weights))
clf_fr.layers[0].set_weights([en_embeddings])

In [69]:
from Utils.WordVecs import *
from Utils.MyMetrics import *
from Utils.Datasets import *
from Utils.Semeval_2013_Dataset import *
dataset = lstm_bilstm.Semeval_Dataset('../datasets/semeval',
                                                None, rep=words,
                                                one_hot=True)
max_length = 0
for sent in list(dataset._Xtrain) + list(dataset._Xdev) + list(dataset._Xtest):
    if len(sent) > max_length:
        max_length = len(sent)


In [70]:
def encode_sent(sent, word_idx_map, max_length=57):
    encoded = np.array([word_idx_map[w.text] for w in sent if w.text in word_idx_map])
    return encoded

test_data = []
for sent in test_tweets:
    test_data.append(encode_sent(sent, word_2_index))
test_data = lstm_bilstm.pad_sequences(test_data, max_length)
    
pred = clf_fr.predict(test_data)

In [74]:
true_labels = np.zeros_like(pred)
for i, l in enumerate(test_labels):
    pos = int(l/2)
    true_labels[i][pos] = 1.

In [75]:
# labels = sorted(set(dataset._ytrain.argmax(1)))
# mm = MyMetrics(true_labels, pred, labels=labels, average='micro')
# acc, precision, recall, micro_f1 = mm.get_scores()
# print(micro_f1)

0.373


In [77]:
labels = sorted(set(dataset._ytrain.argmax(1)))
mm = MyMetrics(true_labels[:,[0,2]], pred[:,[0,2]], labels=labels, average='binary')
acc, precision, recall, micro_f1 = mm.get_scores()
print(acc, precision, recall, micro_f1)

0.616 0.5845481049562682 0.802 0.6762225969645869


## Italian

In [201]:
# Loading french data 
italian_data = "../../datasets/sentiment/italian/test_set_sentipolc16_gold2000.csv"
test_labels = []
tweets = []
with open(italian_data) as handle:
    i = 0
    reader = csv.reader(handle)
    for row in reader:
        i += 1
        text = row[8]
        label_p = int(row[2])
        label_n = int(row[3])
        label = 1
        if label_p == 1:
            label = 2
        if label_n == 1:
            label = 0
        if label_p + label_n == 2:
            continue
        tweets.append(text)
        test_labels.append(label)

In [209]:
np.sum(np.array(test_labels) == 0)

733

In [210]:
# Tokenize text
it_nlp = spacy.load('en')
test_tweets = [it_nlp(t) for t in italian_translations]

In [211]:
found = []
for t in test_tweets:
    for tok in t:
        if tok.text in embeddings['en']:
            found.append(1)
        else:
            found.append(0)
print (np.mean(found))

0.723447680351676


In [212]:
# create embedding weights with vocab and zeros
it_vocab = set()
for sent in test_tweets:
    for word in sent:
        if word.text.lower() in embeddings['en']:
            it_vocab.add(word.text.lower())
        
# replace embedding in model
it_embeddings = np.zeros_like(clf.layers[0].get_weights()[0])
it_word_2_index = {}
for i, word in enumerate(it_vocab):
    it_word_2_index[word] = i+1
    it_embeddings[i+1] = embeddings['en'][word]

    
# encode sentences with new index
clf_it = lstm_bilstm.load_model(os.path.join(base_dir, best_weights))
clf_it.layers[0].set_weights([it_embeddings])

In [215]:
def encode_sent(sent, word_idx_map, max_length=57):
    encoded = np.array([word_idx_map[w.text.lower()] for w in sent if w.text.lower() in word_idx_map])
    return encoded

test_data = []
for sent in test_tweets:
    test_data.append(encode_sent(sent, it_word_2_index))
test_data = lstm_bilstm.pad_sequences(test_data, max_length)
    
pred = clf_it.predict(test_data)

In [250]:
# true_labels = np.zeros_like(pred)
# for i, l in enumerate(test_labels):
#     true_labels[i][l] = 1.
true_labels = keras.utils.to_categorical(test_labels)
labels = sorted(set(dataset._ytrain.argmax(1)))
mm = MyMetrics(true_labels, pred, labels=labels, average='micro')
acc, precision, recall, micro_f1 = mm.get_scores()
print(micro_f1)

0.4031600407747197


# Results
French translation results: F1 - 0.676  
Italian translation results F1 - 0.427  

French zero-shot: F1 - 0.669  
Italian zero-shot: F1 - 0.513  

# Training with data on different languages

In [86]:
len(dataset._Xtrain)

6021

In [87]:
# Loading french data 
french_data = "../../datasets/sentiment/french/French-Sentiment-Analysis-Dataset/tweets.csv"
labels = []
tweets = []
with open(french_data) as handle:
    i = 0
    for line in handle.readlines():
        i += 1
        if i == 1:
            continue
        try:
            label = int(line[0])
        except:
            continue
        text = line[2:]
        tweets.append(text)
        labels.append(label)

In [106]:
# Downsample 500 from each class of 0, 2 and 4
fr_test_tweets = []
fr_test_labels = []
fr_train_tweets = []
fr_train_labels = []
zero = 0
four = 0
max_count = 3500
max_count_test = 500
for l, t in zip(labels, tweets):
    if zero < max_count_test and l == 0:
        zero += 1
        fr_test_tweets.append(t)
        fr_test_labels.append(l)
    if four < max_count_test and l == 4:
        four += 1
        fr_test_tweets.append(t)
        fr_test_labels.append(1)
    if zero >= max_count_test and zero < max_count and l == 0:
        zero += 1
        fr_train_tweets.append(t)
        fr_train_labels.append(l)
    if four >= max_count_test and four < max_count and l == 4:
        four += 1
        fr_train_tweets.append(t)
        fr_train_labels.append(1)
        

In [104]:
len(fr_train_labels)

6000

In [116]:
fr_nlp = spacy.load('fr')
fr_train_tweets = [fr_nlp(t) for t in fr_train_tweets]
fr_test_tweets = [fr_nlp(t) for t in fr_test_tweets]

# Train the raw French model

In [189]:
name = "semeval"
bi = True
dataset_raw = lstm_bilstm.Semeval_Dataset('../datasets/semeval',
                                                None, rep=words,
                                                one_hot=True)
dataset = lstm_bilstm.Semeval_Dataset('../datasets/semeval',
                                                None, rep=words,
                                                one_hot=True)

vecs = WordVecs('../../embeddings/wiki.multi.fr.vec', 'word2vec')
dim = vecs.vector_size
max_length = 57
vocab = {}
# for sent in list(dataset._Xtrain) + list(dataset._Xdev) + list(dataset._Xtest):
#     if len(sent) > max_length:
#         max_length = len(sent)
#     for w in sent:
#         if w not in vocab:
#             vocab[w] = 1
#         else:
#             vocab[w] += 1

for sent in fr_train_tweets:
    for w in sent:
        if w.text not in vocab:
            vocab[w.text.lower()] = 1
        else:
            vocab[w.text.lower()] += 1
            
wordvecs = {}
for w in vecs._w2idx.keys():
    if w in vocab:
        wordvecs[w] = vecs[w]

In [190]:
# dataset train and val x,y
train = False
W, word_idx_map = lstm_bilstm.get_W(wordvecs)
output_dim = 2
name = 'semeval'
bi=True
dev_params_file = '../dev_params/300_bilstm.dev.txt'
best_dim, best_dropout, best_epoch, best_f1 = lstm_bilstm.get_dev_params(name, dev_params_file, bi,
                   dataset._Xtrain, dataset._ytrain, dataset._Xdev, dataset._ydev, wordvecs)

In [198]:
import keras

def encode_sent(sent, word_idx_map, max_length=57):
    encoded = np.array([word_idx_map[w.text.lower()] for w in sent if w.text.lower() in word_idx_map])
    return encoded

fr_train_data = []
for sent in fr_train_tweets:
    fr_train_data.append(encode_sent(sent, word_idx_map))
fr_train_data = lstm_bilstm.pad_sequences(fr_train_data, max_length)

fr_test_data = []
for sent in fr_test_tweets:
    fr_test_data.append(encode_sent(sent, word_idx_map))
fr_test_data = lstm_bilstm.pad_sequences(fr_test_data, max_length)

fr_train_y = keras.utils.to_categorical(fr_train_labels)
fr_test_y = keras.utils.to_categorical(fr_test_labels)

In [199]:
import pathlib
from keras.callbacks import ModelCheckpoint
train=False
new_name="french_basic"
run =2
clf_french = lstm_bilstm.create_BiLSTM(wordvecs, best_dim, output_dim, best_dropout, weights=W, train=train)
pathlib.Path('models/bilstm/' + new_name +'/run'+ str(run)).mkdir(parents=True, exist_ok=True)
checkpoint = ModelCheckpoint('models/bilstm/' + new_name +'/run'+ str(run)+'/weights.{epoch:03d}-{val_acc:.4f}.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='auto')


h = clf_french.fit(fr_train_data, fr_train_y, validation_data=[fr_test_data, fr_test_y],
            epochs=best_epoch, verbose=1, callbacks=[checkpoint])

Train on 6000 samples, validate on 1000 samples
Epoch 1/6

Epoch 00001: val_acc improved from -inf to 0.64700, saving model to models/bilstm/french_basic/run2/weights.001-0.6470.hdf5
Epoch 2/6

Epoch 00002: val_acc did not improve
Epoch 3/6

Epoch 00003: val_acc improved from 0.64700 to 0.67300, saving model to models/bilstm/french_basic/run2/weights.003-0.6730.hdf5
Epoch 4/6

Epoch 00004: val_acc improved from 0.67300 to 0.68600, saving model to models/bilstm/french_basic/run2/weights.004-0.6860.hdf5
Epoch 5/6

Epoch 00005: val_acc did not improve
Epoch 6/6

Epoch 00006: val_acc improved from 0.68600 to 0.69300, saving model to models/bilstm/french_basic/run2/weights.006-0.6930.hdf5


In [200]:
pred = clf_french.predict(fr_test_data)
true_labels = np.zeros_like(pred)
# for i, l in enumerate(fr_test_labels):
#     true_labels[i][l] = 1.
labels = sorted(set(dataset._ytrain.argmax(1)))
mm = MyMetrics(fr_test_y, pred, labels=labels, average='micro')
acc, precision, recall, micro_f1 = mm.get_scores()
print(micro_f1)

0.693
