In [None]:
import sys
sys.path.append("../")

In [None]:
import lstm_bilstm
from Utils.WordVecs import *
from Utils.MyMetrics import *
from Utils.Datasets import *
from Utils.Semeval_2013_Dataset import *

In [None]:
# Experiment configuration: dataset name and whether the model is bidirectional.
name = "semeval"
bi = True

# Two separate instances are loaded with identical arguments: `dataset` is
# rebound to its index-converted form in a later cell, `dataset_raw` is not.
dataset_raw = lstm_bilstm.Semeval_Dataset(
    '../datasets/semeval', None, rep=words, one_hot=True)
dataset = lstm_bilstm.Semeval_Dataset(
    '../datasets/semeval', None, rep=words, one_hot=True)

# Pretrained English vectors and their dimensionality.
vecs = WordVecs('../embeddings/wiki.multi.en.vec', 'word2vec')
dim = vecs.vector_size

# One pass over train + dev + test: record the longest sentence and count
# how often each word occurs.
all_sents = list(dataset._Xtrain) + list(dataset._Xdev) + list(dataset._Xtest)
max_length = 0
vocab = {}
for sent in all_sents:
    max_length = max(max_length, len(sent))
    for w in sent:
        vocab[w] = vocab.get(w, 0) + 1

# Keep only the pretrained vectors for words that occur in the data.
wordvecs = {w: vecs[w] for w in vecs._w2idx.keys() if w in vocab}

In [None]:
# The return value is discarded, so any effect of this call is through
# mutating its arguments (presumably it adds vectors to `wordvecs` for
# vocabulary words that have none -- confirm in lstm_bilstm).
lstm_bilstm.add_unknown_words(wordvecs, vocab, min_df=1, dim=dim)
W, word_idx_map = lstm_bilstm.get_W(wordvecs, dim=dim)

# NOTE: `dataset` is rebound to its converted form here, so this cell is not
# safe to run twice without re-running the loading cell first.
dataset = lstm_bilstm.convert_dataset(dataset, word_idx_map, max_length)

# Look up the best hyper-parameters recorded for this dataset.
dev_params_file = '../dev_params/300_bilstm.dev.txt'
dev_params = lstm_bilstm.get_dev_params(
    name, dev_params_file, bi,
    dataset._Xtrain, dataset._ytrain,
    dataset._Xdev, dataset._ydev,
    wordvecs)
best_dim, best_dropout, best_epoch, best_f1 = dev_params

In [None]:
# Checkpoint file to evaluate, and the run directory (per dataset name) that
# contains it.
best_weights = "weights.006-0.6337.hdf5"
base_dir = ''.join(['../models/bilstm/', name, '/run1'])

In [None]:
# Restore the trained classifier from the selected checkpoint.
# NOTE(review): `os` is never imported explicitly in this notebook; presumably
# it leaks in through one of the `from Utils... import *` lines -- confirm.
weights_path = os.path.join(base_dir, best_weights)
clf = lstm_bilstm.load_model(weights_path)

In [None]:
# Predict on the English test split.
pred = clf.predict(dataset._Xtest, verbose=1)

# The class ids are the distinct argmax positions of the training targets.
train_class_ids = dataset._ytrain.argmax(1)
labels = sorted(set(train_class_ids))

# Micro-averaged scores; only the F1 is reported.
mm = MyMetrics(dataset._ytest, pred, labels=labels, average='micro')
acc, precision, recall, micro_f1 = mm.get_scores()
print(micro_f1)

# Zero shot benchmark
- Load data from another language (tweets)
- Tokenize the data
- Load embeddings for that language
- Run the model on the other language with no training data

In [None]:
import csv
import numpy as np

In [None]:
# Load the French tweets.
# Each data line is read as: a one-character integer label at position 0,
# one separator character at position 1, and the tweet text from position 2
# to the end of the line (the trailing newline is kept, as before).
french_data = "../../datasets/sentiment/french/French-Sentiment-Analysis-Dataset/tweets.csv"
labels = []
tweets = []
with open(french_data) as handle:
    # The first line is always skipped (treated as a header).
    next(handle, None)
    for line in handle:
        try:
            label = int(line[0])
        except ValueError:
            # First character is not a digit, so this is not a data row.
            # Only ValueError is caught: a bare `except` would also swallow
            # KeyboardInterrupt and genuine programming errors.
            continue
        tweets.append(line[2:])
        labels.append(label)

In [None]:
# Downsample: keep the first `max_count` tweets of each of the classes
# 0, 2 and 4, preserving the original order. Any other label is dropped.
max_count = 500
zero = two = four = 0
test_tweets = []
test_labels = []
for lbl, tweet in zip(labels, tweets):
    if lbl == 0 and zero < max_count:
        zero += 1
    elif lbl == 2 and two < max_count:
        two += 1
    elif lbl == 4 and four < max_count:
        four += 1
    else:
        # Unknown label, or this class already has its quota.
        continue
    test_tweets.append(tweet)
    test_labels.append(lbl)
        

In [None]:
# Tokenize the sampled French tweets with spaCy.
# NOTE(review): 'fr' looks like a spaCy shortcut name; newer spaCy releases
# presumably need the full model package name instead -- confirm against the
# installed version.
import spacy
fr_nlp = spacy.load('fr')
# `test_tweets` is rebound from raw strings to the objects returned by
# `fr_nlp`, so run this cell once per kernel: re-running it would feed those
# objects back into `fr_nlp`.
test_tweets = list(map(fr_nlp, test_tweets))

In [None]:
# Load the French and Italian word embeddings into
# `embeddings[lang][word] -> np.ndarray` (float64).
fr_file = '../../embeddings/wiki.multi.fr.vec'
it_file = '../../embeddings/wiki.multi.it.vec'
lang_files = [fr_file, it_file]

embeddings = {}
for lang_f in lang_files:
    # Two-letter language code taken from the file name ("...fr.vec" -> "fr").
    lang = lang_f[-6:-4]
    embeddings[lang] = {}
    # Explicit UTF-8 so decoding does not depend on the platform locale;
    # newline='' is what the csv module expects from its input file.
    with open(lang_f, 'r', encoding='utf-8', newline='') as handle:
        # QUOTE_NONE: the file is plain space-separated text. With a quote
        # character configured, a word that starts with that character would
        # open a "quoted" field and swallow the following rows.
        csv_file = csv.reader(handle, delimiter=' ', quoting=csv.QUOTE_NONE)
        i = 0
        for row in csv_file:
            # Keep only rows of exactly 301 fields (the word plus 300 vector
            # components); anything else is skipped.
            if len(row) != 301:
                continue
            word = row[0]
            # `np.float` has been removed from NumPy; np.float64 is the same
            # dtype it used to alias.
            vec = np.asarray(row[1:], dtype=np.float64)
            embeddings[lang][word] = vec
            i += 1
            if i % 50000 == 0:
                # The percentage assumes 200,000 rows per file.
                print(i/2000., "% complete ...")

In [None]:
# Unfortunately we only have embeddings for 76% of the tokens
# One flag per token across all tweets: 1 if its text has a French
# embedding, 0 otherwise. The mean of the flags is the coverage.
fr_known = embeddings['fr']
found = [1 if tok.text in fr_known else 0
         for t in test_tweets
         for tok in t]
print(np.mean(found))

In [None]:
# Vocabulary: every distinct French token text that has a pretrained vector.
fr_vocab = {word.text
            for sent in test_tweets
            for word in sent
            if word.text in embeddings['fr']}

# Replacement weight matrix with the same shape as the trained model's
# first-layer weights (presumably the embedding layer -- confirm). Row 0 is
# left as zeros; words are assigned rows 1, 2, ...
# NOTE(review): the row count is fixed by the original matrix, so a French
# vocabulary with more words than available rows would raise IndexError.
fr_embeddings = np.zeros_like(clf.layers[0].get_weights()[0])
fr_word_2_index = {}
for row, word in enumerate(fr_vocab, start=1):
    fr_word_2_index[word] = row
    fr_embeddings[row] = embeddings['fr'][word]

# Load a fresh copy of the model and swap the French matrix into layer 0,
# leaving `clf` untouched.
clf_fr = lstm_bilstm.load_model(os.path.join(base_dir, best_weights))
clf_fr.layers[0].set_weights([fr_embeddings])

In [None]:
def encode_sent(sent, word_idx_map, max_length=57):
    """Map a tokenized sentence to an array of vocabulary indices.

    Tokens whose ``.text`` is not a key of ``word_idx_map`` are dropped, so
    the result can be shorter than ``sent``. No padding or truncation is
    applied here.

    Args:
        sent: iterable of tokens exposing a ``.text`` attribute.
        word_idx_map: mapping from token text to index.
        max_length: unused by this function; kept in the signature.

    Returns:
        ``np.ndarray`` holding the indices of the known tokens, in order.
    """
    indices = []
    for token in sent:
        if token.text in word_idx_map:
            indices.append(word_idx_map[token.text])
    return np.array(indices)

# Encode every French tweet with the French index map, then pad with
# `max_length` (the longest sentence found in the English SemEval splits).
encoded_tweets = [encode_sent(sent, fr_word_2_index) for sent in test_tweets]
test_data = lstm_bilstm.pad_sequences(encoded_tweets, max_length)

# Classify with the model whose first-layer weights were replaced.
pred = clf_fr.predict(test_data)

In [None]:
# One-hot ground truth with the same shape as `pred`.
# Each label is halved to get its column: 0 -> 0, 2 -> 1, 4 -> 2.
true_labels = np.zeros_like(pred)
for row_idx, raw_label in enumerate(test_labels):
    true_labels[row_idx, raw_label // 2] = 1.

In [None]:
# Micro-averaged scores of the French predictions against the true labels,
# using the class ids observed in the training targets.
train_class_ids = dataset._ytrain.argmax(1)
labels = sorted(set(train_class_ids))
mm = MyMetrics(true_labels, pred, labels=labels, average='micro')
acc, precision, recall, micro_f1 = mm.get_scores()
print(micro_f1)

In [None]:
# Chance baseline: score the model's predictions against randomly assigned
# reference labels (column 0 or column 2, with equal probability).
import random

# A dedicated, seeded generator makes the baseline score reproducible across
# runs without touching the global `random` state.
BASELINE_SEED = 0
baseline_rng = random.Random(BASELINE_SEED)

random_labels = np.zeros_like(pred)
for i in range(len(test_labels)):
    # Raw label 4 maps to column 2 and raw label 0 to column 0.
    column = 2 if baseline_rng.random() > .5 else 0
    random_labels[i, column] = 1.

labels = sorted(set(dataset._ytrain.argmax(1)))
mm = MyMetrics(random_labels, pred, labels=labels, average='micro')
acc, precision, recall, micro_f1 = mm.get_scores()
print(micro_f1)

In [None]:
# Two-class evaluation: keep only columns 0 and 2 of the truth and of the
# predictions, then score with average='binary'.
# NOTE(review): `labels` still holds every class id seen in training even
# though only two columns are passed -- confirm MyMetrics handles this.
# `micro_f1` holds the binary F1 here; the name is kept for consistency with
# the other evaluation cells.
labels = sorted(set(dataset._ytrain.argmax(1)))
kept_columns = [0, 2]
binary_truth = true_labels[:, kept_columns]
binary_pred = pred[:, kept_columns]
mm = MyMetrics(binary_truth, binary_pred, labels=labels, average='binary')
acc, precision, recall, micro_f1 = mm.get_scores()
print(acc, precision, recall, micro_f1)

# Results
On French tweets we achieve __.36 F1__. It is low because we trained on 3 classes (positive, negative, neutral) but tested on only 2 classes (positive, negative).  
If we take the argmax over only the positive and negative classes, we achieve __.67 F1__.  
The result on the English dataset was also __.67 F1__.  

# Italian

In [None]:
# Load the Italian tweets.
# Columns used: row[2] and row[3] are two 0/1 flags (presumably the positive
# and negative polarity flags -- confirm against the dataset's column
# layout) and row[8] is the tweet text.
italian_data = "../../datasets/sentiment/italian/test_set_sentipolc16_gold2000.csv"
test_labels = []
tweets = []
# newline='' is required by the csv module so that newlines embedded in
# quoted fields are parsed correctly.
with open(italian_data, newline='') as handle:
    for row in csv.reader(handle):
        text = row[8]
        label_p = int(row[2])
        label_n = int(row[3])
        if label_p + label_n == 2:
            # Both flags set: the tweet is skipped.
            continue
        # Class ids: 0 when the second flag is set, 2 when the first flag
        # is set, 1 when neither is.
        if label_n == 1:
            label = 0
        elif label_p == 1:
            label = 2
        else:
            label = 1
        tweets.append(text)
        test_labels.append(label)

In [None]:
# Tokenize the Italian tweets with spaCy.
# NOTE(review): 'it' looks like a spaCy shortcut name; newer spaCy releases
# presumably need the full model package name instead -- confirm against the
# installed version.
it_nlp = spacy.load('it')
# Rebinds `test_tweets` (previously the French documents) to the Italian ones.
test_tweets = list(map(it_nlp, tweets))

In [None]:
# Embedding coverage for Italian: one flag per token across all tweets
# (1 if its text has an Italian embedding, 0 otherwise), then the mean.
it_known = embeddings['it']
found = [1 if tok.text in it_known else 0
         for t in test_tweets
         for tok in t]
print(np.mean(found))

In [None]:
# Vocabulary: every distinct Italian token text that has a pretrained vector.
it_vocab = {word.text
            for sent in test_tweets
            for word in sent
            if word.text in embeddings['it']}

# Replacement weight matrix with the same shape as the trained model's
# first-layer weights (presumably the embedding layer -- confirm). Row 0 is
# left as zeros; words are assigned rows 1, 2, ...
# NOTE(review): the row count is fixed by the original matrix, so an Italian
# vocabulary with more words than available rows would raise IndexError.
it_embeddings = np.zeros_like(clf.layers[0].get_weights()[0])
it_word_2_index = {}
for row, word in enumerate(it_vocab, start=1):
    it_word_2_index[word] = row
    it_embeddings[row] = embeddings['it'][word]

# Load a fresh copy of the model and swap the Italian matrix into layer 0,
# leaving `clf` untouched.
clf_it = lstm_bilstm.load_model(os.path.join(base_dir, best_weights))
clf_it.layers[0].set_weights([it_embeddings])

In [None]:
def encode_sent(sent, word_idx_map, max_length=57):
    """Convert a tokenized sentence into an array of vocabulary indices.

    This redefines the helper with the same behavior as the definition in
    the French section. Tokens whose ``.text`` is missing from
    ``word_idx_map`` are skipped; no padding or truncation happens here.

    Args:
        sent: iterable of tokens exposing a ``.text`` attribute.
        word_idx_map: mapping from token text to index.
        max_length: unused by this function; kept in the signature.

    Returns:
        ``np.ndarray`` holding the indices of the known tokens, in order.
    """
    indices = []
    for token in sent:
        if token.text in word_idx_map:
            indices.append(word_idx_map[token.text])
    return np.array(indices)

# Encode every Italian tweet with the Italian index map, then pad with
# `max_length` (the longest sentence found in the English SemEval splits).
encoded_tweets = [encode_sent(sent, it_word_2_index) for sent in test_tweets]
test_data = lstm_bilstm.pad_sequences(encoded_tweets, max_length)

# Classify with the model whose first-layer weights were replaced.
pred = clf_it.predict(test_data)

In [None]:
# One-hot ground truth with the same shape as `pred`; the Italian labels are
# already column indices (0, 1 or 2).
true_labels = np.zeros_like(pred)
for row_idx, class_id in enumerate(test_labels):
    true_labels[row_idx, class_id] = 1.

# Micro-averaged scores against the class ids observed in training.
train_class_ids = dataset._ytrain.argmax(1)
labels = sorted(set(train_class_ids))
mm = MyMetrics(true_labels, pred, labels=labels, average='micro')
acc, precision, recall, micro_f1 = mm.get_scores()
print(micro_f1)

# Italian Results
In Italian we achieve __.51 F1__ on 3-class classification.  
Only 69% of the tokens are covered by our embedding vocabulary. 