In [1]:
import sys
sys.path.append("../")

In [2]:
import lstm_bilstm
from Utils.WordVecs import *
from Utils.MyMetrics import *
from Utils.Datasets import *
from Utils.Semeval_2013_Dataset import *

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
name = "semeval"
bi = True
dataset_raw = lstm_bilstm.Semeval_Dataset('../datasets/semeval',
                                                None, rep=words,
                                                one_hot=True)
dataset = lstm_bilstm.Semeval_Dataset('../datasets/semeval',
                                                None, rep=words,
                                                one_hot=True)

vecs = WordVecs('../embeddings/wiki.multi.en.vec', 'word2vec')
dim = vecs.vector_size
max_length = 0
vocab = {}
for sent in list(dataset._Xtrain) + list(dataset._Xdev) + list(dataset._Xtest):
    if len(sent) > max_length:
        max_length = len(sent)
    for w in sent:
        if w not in vocab:
            vocab[w] = 1
        else:
            vocab[w] += 1
            
wordvecs = {}
for w in vecs._w2idx.keys():
    if w in vocab:
        wordvecs[w] = vecs[w]

In [4]:
lstm_bilstm.add_unknown_words(wordvecs, vocab, min_df=1, dim=dim)
W, word_idx_map = lstm_bilstm.get_W(wordvecs, dim=dim)

dataset = lstm_bilstm.convert_dataset(dataset, word_idx_map, max_length)
dev_params_file = '../dev_params/300_bilstm.dev.txt'
best_dim, best_dropout, best_epoch, best_f1 = lstm_bilstm.get_dev_params(name, dev_params_file, bi,
                   dataset._Xtrain, dataset._ytrain, dataset._Xdev, dataset._ydev, wordvecs)

In [5]:
base_dir = '../models/bilstm/'+ name +'/run1'
best_weights = "weights.006-0.6337.hdf5"

In [6]:
clf = lstm_bilstm.load_model(os.path.join(base_dir, best_weights))

In [7]:
pred = clf.predict(dataset._Xtest, verbose=1)
labels = sorted(set(dataset._ytrain.argmax(1)))
mm = MyMetrics(dataset._ytest, pred, labels=labels, average='micro')
acc, precision, recall, micro_f1 = mm.get_scores()
print(micro_f1)

0.67003367003367


# Zero shot benchmark
- Load data from other language (tweets)
- Tokenize data
- load embeddings for that language
- run model on other language with no training data

In [8]:
import csv
import numpy as np

In [9]:
import xml.etree.ElementTree

In [100]:
# Loading Spanish data 
spanish_data_file = "../../datasets/sentiment/spanish/SEPLN-TASS15/DATA/general-tweets-train-tagged.xml"
e = xml.etree.ElementTree.parse(spanish_data_file).getroot()


In [130]:
spanish_data = []
for tweet in e:
    content = None
    polarity = None
    keep = False
    for el in tweet:
        if el.tag == 'content':
            content = el.text
        if el.tag == 'sentiments':
            assert el[0].tag == 'polarity'
            for entry in el[0]:
                if entry.tag == 'type' and entry.text == 'AGREEMENT':
                    keep = True
                if entry.tag == 'value':
                    polarity = entry.text
    if keep and content is not None and polarity != 'None':
        spanish_data.append((polarity, content))


In [131]:
# Tokenize text
import spacy
es_nlp = spacy.load('es')
test_tweets = [es_nlp(text) for label,text in spanish_data if text is not None]

In [120]:
# Load embeddings
es_file = '../../embeddings/wiki.multi.es.vec'
lang_files = [es_file]

embeddings = {}
for lang_f in lang_files:
    lang = lang_f[-6:-4]
    embeddings[lang] = {}
    with open(lang_f, 'r') as handle:
        csv_file = csv.reader(handle, delimiter=' ', quotechar="|")
        i = 0
        for row in csv_file:
            if len(row) != 301:
                continue
            word = row[0]
            vec = np.array(row[1:]).astype(np.float)
            embeddings[lang][word] = vec
            i += 1
            if i % 50000 == 0:
                print(i/2000., "% complete ...")

25.0 % complete ...
50.0 % complete ...
75.0 % complete ...
100.0 % complete ...


In [132]:
# Measure Coverage
found = []
for t in test_tweets:
    for tok in t:
        if tok.text in embeddings['es']:
            found.append(1)
        else:
            found.append(0)
print (np.mean(found))

0.7019815145656413


In [133]:
# create embedding weights with vocab and zeros
es_vocab = set()
for sent in test_tweets:
    for word in sent:
        if word.text in embeddings['es']:
            es_vocab.add(word.text)
        
# replace embedding in model
es_embeddings = np.zeros_like(clf.layers[0].get_weights()[0])
es_word_2_index = {}
for i, word in enumerate(es_vocab):
    es_word_2_index[word] = i+1
    es_embeddings[i+1] = embeddings['es'][word]

    
# encode sentences with new index
clf_fr = lstm_bilstm.load_model(os.path.join(base_dir, best_weights))
clf_fr.layers[0].set_weights([es_embeddings])

In [134]:
def encode_sent(sent, word_idx_map, max_length=57):
    encoded = np.array([word_idx_map[w.text] for w in sent if w.text in word_idx_map])
    return encoded

test_data = []
for sent in test_tweets:
    test_data.append(encode_sent(sent, es_word_2_index))
test_data = lstm_bilstm.pad_sequences(test_data, max_length)
    
pred = clf_fr.predict(test_data)

In [144]:
true_labels = np.zeros_like(pred)
for i, (l, t) in enumerate([(l,t) for l,t in spanish_data if t is not None]):
    pos = None
    if "P" in l:
        pos = 2
    if "N" in l:
        pos = 0
    if l == 'NEU':
        pos = 1
    if pos == None:
        print("Error")
        pos = 1
    true_labels[i][pos] = 1.

In [148]:

labels = sorted(set(dataset._ytrain.argmax(1)))
mm = MyMetrics(true_labels, pred, labels=labels, average='micro')
acc, precision, recall, micro_f1 = mm.get_scores()
print(acc)

0.2338622708365429


In [164]:
labels = sorted(set(dataset._ytrain.argmax(1)))
mm = MyMetrics(true_labels[:,[0,2]], pred[:,[0,2]], labels=labels, average='binary')
acc, precision, recall, micro_f1 = mm.get_scores()
print( micro_f1)

0.6507258753202392


In [159]:
import random
random_labels = np.zeros_like(pred)
for i, x in enumerate(pred):
    if random.random() < .333:
        pos = 0
    elif random.random() > .333 and random.random() < .666:
        pos = 1
    else:
        pos = 2
    random_labels[i][pos] = 1.
labels = sorted(set(dataset._ytrain.argmax(1)))
mm = MyMetrics(random_labels, pred, labels=labels, average='micro')
acc, precision, recall, micro_f1 = mm.get_scores()
print(micro_f1)

0.3270682483438607


In [162]:
print(np.sum(np.argmax(pred, axis=1) == 0))
print(np.sum(np.argmax(pred, axis=1) == 1))
print(np.sum(np.argmax(pred, axis=1) == 2))

392
4855
1244


In [163]:
print(np.sum(np.argmax(true_labels, axis=1) == 0))
print(np.sum(np.argmax(true_labels, axis=1) == 1))
print(np.sum(np.argmax(true_labels, axis=1) == 2))

3464
313
2714


# Arabic

In [200]:
# Loading french data 
# 1 = positive, 0 = negative
arabic_data1 = "../../datasets/sentiment/arabic/ar-embeddings/datasets/tweets/ASTD.csv"
arabic_data2 = "../../datasets/sentiment/arabic/ar-embeddings/datasets/tweets/ArTwitter.csv"
ar_labels = []
tweets = []
with open(arabic_data1) as handle:
    i = 0
    for line in handle.readlines():
        i += 1
        if i == 1:
            continue
        try:
            label = int(line[0])
        except:
            continue
        text = line[2:]
        tweets.append(text)
        ar_labels.append(label)
        
with open(arabic_data2) as handle:
    i = 0
    for line in handle.readlines():
        i += 1
        if i == 1:
            continue
        try:
            label = int(line[0])
        except:
            continue
        text = line[2:]
        tweets.append(text)
        ar_labels.append(label)

In [171]:
import pyarabic.araby as araby
test_tweets = [araby.tokenize(t) for t in tweets]

In [172]:
# Load embeddings
ar_file = '../../embeddings/wiki.multi.ar.vec'
lang_files = [ar_file]

embeddings = {}
for lang_f in lang_files:
    lang = lang_f[-6:-4]
    embeddings[lang] = {}
    with open(lang_f, 'r') as handle:
        csv_file = csv.reader(handle, delimiter=' ', quotechar="|")
        i = 0
        for row in csv_file:
            if len(row) != 301:
                continue
            word = row[0]
            vec = np.array(row[1:]).astype(np.float)
            embeddings[lang][word] = vec
            i += 1
            if i % 50000 == 0:
                print(i/2000., "% complete ...")

25.0 % complete ...
50.0 % complete ...
75.0 % complete ...
100.0 % complete ...


In [174]:
# Unfortunately we only have embeddings for 76% of the tokens
found = []
for t in tweets:
    for tok in t:
        if tok in embeddings['ar']:
            found.append(1)
        else:
            found.append(0)
print (np.mean(found))

0.8076509605129535


In [191]:
# create embedding weights with vocab and zeros
ar_vocab = set()
for sent in tweets:
    for word in sent:
        if word in embeddings['ar']:
            ar_vocab.add(word)
        
# replace embedding in model
ar_embeddings = np.zeros_like(clf.layers[0].get_weights()[0])
ar_word_2_index = {}
for i, word in enumerate(ar_vocab):
    ar_word_2_index[word] = i+1
    ar_embeddings[i+1] = embeddings['ar'][word]

    
# encode sentences with new index
clf_ar = lstm_bilstm.load_model(os.path.join(base_dir, best_weights))
clf_ar.layers[0].set_weights([ar_embeddings])

In [192]:
def encode_sent(sent, word_idx_map, max_length=57):
    encoded = np.array([word_idx_map[w] for w in sent if w in word_idx_map])
    return encoded

test_data = []
for sent in tweets:
    test_data.append(encode_sent(sent, ar_word_2_index))
test_data = lstm_bilstm.pad_sequences(test_data, max_length)
    
pred = clf_ar.predict(test_data)

In [205]:
true_labels = np.zeros_like(pred)
for i, l in enumerate(ar_labels):
    if l == 1:
        pos = 2
    if l == 0:
        pos = 0
    true_labels[i][pos] = 1.

In [206]:
labels = sorted(set(dataset._ytrain.argmax(1)))
mm = MyMetrics(true_labels[:,[0,2]], pred[:,[0,2]], labels=labels, average='binary')
acc, precision, recall, micro_f1 = mm.get_scores()
print(acc, precision, recall, micro_f1)

0.500423131170663 0.5004236091499576 0.9988726042841037 0.6667920978363124


In [213]:
np.sum(np.argmax(pred[:,[0,2]], axis=1)==1)

3541