In [None]:
import os

import numpy as np

import lstm_bilstm
from Utils.WordVecs import *
from Utils.MyMetrics import *
from Utils.Datasets import *
from Utils.Semeval_2013_Dataset import *

In [None]:
# Experiment configuration: `name` selects the model directory used later,
# `bi` is forwarded to get_dev_params.
name = "sst_binary"
bi = True


def _load_sst_binary():
    """Build one binary, one-hot-labelled Stanford Sentiment dataset object."""
    # `words` is not defined in this notebook; presumably it is provided by
    # one of the star imports -- TODO confirm.
    return lstm_bilstm.Stanford_Sentiment_Dataset(
        'datasets/stanford_sentanalysis',
        None,
        one_hot=True,
        binary=True,
        rep=words)


# Two independent copies: `dataset_raw` is kept as loaded, while `dataset`
# is re-bound to the output of convert_dataset in a later cell.
dataset_raw = _load_sst_binary()
dataset = _load_sst_binary()

vecs = WordVecs('embeddings/wiki.multi.en.vec', 'word2vec')
dim = vecs.vector_size

# Longest sentence length and per-word occurrence counts over all splits.
all_sents = list(dataset._Xtrain) + list(dataset._Xdev) + list(dataset._Xtest)
max_length = max([0] + [len(sent) for sent in all_sents])
vocab = {}
for sent in all_sents:
    for w in sent:
        vocab[w] = vocab.get(w, 0) + 1

# Keep only the pretrained vectors for words that occur in the dataset.
wordvecs = {w: vecs[w] for w in vecs._w2idx.keys() if w in vocab}

In [None]:
# The return value is unused, so this presumably extends `wordvecs` in place
# with entries for dataset words that have no pretrained vector -- TODO confirm.
lstm_bilstm.add_unknown_words(wordvecs, vocab, min_df=1, dim=dim)
# W is the embedding matrix and word_idx_map the word -> index lookup that
# encode_sent uses further down.
W, word_idx_map = lstm_bilstm.get_W(wordvecs, dim=dim)

# NOTE(review): `dataset` is re-bound to its converted form here, so this cell
# is not idempotent -- re-running it passes an already-converted dataset in.
dataset = lstm_bilstm.convert_dataset(dataset, word_idx_map, max_length)
dev_params_file = 'dev_params/300_bilstm.dev.txt'
# Hyper-parameters for this experiment; none of the four values is referenced
# again in the visible cells.
best_dim, best_dropout, best_epoch, best_f1 = lstm_bilstm.get_dev_params(name, dev_params_file, bi,
                   dataset._Xtrain, dataset._ytrain, dataset._Xdev, dataset._ydev, wordvecs)

In [None]:
# Directory the trained model is loaded from (depends on `name`).
base_dir = 'models/bilstm/'+ name +'/run1'
# Checkpoint file inside base_dir; presumably the name encodes epoch 5 and a
# dev score of 0.8062 -- TODO confirm.
best_weights = "weights.005-0.8062.hdf5"

In [None]:
# Restore the trained classifier from the selected checkpoint file.
clf = lstm_bilstm.load_model(os.path.join(base_dir, best_weights))

In [None]:
# Raw model outputs for the converted test split; scored by MyMetrics below.
pred = clf.predict(dataset._Xtest, verbose=1)        

In [None]:
# Optional: hard class labels for the same inputs as `pred`. `classes` is not
# referenced again in the visible cells.
# NOTE(review): Sequential.predict_classes was removed from Keras in
# TensorFlow 2.6; on newer versions derive the labels from `pred` instead.
classes = clf.predict_classes(dataset._Xtest, verbose=1)

In [None]:
# Class ids that occur in the one-hot training labels.
labels = sorted(set(dataset._ytrain.argmax(1)))
mm = MyMetrics(dataset._ytest, pred, labels=labels, average='binary')
# NOTE(review): the fourth score is named micro_f1 although average='binary'
# is requested above -- confirm which F1 variant get_scores() returns.
acc, precision, recall, micro_f1 = mm.get_scores()
print(micro_f1)

In [None]:
def encode_sent(sent, word_idx_map, max_length=57):
    """Turn one tokenised sentence into a padded one-row batch of word indices.

    Parameters
    ----------
    sent : iterable of str
        Tokens of a single sentence.
    word_idx_map : mapping
        Token -> integer index. Every token is looked up with ``[]``, so a
        token missing from a plain dict raises ``KeyError``.
    max_length : int, optional
        Length handed to ``lstm_bilstm.pad_sequences`` (default 57).

    Returns
    -------
    The result of ``lstm_bilstm.pad_sequences`` for a single-sentence batch;
    callers take ``[0]`` to get the one encoded row.
    """
    indices = []
    for token in sent:
        indices.append(word_idx_map[token])
    batch = [np.array(indices)]
    return lstm_bilstm.pad_sequences(batch, max_length)

In [None]:
# Sanity check: encoding the first unconverted test sentence with encode_sent
# must reproduce the first row of the converted test set.
assert np.array_equal(encode_sent(dataset_raw._Xtest[0], word_idx_map)[0], dataset._Xtest[0]), "encode sentence not functioning"

In [None]:
# Smoke test on a hand-written English sentence; the prediction is shown as
# the cell output. Every word must be a key of word_idx_map.
test_sent = encode_sent("really great movie loved it".lower().split(), word_idx_map)
clf.predict(test_sent, verbose=1)

# Evaluate random input from other languages

In [None]:
import sys
import csv

# Raise the csv parser's per-field size cap to the largest value Python offers
# before the embedding files are read with csv.reader.
# NOTE(review): sys.maxsize can overflow the underlying C long on some
# platforms (e.g. Windows) -- confirm if this must run there.
csv.field_size_limit(sys.maxsize)

In [None]:
# Embedding files for English, Hebrew and Russian; the two characters before
# ".vec" are used as the language key when the files are loaded.
# NOTE(review): these paths start with '../embeddings/', while the WordVecs
# call earlier reads 'embeddings/wiki.multi.en.vec' -- confirm which working
# directory the notebook is meant to run from.
en_file = '../embeddings/wiki.multi.en.vec'
heb_file = '../embeddings/wiki.multi.he.vec'
rus_file = '../embeddings/wiki.multi.ru.vec'
lang_files = [en_file, heb_file, rus_file]

In [None]:
# Load every language's vectors into embeddings[lang][word] -> 1-D float array.
embeddings = {}
# A data row is the word followed by its vector components; 301 fields means
# 300-dimensional vectors. Rows of any other length (such as a header line)
# are skipped.
expected_fields = 301
for lang_f in lang_files:
    # Two-letter language code taken from ".../wiki.multi.<lang>.vec".
    lang = lang_f[-6:-4]
    embeddings[lang] = {}
    # The files contain Hebrew and Cyrillic words, so decode explicitly as
    # UTF-8 instead of relying on the platform default encoding; newline=''
    # is what the csv module expects for files it parses.
    with open(lang_f, 'r', encoding='utf-8', newline='') as handle:
        csv_file = csv.reader(handle, delimiter=' ', quotechar="|")
        i = 0
        for row in csv_file:
            if len(row) != expected_fields:
                continue
            word = row[0]
            # The np.float alias was removed in NumPy 1.24; the builtin float
            # is the exact same dtype.
            vec = np.array(row[1:]).astype(float)
            embeddings[lang][word] = vec
            i += 1
            if i % 50000 == 0:
                # The percentage assumes 200,000 usable rows per file.
                print(i/2000., "% complete ...")

In [None]:
# Weights of the model's first layer, which the later cells treat as the
# embedding layer; element 0 is used as the embedding matrix.
saved_emb_weights = clf.layers[0].get_weights()

In [None]:
# Independent copy of the embedding matrix (np.array copies by default).
# NOTE(review): new_weights is not referenced again in the visible cells.
new_weights = np.array(saved_emb_weights[0])

# Test new languages

In [None]:
# Hand-written, whitespace-tokenised test sentences for each new language.
he_test_sents = [
    "ה סרט היה משעמם ו בזבוז זמן".split(),
    "שנא תי את ה סרט ה זה רוצה את ה כסף שלי חזרה".split(),
    "זה היה מדהים אני רוצה לראות אותו עוד אלף פעמים".split(),
    "פשוט תענוג כל ה כבוד ל שחקנים".split(),
    "רע".split(),
    "משעמם".split(),
    "טוב מאוד".split(),
    "מדהים".split()
]
ru_test_sents = [
    "фильм был тупой и не интересный".split(),
    "ужасный фильм совсем не любил".split(),
    "замечательный фильм очень понравилось".split(),
    "я очень любил фильм было интересно и весело".split(),
    "плохо".split(),
    "скучно".split(),
    "очень хорошо".split(),
    "замечательно".split()
]


def _collect_vocab(sentences, lang_vectors):
    """Return the set of words in `sentences`, asserting each one has a vector."""
    found = set()
    for sentence in sentences:
        for word in sentence:
            assert word in lang_vectors, "Didn't find %s"% word
            found.add(word)
    return found


def _index_and_embed(words, lang_vectors, template):
    """Give each word its own row index and build the matching embedding matrix.

    The matrix has the shape and dtype of `template` and is zero everywhere
    except the rows assigned to `words`. Indices start at 1 so that row 0
    stays all-zero: encode_sent pads short sentences through
    lstm_bilstm.pad_sequences, which (assuming it is Keras' pad_sequences)
    fills with index 0, so no real word may share that index.
    """
    assert len(words) < len(template), "embedding matrix has too few rows for the vocabulary"
    matrix = np.zeros_like(template)
    word_2_index = {}
    # sorted() makes the word -> index assignment reproducible across runs.
    for i, word in enumerate(sorted(words), start=1):
        word_2_index[word] = i
        # Each word is written to its OWN row. The previous code wrote every
        # vector to row 0, leaving all other rows zero.
        matrix[i] = lang_vectors[word]
    return word_2_index, matrix


# Vocabulary of the test sentences; fails fast if a word has no embedding.
he_vocab = _collect_vocab(he_test_sents, embeddings['he'])
ru_vocab = _collect_vocab(ru_test_sents, embeddings['ru'])

# Replacement embedding matrices, same shape as the trained embedding weights.
ru_word_2_index, ru_embeddings = _index_and_embed(ru_vocab, embeddings['ru'], saved_emb_weights[0])
he_word_2_index, he_embeddings = _index_and_embed(he_vocab, embeddings['he'], saved_emb_weights[0])

# One copy of the trained model per language, with only the first
# (embedding) layer's weights swapped for that language's matrix.
clf_ru = lstm_bilstm.load_model(os.path.join(base_dir, best_weights))
clf_ru.layers[0].set_weights([ru_embeddings])
clf_he = lstm_bilstm.load_model(os.path.join(base_dir, best_weights))
# Previously the Hebrew weights were set on clf_ru, which left clf_he with
# its original English embeddings.
clf_he.layers[0].set_weights([he_embeddings])

In [None]:
# Score each language's test sentences with the model that carries that
# language's embeddings: Russian first, then Hebrew.
language_runs = (
    (ru_test_sents, ru_word_2_index, clf_ru),
    (he_test_sents, he_word_2_index, clf_he),
)
separator = "#"*20
for sentences, word_2_index, model in language_runs:
    for sent in sentences:
        test_sent = encode_sent(sent, word_2_index)
        pred = model.predict(test_sent, verbose=1)
        print(pred)
        print(sent)
        print(separator)

In [None]:
# English reference sentences for the original model; they appear to be
# word-for-word renderings of the Russian and Hebrew test sentences.
english_sentences = [
    "film was dumb and not interesting",
    "awful film completely no love",
    "wonderful film very liked",
    "i much loved film was interesting and fun",
    "the movie was boring and waste time",
    "i hated this movie want my money back",
    "this was wonderful i want to see it another thousand times",
    "simply pleasure all the respect to actors",
]
for sentence in english_sentences:
    test_sent = encode_sent(sentence.lower().split(), word_idx_map)
    print(clf.predict(test_sent, verbose=1))

In [None]:
# Short one- and two-word English inputs scored by the original model.
english_short_inputs = ["bad", "boring", "very good", "amazing"]
for phrase in english_short_inputs:
    test_sent = encode_sent(phrase.lower().split(), word_idx_map)
    print(clf.predict(test_sent, verbose=1))