From 36b62464fb96b75745a52ed6e0cb19cc9645b427 Mon Sep 17 00:00:00 2001
From: Minh Le
Date: Fri, 19 Jan 2018 23:31:27 +0100
Subject: [PATCH] add to prepare-lstm-wsd

---
 README.md                 |   27 +-
 compile_results.py        |   31 +-
 configs.py                |    1 -
 das5/prepare-lstm-wsd.job |    4 +
 das5/process-gigaword.job |    1 -
 diary-minh2.md            |   37 +-
 difference-edited.txt     | 1719 +++++++++++++++++++++++++++++++++++++
 prepare-lstm-wsd.py       |   39 +-
 process-gigaword.py       |    3 +-
 stats_gigaword.py         |    3 +-
 10 files changed, 1824 insertions(+), 41 deletions(-)
 create mode 100755 das5/prepare-lstm-wsd.job
 mode change 100644 => 100755 das5/process-gigaword.job
 create mode 100644 difference-edited.txt

diff --git a/README.md b/README.md
index c5864cd..17b2fb3 100644
--- a/README.md
+++ b/README.md
@@ -104,11 +104,24 @@ This creates a development set for the label propagation:
 a) annotated corpus: pwgc
 b) unannotated corpus: omsti
 
-#### Reproduce variation experiment
+#### Model size experiments
 
-0. `git checkout a453bc1`
-1. Pre-process GigaWord into plain text: `sbatch cartesius/process-gigaword.job`
-2. More preprocessing to make binary files: `sbatch cartesius/prepare-lstm-wsd.job`
+Note that there was some uncertainty about the exact version that produced h2048p512
+and h512p128; see `difference-edited.txt` for a comparison with a recent version.
+
+1. h=2048, p=512: `git checkout 354acc1cfdd542142490afe40447cb6f40d2fd7c && ./train-lstm-wsd-full-data-google-model.job`
+2. h=512, p=128: `git checkout 354acc1cfdd542142490afe40447cb6f40d2fd7c && ./train-lstm-wsd-full-data-large-model.job`
+3. h=256, p=64: see `exp-h256p64.sh` in the "stability" section
+4. h=100, p=10: see `exp-variation*.job` in the "stability" section
+
+#### Reproduce variation/stability experiments
+
+These experiments measure how much performance is affected by the randomness
+in training: we train smaller models many times, each time with
+a different (but fixed) random seed.
+
+1. Pre-process GigaWord into plain text: `git checkout 2b0934c && sbatch cartesius/process-gigaword.job`
+2. More preprocessing to make binary files: `git checkout a453bc1 && sbatch cartesius/prepare-lstm-wsd.job`
 0. `git checkout ce8a024`
 1. Run at the same time: `sbatch cartesius/exp-variation1.job` and `sbatch cartesius/exp-variation2.job`
 0. `git checkout a74bda6`
@@ -118,18 +131,18 @@ b) unannotated corpus: omsti
 2. When everything finishes, do `git checkout 42bc700`
 3. Run `sbatch cartesius/exp-variation-score.job`
 
-#### Reproduce optimization experiment
+#### Reproduce (training speed) optimization experiment
 
+1. Pre-process GigaWord into plain text (if you haven't done so): `git checkout 2b0934c && sbatch cartesius/process-gigaword.job`
 0. `git checkout a74bda6`
-1. Pre-process GigaWord into plain text (if you haven't done so): `sbatch cartesius/process-gigaword.job`
 2. More preprocessing to make binary files: `sbatch cartesius/prepare-lstm-wsd.job`
 3. `git checkout e93fdb2`
 4. Run in parallel: `sbatch cartesius/exp-optimization{i}.job` where i=1,2,3,4
 
 #### Data size experiment
 
+1. Pre-process GigaWord into plain text (if you haven't done so): `git checkout 2b0934c && sbatch cartesius/process-gigaword.job`
 0. `git checkout a74bda6`
-1. Pre-process GigaWord into plain text (if you haven't done so): `sbatch cartesius/process-gigaword.job`
 2. More preprocessing to make binary files: `sbatch cartesius/prepare-lstm-wsd.job`
 3. `git checkout 4e4a04a`
 4. 
Run `sbatch cartesius/exp-data-size.job {i}` with i="01",10,25,50,75 diff --git a/compile_results.py b/compile_results.py index 71bcd95..528b9c3 100644 --- a/compile_results.py +++ b/compile_results.py @@ -11,6 +11,8 @@ import matplotlib.pyplot as plt from sklearn.linear_model.base import LinearRegression import configs +from configs import SmallConfig, H256P64, LargeConfig, GoogleConfig,\ + DefaultConfig ModelPerformance = namedtuple('ModelPerformance', ['name', 'semcor', 'mun']) @@ -118,17 +120,20 @@ def draw_data_size_vs_performance_chart(): print('Extrapolated data size:') print(lr.predict([[0.75], [0.8]])) +def compute_num_params(vocab_size, p, h): + return (vocab_size*p*2 + # input and output embeddings + p*h + h*h + h + # input gates + p*h + h*h + h + # candidate states + p*h + h*h + h + # forget gates + p*h + h*h + h*h + h + # output gates + p*h # context layer + ) + def draw_capacity_vs_performance_chart(): ''' Create figure for paper ''' df = pd.read_csv('output/capacity_vs_performance.csv') vocab_size = configs.DefaultConfig.vocab_size - df['num_params'] = (vocab_size*df['p']*2 + # input and output embeddings - df['p']*df['h'] + df['h']*df['h'] + df['h'] + # input gates - df['p']*df['h'] + df['h']*df['h'] + df['h'] + # candidate states - df['p']*df['h'] + df['h']*df['h'] + df['h'] + # forget gates - df['p']*df['h'] + df['h']*df['h'] + df['h']*df['h'] + df['h'] + # output gates - df['p']*df['h'] # context layer - ) + df['num_params'] = compute_num_params(vocab_size, df['p'], df['h']) print(df) with PdfPages('output/capacity_vs_performance.pdf') as pdf: semcor_handle, = plt.plot(df['num_params'], df['semcor'], label='SemEval13 (T: SemCor)') @@ -149,8 +154,18 @@ def draw_capacity_vs_performance_chart(): # print('Extrapolated data size:') # print(lr.predict([[0.75], [0.8]])) +def report_model_params(): + v = DefaultConfig.vocab_size + models = [SmallConfig, H256P64, LargeConfig, GoogleConfig] + table = [['%.0fM' %(v/10**6), m.emb_dims, m.hidden_size, + "%.0fM" %(compute_num_params(v, m.emb_dims, m.hidden_size)/10**6)] + for m in models] + df = pd.DataFrame(table, columns=['Vocab.', 'p', 'h', '#params']) + print(df.to_latex(index=False)) + if __name__ == '__main__': # report_wsd_performance_vs_data_size() # variation_experiment() # draw_data_size_vs_performance_chart() - draw_capacity_vs_performance_chart() \ No newline at end of file +# draw_capacity_vs_performance_chart() + report_model_params() \ No newline at end of file diff --git a/configs.py b/configs.py index fecd147..89cd7cf 100644 --- a/configs.py +++ b/configs.py @@ -5,7 +5,6 @@ os.makedirs(output_dir, exist_ok=True) gigaword_path = 'data/gigaword' -preprocessed_gigaword_path = os.path.join('preprocessed-data', 'gigaword.txt') class DefaultConfig(object): vocab_size = 10**6 + 3 diff --git a/das5/prepare-lstm-wsd.job b/das5/prepare-lstm-wsd.job new file mode 100755 index 0000000..0b21127 --- /dev/null +++ b/das5/prepare-lstm-wsd.job @@ -0,0 +1,4 @@ +#!/bin/bash +#SBATCH --time=24:00:00 + +python3 -u prepare-lstm-wsd.py \ No newline at end of file diff --git a/das5/process-gigaword.job b/das5/process-gigaword.job old mode 100644 new mode 100755 index 8827673..b434649 --- a/das5/process-gigaword.job +++ b/das5/process-gigaword.job @@ -2,4 +2,3 @@ #SBATCH --time=24:00:00 python3 -u process-gigaword.py -python3 -u prepare-lstm-wsd.py \ No newline at end of file diff --git a/diary-minh2.md b/diary-minh2.md index 2456893..a097ede 100644 --- a/diary-minh2.md +++ b/diary-minh2.md @@ -260,6 +260,39 @@ job: [minhle@gcn40 
wsd-dynamic-sense-vector]$ tail -f output/`python3 version.py`/exp-variation-score.job.out
     ...
+## Thu 7 Dec
+
+Worked on the paper. The 25% data-size experiment has finished. Tried to run the
+newest evaluation script on it but no GPU machine is available yet.
+
+    42bc700..0a0d02b  master -> origin/master
+    First, rewinding head to replay your work on top of it...
+    Fast-forwarded master to 0a0d02b4538dcf7322742e32e367a90ec1055899.
+    [minhle@int2 wsd-dynamic-sense-vector]$ sbatch cartesius/eval-data-size.job
+    Submitted batch job 3820439
+
+## Fri 19 Dec
+
+Meeting with Jacopo+Marten. Jacopo would like to retrain everything with
+token. Checked everything again. There doesn't seem to be any big difference (that
+I don't know of) between the version that produced the currently reported results
+and a more recent version. Let's try.
+
+Added `` to the preparation script.
+
+I'll also need to add it to the evaluation scripts.
+
+    >>> from collections import Counter
+    >>> c = Counter()
+    >>> with open('preprocessed-data/694cb4d/gigaword.txt') as f:
+            for sent in f:
+                c[sent.strip().split()[-1]] += 1
+    >>> c.most_common(10)
+    [('.', 141537114), ("''", 7066432), ('"', 7015844), (')', 2214057), ('_', 1964897), (':', 1605763), ('?', 1486728), ('--', 774285), ("'", 648803), ('...', 434971)]
+    >>> total = sum(c.values())
+    >>> [(tok, cnt/total) for tok, cnt in c.most_common(10)]
+    [('.', 0.8052320716307731), ("''", 0.04020230113211145), ('"', 0.039914496196088396), (')', 0.012596199360251295), ('_', 0.01117867983270516), (':', 0.00913549690604858), ('?', 0.008458283721904037), ('--', 0.004405057422483782), ("'", 0.00369116600590189), ('...', 0.002474634316970099)]
+
 
 TODO: docker image
@@ -271,8 +304,8 @@ TODO: docker image
 5. [x] for 25 Oct: list of all experiments for the reproduction paper
 6. [x] save models of every epoch (instead of only the best one)
 6. [x] Read more about label propagation (Zhou et al. 2004)
-7. [ ] Hyperparameter tuning of label propagation
+7. [x] Hyperparameter tuning of label propagation
 8. [ ] Training creates a lot of models, how to reduce it?
 9. [ ] Send code+data to Jacopo to run
-10. [ ] Polish the paper
+10. [x] Polish the arXiv paper
 11. [x] Use the same dev set for different sizes of the data.
\ No newline at end of file
diff --git a/difference-edited.txt b/difference-edited.txt
new file mode 100644
index 0000000..d39f15b
--- /dev/null
+++ b/difference-edited.txt
@@ -0,0 +1,1719 @@
+************************************************************************************
+
+This is an ***edited*** patch file that compares revision 354acc1cfdd542142490afe40447cb6f40d2fd7c (Jul 6, 2017),
+which produced our first h=2048,p=512 (code name "google") model, to a more recent
+revision, 3a24bb0560b41e435bae5215c5c5556d5542134f (Dec 6, 2017).
+
+The purpose is to identify any difference that could affect performance, should it
+go down.
+
+************************************************************************************
+
+
+
+
+
+
+
+diff --git a/model.py b/model.py
+index 85f627a..b751d0c 100644
+--- a/model.py
++++ b/model.py
+@@ -1,15 +1,19 @@
+ import numpy as np
+ import tensorflow as tf
++import time
++import sys
++
++float_dtype = tf.float32
+ 
+ class DummyModelTrain(object):
+     '''
+     This is for testing GPU usage only. This model runs very trivial operations
+-    on GPU therefore its running time is mostly on CPU. Compared to WSDModelTrain,
++    on GPU therefore its running time is mostly on CPU. 
Compared to WSDModel, + this model should run much faster, otherwise you're spending too much time + on CPU. + ''' + +- def __init__(self, config, float_dtype): ++ def __init__(self, config): + self._x = tf.placeholder(tf.int32, shape=[None, None], name='x') + self._y = tf.placeholder(tf.int32, shape=[None], name='y') + self._subvocab = tf.placeholder(tf.int32, shape=[None], name='subvocab') +@@ -39,42 +43,77 @@ class DummyModelTrain(object): + def print_device_placement(self): + pass + +-class WSDModelTrain(object): ++class WSDModel(object): + """A LSTM WSD model designed for fast training.""" + ++ def _build_inputs(self): ++ # the names are for later reference when the model is loaded ++ # they might be used or not, doesn't hurt ++ self._lens = tf.placeholder(tf.int32, shape=[None], name='lens') + ++ def _build_word_embeddings(self): + E_words = tf.get_variable("word_embedding", +- [config.vocab_size, config.emb_dims], dtype=float_dtype) +- outputs, _ = tf.nn.dynamic_rnn(cell, word_embs, dtype=float_dtype) ++ [self.config.vocab_size, self.config.emb_dims], dtype=float_dtype) ++ ++ def _build_lstm_output(self): ++ if self.optimized and self.config.assume_same_lengths: ++ outputs, _ = tf.nn.dynamic_rnn(cell, self._word_embs, ++ dtype=float_dtype) ++ self._lstm_output = outputs[:,-1] ++ else: ++ outputs, _ = tf.nn.dynamic_rnn(cell, self._word_embs, ++ sequence_length=self._lens, ++ dtype=float_dtype) ++ last_output_indices = tf.stack([tf.range(tf.shape(self._x)[0]), self._lens-1], axis=1) ++ self._lstm_output = tf.gather_nd(outputs, last_output_indices) ++ self._initial_state = cell.zero_state(tf.shape(self._x)[0], float_dtype) ++ ++ def _build_context_embs(self): + context_layer_weights = tf.get_variable("context_layer_weights", +- [config.hidden_size, config.emb_dims], dtype=float_dtype) +- self._predicted_context_embs = tf.matmul(outputs[:,-1], context_layer_weights, ++ [self.config.hidden_size, self.config.emb_dims], dtype=float_dtype) ++ self._predicted_context_embs = tf.matmul(self._lstm_output, context_layer_weights, + name='predicted_context_embs') ++ ++ def _build_logits(self): + E_contexts = tf.get_variable("context_embedding", +- [config.vocab_size, config.emb_dims], dtype=float_dtype) +- subcontexts = tf.nn.embedding_lookup(E_contexts, self._subvocab) +- pre_probs = tf.matmul(self._predicted_context_embs, tf.transpose(subcontexts)) +- ++ [self.config.vocab_size, self.config.emb_dims], dtype=float_dtype) ++ if self.optimized and self.config.sampled_softmax: ++ subcontexts = tf.nn.embedding_lookup(E_contexts, self._subvocab) ++ self._logits = tf.matmul(self._predicted_context_embs, tf.transpose(subcontexts)) ++ else: ++ self._logits = tf.matmul(self._predicted_context_embs, tf.transpose(E_contexts)) ++ ++ def _build_cost(self): + self._cost = tf.reduce_mean( + tf.nn.sparse_softmax_cross_entropy_with_logits( +- logits=pre_probs, labels=self._y)) +- ++ logits=self._logits, labels=self._y)) ++ self._hit_at_100 = tf.reduce_mean(tf.cast( ++ tf.nn.in_top_k(self._logits, self._y, 100), float_dtype)) + tvars = tf.trainable_variables() + grads, _ = tf.clip_by_global_norm(tf.gradients(self._cost, tvars), +- config.max_grad_norm) +- optimizer = tf.train.AdagradOptimizer(config.learning_rate) ++ self.config.max_grad_norm) ++ optimizer = tf.train.AdagradOptimizer(self.config.learning_rate) ++ self._global_step = tf.contrib.framework.get_or_create_global_step() + self._train_op = optimizer.apply_gradients(zip(grads, tvars), +- global_step=tf.contrib.framework.get_or_create_global_step()) +- 
self._initial_state = cell.zero_state(tf.shape(self._x)[0], float_dtype) +- +- self.run_options = self.run_metadata = None ++ global_step=self._global_step) + + def trace_timeline(self): + self.run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) +@@ -87,17 +126,20 @@ class WSDModelTrain(object): + + # resample the batches so that each token has equal chance to become target + # another effect is to randomize the order of batches +- sentence_lens = np.array([x.shape[1] for x, _, _, in data]) +- samples = np.random.choice(len(data), size=len(data), +- p=sentence_lens/sentence_lens.sum()) ++ if self.config.optimized_batches: ++ sentence_lens = np.array([x.shape[1] for x, _, _, _ in data]) ++ samples = np.random.choice(len(data), size=len(data), ++ p=sentence_lens/sentence_lens.sum()) ++ else: ++ samples = np.random.choice(len(data), size=len(data)) + for batch_no, batch_id in enumerate(samples): +- x, y_all, subvocab = data[batch_id] ++ x, y_all, subvocab, lens = data[batch_id] + i = np.random.randint(x.shape[1]) + y = y_all[:,i] +- old_xi = x[:,i].copy() ++ old_xi = x[:,i].copy() # old_xi might be different from y because of subvocab + x[:,i] = target_id + +- feed_dict = {self._x: x, self._y: y, self._subvocab: subvocab} ++ feed_dict = {self._x: x, self._y: y, self._subvocab: subvocab, self._lens: lens} + state = session.run(self._initial_state, feed_dict) + c, h = self._initial_state + feed_dict[c] = state.c +@@ -130,64 +172,18 @@ class WSDModelTrain(object): + sess.run(self._train_op, feed_dict) + print("******** End of device placement ********") + + + + + + +diff --git a/perform_wsd.py b/perform_wsd.py +index 6e14dc4..2a7a027 100644 +--- a/perform_wsd.py ++++ b/perform_wsd.py +@@ -1,21 +1,85 @@ + import numpy as np ++import os + import tensorflow as tf ++import json + import argparse + import pickle + import pandas + from nltk.corpus import wordnet as wn ++from nltk.corpus.reader.wordnet import WordNetCorpusReader + from scipy import spatial ++import morpho_utils ++import tensor_utils as utils + + parser = argparse.ArgumentParser(description='Perform WSD using LSTM model') + parser.add_argument('-m', dest='model_path', required=True, help='path to model trained LSTM model') +-# model_path = '/var/scratch/mcpostma/wsd-dynamic-sense-vector/output/lstm-wsd-small' + parser.add_argument('-v', dest='vocab_path', required=True, help='path to LSTM vocabulary') +-# vocab_path = '/var/scratch/mcpostma/wsd-dynamic-sense-vector/output/gigaword.1m-sents-lstm-wsd.index.pkl' + parser.add_argument('-c', dest='wsd_df_path', required=True, help='input path to dataframe wsd competition') ++parser.add_argument('-l', dest='log_path', required=True, help='path where exp settings are stored') + parser.add_argument('-s', dest='sense_embeddings_path', required=True, help='path where sense embeddings are stored') + parser.add_argument('-o', dest='output_path', required=True, help='path where output wsd will be stored') ++parser.add_argument('-r', dest='results', required=True, help='path where accuracy will be reported') ++parser.add_argument('-g', dest='gran', required=True, help='sensekey | synset') ++parser.add_argument('-f', dest='mfs_fallback', required=True, help='True or False') ++parser.add_argument('-t', dest='path_case_freq', help='path to pickle with case freq') ++parser.add_argument('-a', dest='use_case_strategy', help='set to True to use morphological strategy case') ++parser.add_argument('-p', dest='path_plural_freq', help='path to pickle with plural freq') ++parser.add_argument('-b', 
dest='use_number_strategy', help='set to True to use morphological strategy number') ++parser.add_argument('-y', dest='path_lp', help='path to lp output') ++parser.add_argument('-z', dest='use_lp', help='set to True to use label propagation') ++ ++ + args = parser.parse_args() ++args.mfs_fallback = args.mfs_fallback == 'True' ++case_strategy = args.use_case_strategy == 'True' ++number_strategy = args.use_number_strategy == 'True' ++lp_strategy = args.use_lp == 'True' ++ ++case_freq = pickle.load(open(args.path_case_freq, 'rb')) ++plural_freq = pickle.load(open(args.path_plural_freq, 'rb')) ++lp_info = dict() ++ ++the_wn_version = '30' ++# load relevant wordnet ++if '171' in args.wsd_df_path: ++ the_wn_version = '171' ++ cwd = os.path.dirname(os.path.realpath(__file__)) ++ path_to_wn_dict_folder = os.path.join(cwd, 'scripts', 'wordnets', '171', 'WordNet-1.7.1', 'dict') ++ wn = WordNetCorpusReader(path_to_wn_dict_folder, None) ++ ++ ++with open(args.sense_embeddings_path + '.freq', 'rb') as infile: ++ meaning_freqs = pickle.load(infile) ++ ++with open(args.log_path, 'w') as outfile: ++ json.dump(args.__dict__, outfile) ++ ++ ++def lp_output(row, lp_info, candidate_synsets, debug=False): ++ target_lemma = row['target_lemma'] ++ target_pos = row['pos'] + ++ key = (target_lemma, target_pos) ++ ++ if key not in lp_info: ++ if debug: ++ print(target_lemma, target_pos, 'not in lp_info') ++ return None ++ ++ lp_index = row['lp_index'] ++ if lp_index is None: ++ print('lp_index is None') ++ return None ++ ++ sensekey = lp_info[(target_lemma, target_pos)][lp_index] ++ synset_identifier = None ++ ++ for synset in candidate_synsets: ++ if any([lemma.key() == sensekey ++ for lemma in synset.lemmas()]): ++ synset_identifier = synset2identifier(synset, '30') ++ ++ return synset_identifier + + def synset2identifier(synset, wn_version): + """ +@@ -33,7 +97,7 @@ def synset2identifier(synset, wn_version): + offset_8_char = offset.zfill(8) + + pos = synset.pos() +- if pos == 'j': ++ if pos in {'s', 'j'}: + pos = 'a' + + identifier = 'eng-{wn_version}-{offset_8_char}-{pos}'.format_map(locals()) +@@ -64,14 +128,14 @@ def extract_sentence_wsd_competition(row): + sentence_tokens.append(sentence_token.text) + + assert len(sentence_tokens) >= 2 +- assert pos is not None +- assert lemma is not None +- assert target_index is not None ++ #assert pos is not None # only needed for sem2013-aw ++ #assert lemma is not None, (lemma, pos) ++ #assert target_index is not None + + return target_index, sentence_tokens, lemma, pos + + +-def score_synsets(target_embedding, candidate_synsets, sense_embeddings, instance_id, lemma, pos): ++def score_synsets(target_embedding, candidate_synsets, sense_embeddings, instance_id, lemma, pos, gran, synset2higher_level): + """ + perform wsd + +@@ -85,30 +149,46 @@ def score_synsets(target_embedding, candidate_synsets, sense_embeddings, instanc + """ + highest_synsets = [] + highest_conf = 0.0 ++ candidate_freq = dict() ++ strategy = 'lstm' ++ ++ for synset in candidate_synsets: ++ if gran == 'synset': ++ candidate = synset ++ candidate_freq[synset] = meaning_freqs[candidate] ++ elif gran in {'sensekey', 'blc20', 'direct_hypernym'}: ++ candidate = None ++ if synset in synset2higher_level: ++ candidate = synset2higher_level[synset] ++ candidate_freq[synset] = meaning_freqs[candidate] ++ candidate_freq[synset] = meaning_freqs[candidate] + +- for candidate in candidate_synsets: + if candidate not in sense_embeddings: +- print('%s %s %s: candidate %s missing in sense embeddings' % (instance_id, 
lemma, pos, candidate)) ++ #print('%s %s %s: candidate %s missing in sense embeddings' % (instance_id, lemma, pos, candidate)) + continue + + cand_embedding = sense_embeddings[candidate] + sim = 1 - spatial.distance.cosine(cand_embedding, target_embedding) + + if sim == highest_conf: +- highest_synsets.append(candidate) ++ highest_synsets.append(synset) + elif sim > highest_conf: +- highest_synsets = [candidate] ++ highest_synsets = [synset] + highest_conf = sim + + if len(highest_synsets) == 1: + highest_synset = highest_synsets[0] + elif len(highest_synsets) >= 2: + highest_synset = highest_synsets[0] +- print('%s %s %s: 2> synsets with same conf %s: %s' % (instance_id, lemma, pos, highest_conf, highest_synsets)) ++ #print('%s %s %s: 2> synsets with same conf %s: %s' % (instance_id, lemma, pos, highest_conf, highest_synsets)) + else: +- highest_synset = None +- print('%s: no highest synset' % instance_id) +- return highest_synset ++ if args.mfs_fallback: ++ highest_synset = candidate_synsets[0] ++ #print('%s: no highest synset -> mfs' % instance_id) ++ strategy = 'mfs_fallback' ++ else: ++ highest_synset = None ++ return highest_synset, candidate_freq, strategy + + + # load wsd competition dataframe +@@ -117,7 +197,11 @@ wsd_df = pandas.read_pickle(args.wsd_df_path) + # add output column + wsd_df['lstm_output'] = [None for _ in range(len(wsd_df))] + wsd_df['lstm_acc'] = [None for _ in range(len(wsd_df))] +- ++wsd_df['emb_freq'] = [None for _ in range(len(wsd_df))] ++wsd_df['#_cand_synsets'] = [None for _ in range(len(wsd_df))] ++wsd_df['#_new_cand_synsets'] = [None for _ in range(len(wsd_df))] ++wsd_df['gold_in_new_cand_synsets'] = [None for _ in range(len(wsd_df))] ++wsd_df['wsd_strategy'] = [None for _ in range(len(wsd_df))] + + # load sense embeddings + with open(args.sense_embeddings_path, 'rb') as infile: +@@ -130,8 +214,9 @@ vocab = np.load(args.vocab_path) + with tf.Session() as sess: # your session object + saver = tf.train.import_meta_graph(args.model_path + '.meta', clear_devices=True) + saver.restore(sess, args.model_path) +- predicted_context_embs = sess.graph.get_tensor_by_name('Model/predicted_context_embs:0') +- x = sess.graph.get_tensor_by_name('Model/x:0') ++ x, predicted_context_embs, lens = utils.load_tensors(sess) ++ #predicted_context_embs = sess.graph.get_tensor_by_name('Model/predicted_context_embs:0') ++ #x = sess.graph.get_tensor_by_name('Model/Placeholder:0') + + for row_index, row in wsd_df.iterrows(): + target_index, sentence_tokens, lemma, pos = extract_sentence_wsd_competition(row) +@@ -139,25 +224,100 @@ with tf.Session() as sess: # your session object + target_id = vocab[''] + sentence_as_ids = [vocab.get(w) or vocab[''] for w in sentence_tokens] + sentence_as_ids[target_index] = target_id +- target_embedding = sess.run(predicted_context_embs, {x: [sentence_as_ids]})[0] + +- # load candidate synsets +- synsets = wn.synsets(lemma, pos=pos) +- candidate_synsets = {synset2identifier(synset, wn_version='30') +- for synset in synsets} ++ target_embeddings = sess.run(predicted_context_embs, {x: [sentence_as_ids], ++ lens: [len(sentence_as_ids)]}) ++ for target_embedding in target_embeddings: ++ break ++ ++ #target_embedding = sess.run(predicted_context_embs, {x: [sentence_as_ids]})[0] ++ ++ # load token object ++ token_obj = row['tokens'][0] ++ ++ # morphology reduced polysemy ++ pos = row['pos'] ++ if the_wn_version in {'171'}: ++ pos = None ++ ++ ++ candidate_synsets, \ ++ new_candidate_synsets, \ ++ gold_in_candidates = morpho_utils.candidate_selection(wn, ++ 
token=token_obj.text, ++ target_lemma=row['target_lemma'], ++ pos=row['pos'], ++ morphofeat=token_obj.morphofeat, ++ use_case=case_strategy, ++ use_number=number_strategy, ++ gold_lexkeys=row['lexkeys'], ++ case_freq=case_freq, ++ plural_freq=plural_freq, ++ debug=False) ++ ++ the_chosen_candidates = [synset2identifier(synset, wn_version=the_wn_version) ++ for synset in new_candidate_synsets] ++ ++ print() ++ print(the_chosen_candidates, gold_in_candidates) ++ # get mapping to higher abstraction level ++ synset2higher_level = dict() ++ if args.gran in {'sensekey', 'blc20', 'direct_hypernym'}: ++ label = 'synset2%s' % args.gran ++ synset2higher_level = row[label] ++ ++ # determine wsd strategy used ++ if len(candidate_synsets) == 1: ++ wsd_strategy = 'monosemous' ++ elif len(new_candidate_synsets) == 1: ++ wsd_strategy = 'morphology_solved' ++ elif len(candidate_synsets) == len(new_candidate_synsets): ++ wsd_strategy = 'lstm' ++ elif len(new_candidate_synsets) < len(candidate_synsets): ++ wsd_strategy = 'morphology+lstm' ++ ++ # possibly include label propagation strategy ++ if lp_strategy: ++ lp_result = lp_output(row, lp_info, new_candidate_synsets, debug=False) ++ ++ if lp_result: ++ the_chosen_candidates = [lp_result] ++ wsd_strategy = 'lp' + + # perform wsd +- if len(candidate_synsets) >= 2: +- chosen_synset = score_synsets(target_embedding, candidate_synsets, sense_embeddings, instance_id, lemma, pos) ++ if len(the_chosen_candidates) >= 2: ++ chosen_synset, \ ++ candidate_freq, \ ++ strategy = score_synsets(target_embedding, ++ the_chosen_candidates, ++ sense_embeddings, ++ instance_id, ++ lemma, ++ pos, ++ args.gran, ++ synset2higher_level) ++ ++ #if strategy == 'mfs_fallback': ++ # wsd_strategy = 'mfs_fallback' ++ + else: +- chosen_synset = candidate_synsets.pop() ++ chosen_synset = None ++ if the_chosen_candidates: ++ chosen_synset = the_chosen_candidates[0] ++ candidate_freq = dict() + + # add to dataframe + wsd_df.set_value(row_index, col='lstm_output', value=chosen_synset) ++ wsd_df.set_value(row_index, col='#_cand_synsets', value=len(candidate_synsets)) ++ wsd_df.set_value(row_index, col='#_new_cand_synsets', value=len(new_candidate_synsets)) ++ wsd_df.set_value(row_index, col='gold_in_new_cand_synsets', value=gold_in_candidates) ++ wsd_df.set_value(row_index, col='wsd_strategy', value=wsd_strategy) + + # score it +- lstm_acc = chosen_synset in row['wn30_engs'] ++ print(chosen_synset, row['source_wn_engs']) ++ lstm_acc = chosen_synset in row['source_wn_engs'] # used to be wn30_engs + wsd_df.set_value(row_index, col='lstm_acc', value=lstm_acc) ++ wsd_df.set_value(row_index, col='emb_freq', value=candidate_freq) + + if lstm_acc: + num_correct += 1 +@@ -167,6 +327,9 @@ print(num_correct) + # save it + wsd_df.to_pickle(args.output_path) + ++with open(args.results, 'w') as outfile: ++ outfile.write('%s' % num_correct) ++ + + + +diff --git a/prepare-lstm-wsd.py b/prepare-lstm-wsd.py +index f3e4d1b..52db01c 100644 +--- a/prepare-lstm-wsd.py ++++ b/prepare-lstm-wsd.py +@@ -7,6 +7,7 @@ Read a simple text file (one sentence per line) and produce these files: + - .train.npz: training batches (each batch contains roughly the same + number of tokens but differing number of sentences depends on sentence length) + - .dev.npz: development dataset (as big as one epoch) ++- + + @author: Minh Le + ''' +@@ -20,23 +21,25 @@ import pickle + import re + import numpy as np + import subprocess +-from tensorflow.contrib.labeled_tensor import batch ++from random import Random ++from collections import 
Counter ++from utils import progress, count_lines_fast ++from configs import preprocessed_gigaword_path, output_dir ++from version import version + + dev_sents = 20000 # absolute maximum + dev_portion = 0.01 # relative maximum +-batch_size = 128000 # words ++# if you get OOM (out of memory) error, reduce this number ++batch_size = 60000 # words + vocab_size = 10**6 + min_count = 5 + +-special_symbols = ['', '', ''] ++inp_path = preprocessed_gigaword_path ++# inp_path = 'preprocessed-data/gigaword_1m-sents.txt' # for debugging ++out_dir = os.path.join('preprocessed-data', version) ++out_path = os.path.join(out_dir, 'gigaword-for-lstm-wsd') + +-def progress(it): +- start = time() +- for i, val in enumerate(it): +- yield(val) +- if (i+1) % 1000000 == 0: +- sys.stderr.write('processed %d items, elapsed time: %.1f minutes...\n' +- %(i+1, (time()-start)/60)) ++special_symbols = ['', '', ''] + + def _build_vocab(filename): + sys.stderr.write('Building vocabulary...\n') +@@ -55,87 +58,146 @@ def _build_vocab(filename): + return word2id, words + + def sort_sentences(inp_path, out_path): ++ start = time() + cmd = ('cat %s | python3 scripts/sentlen.py --min 6 --max 100 ' +- '| sort -T output -k1,1g -k2 | uniq > %s' +- %(inp_path, out_path)) ++ '| sort -T %s -k1,1g -k2 | uniq > %s' ++ %(inp_path, output_dir, out_path)) + sys.stderr.write('%s\n' %cmd) + status = subprocess.call(cmd, shell=True) ++ sys.stderr.write('sorting finished after %.1f minutes...\n' %((time()-start)/60)) + assert status == 0 + +-def lookup_and_iter_sents(filename, word_to_id): ++def lookup_and_iter_sents(filename, word2id, include_ids=None, exclude_ids=None): + unkn_id = word2id[''] + with codecs.open(filename, 'r', 'utf-8') as f: +- for line in f: +- words = line.strip().split() +- yield [word_to_id.get(word) or unkn_id for word in words] ++ for sent_id, line in enumerate(f): ++ if ((include_ids is None or sent_id in include_ids) and ++ (exclude_ids is None or sent_id not in exclude_ids)): ++ words = line.strip().split() ++ yield [word2id.get(word) or unkn_id for word in words] + +-class PadFunc(object): +- +- dry_run=False +- +- def __init__(self): +- self.total = 0 +- self.pads = 0 +- def __call__(self, sents, max_len, pad_id): +- if self.dry_run: +- arr = np.empty(0) +- value_count = sum(1 for s in sents for _ in s) +- size = len(sents) * max_len +- else: +- arr = np.zeros((len(sents), max_len), dtype=np.int32) +- size = arr.size +- arr.fill(pad_id) +- value_count = 0 +- for i, s in enumerate(sents): +- for j, v in enumerate(s): +- arr[i,j] = v +- value_count += 1 +- self.pads += (size - value_count) +- self.total += size +- return arr +- +-def pad_batches(inp_path, word2id): ++def pad(sents, max_len, pad_id): ++ arr = np.empty((len(sents), max_len), dtype=np.int32) ++ arr.fill(pad_id) ++ for i, s in enumerate(sents): ++ arr[i, :len(s)] = s ++ return arr ++ ++def pad_batches(inp_path, word2id, include_ids, exclude_ids, max_sents=-1): + sys.stderr.write('Dividing and padding...\n') +- pad = PadFunc() + pad_id = word2id[''] +- dev = [] + batches = {} +- last_max_len = 0 +- last_batch = [] +- with open(inp_path) as f: total_sents = sum(1 for line in f) +- for sent in progress(lookup_and_iter_sents(inp_path, word2id)): +- if (len(dev) < dev_sents and len(dev) < dev_portion*total_sents +- and np.random.rand() < 0.01): +- dev.append(sent) +- else: +- last_max_len = max(last_max_len, len(sent)) +- last_batch.append(sent) +- if len(last_batch)*last_max_len >= batch_size: +- batches['batch%d' %len(batches)] = pad(last_batch, 
last_max_len, pad_id) +- last_max_len = 0 +- last_batch = [] +- if last_max_len > 0: +- batches['batch%d' %len(batches)] = pad(last_batch, last_max_len, pad_id) +- dev_lens = np.array([len(s) for s in dev], dtype=np.int32) +- dev_padded = PadFunc()(dev, max(dev_lens), pad_id) ++ sent_lens = [] ++ curr_max_len = 0 ++ curr_batch = [] ++ batch_id = 0 ++ for sent in progress(lookup_and_iter_sents(inp_path, word2id, ++ include_ids, exclude_ids)): ++ new_size = (len(curr_batch)+1) * max(curr_max_len,len(sent)) ++ if new_size > batch_size or (max_sents > 0 and len(curr_batch) >= max_sents): ++ batches['batch%d' %batch_id] = pad(curr_batch, curr_max_len, pad_id) ++ batches['lens%d' %batch_id] = np.array([len(s) for s in curr_batch], dtype=np.int32) ++ batch_id += 1 ++ curr_max_len = 0 ++ curr_batch = [] ++ curr_max_len = max(curr_max_len, len(sent)) ++ curr_batch.append(sent) ++ sent_lens.append(len(sent)) ++ if curr_batch: ++ batches['batch%d' %batch_id] = pad(curr_batch, curr_max_len, pad_id) ++ batches['lens%d' %batch_id] = np.array([len(s) for s in curr_batch], dtype=np.int32) ++ batch_id += 1 # important to count num batches correctly ++ sent_lens = np.array(sent_lens, dtype=np.int32) + sys.stderr.write('Dividing and padding... Done.\n') +- sizes = np.array([b.size for b in batches.values()]) +- if len(batches) >= 2: ++ sizes = np.array([batches['batch%d'%i].size for i in range(batch_id)]) ++ if batch_id >= 2: + sys.stderr.write('Divided into %d batches (%d elements each, std=%d, ' + 'except last batch of %d).\n' +- %(len(batches), sizes[:-1].mean(), sizes[:-1].std(), sizes[-1])) ++ %(batch_id, sizes[:-1].mean(), sizes[:-1].std(), sizes[-1])) + else: +- assert len(batches) == 1 ++ assert batch_id == 1 + sys.stderr.write('Created 1 batch of %d elements.\n' %sizes[0]) +- sys.stderr.write('Added %d elements as padding (%.2f%%).\n' +- %(pad.pads, pad.pads*100.0/pad.total)) +- sys.stderr.write('Consumed roughly %.2f GiB.\n' +- %(pad.total*4/float(2**30))) +- return batches, dev_padded, dev_lens ++ sys.stderr.write('Sentence lengths: %.5f (std=%.5f)\n' ++ %(sent_lens.mean(), sent_lens.std())) ++ return batches + +-if __name__ == '__main__': +- inp_path, out_path = sys.argv[1:] ++ ++def shuffle_and_pad_batches(inp_path, word2id, dev_sent_ids): ++ sys.stderr.write('Reading lengths...\n') ++ lens = [] ++ with codecs.open(inp_path, 'r', 'utf-8') as f: ++ for line in progress(f, label='sentences'): ++ # this is different from counting the blank spaces because some words ++ # are separated by double spaces and there might be an additional ++ # whitespace at the end of a line ++ lens.append(len(line.strip().split())) ++ lens = np.array(lens, dtype=np.int32) ++ sys.stderr.write('Reading lengths... 
Done.\n') ++ ++ sys.stderr.write('Calculating batch shapes...\n') ++ indices = list(range(len(lens))) ++ rng = Random(29) ++ rng.shuffle(indices) ++ total_sents = len(lens) ++ batches = {} ++ curr_max_len = 0 ++ curr_batch_lens = [] ++ sent2batch = {} ++ batch_id = 0 ++ for sent_id in progress(indices, label='sentences'): ++ l = lens[sent_id] ++ if sent_id not in dev_sent_ids: ++ new_size = (len(curr_batch_lens)+1) * max(curr_max_len,l) ++ if new_size >= batch_size: ++ batches['batch%d' %batch_id] = \ ++ np.empty((len(curr_batch_lens), max(curr_batch_lens)), dtype=np.int32) ++ batches['lens%d' %batch_id] = np.array(curr_batch_lens, dtype=np.int32) ++ batch_id += 1 ++ curr_max_len = 0 ++ curr_batch_lens = [] ++ curr_max_len = max(curr_max_len, l) ++ curr_batch_lens.append(l) ++ sent2batch[sent_id] = 'batch%d' %batch_id ++ if curr_batch_lens: ++ batches['batch%d' %batch_id] = \ ++ np.empty((len(curr_batch_lens), max(curr_batch_lens)), dtype=np.int32) ++ batches['lens%d' %batch_id] = np.array(curr_batch_lens, dtype=np.int32) ++ batch_id += 1 # important to count num batches correctly ++ sys.stderr.write('Calculating batch shapes... Done.\n') + ++ sys.stderr.write('Dividing and padding...\n') ++ pad_id = word2id[''] ++ for i in range(batch_id): batches['batch%d'%i].fill(pad_id) ++ nonpad_count = 0 ++ sent_counter = Counter() ++ for sent_id, sent in progress(enumerate(lookup_and_iter_sents(inp_path, word2id)), label='sentences'): ++ assert lens[sent_id] == len(sent) ++ batch_name = sent2batch.get(sent_id) ++ if batch_name is not None: # could be in dev set ++ batches[batch_name][sent_counter[batch_name],:len(sent)] = sent ++ nonpad_count += len(sent) ++ sent_counter[batch_name] += 1 ++ # check that we filled all arrays ++ for batch_name in sent_counter: ++ assert sent_counter[batch_name] == batches[batch_name].shape[0] ++ sys.stderr.write('Dividing and padding... Done.\n') ++ ++ sizes = np.array([batches['batch%d'%i].size for i in range(batch_id)]) ++ if batch_id >= 2: ++ sys.stderr.write('Divided into %d batches (%d elements each, std=%d, ' ++ 'except last batch of %d).\n' ++ %(batch_id, sizes[:-1].mean(), sizes[:-1].std(), sizes[-1])) ++ else: ++ assert batch_id == 1 ++ sys.stderr.write('Created 1 batch of %d elements.\n' %sizes[0]) ++ total = sum(sizes) ++ pad_count = total - nonpad_count ++ sys.stderr.write('Sentence lengths: %.5f (std=%.5f)\n' ++ %(lens.mean(), lens.std())) ++ return batches ++ ++def run(): ++ os.makedirs(out_dir, exist_ok=True) + index_path = out_path + '.index.pkl' + if os.path.exists(index_path): + sys.stderr.write('Reading vocabulary from %s... ' %index_path) +@@ -146,17 +208,43 @@ if __name__ == '__main__': + word2id, words = _build_vocab(inp_path) + with open(index_path, 'wb') as f: pickle.dump(word2id, f) + +- sorted_sents_path = inp_path + '.sorted' ++ sorted_sents_path = out_path + '.sorted' + if os.path.exists(sorted_sents_path): + sys.stderr.write('Sentences are already sorted at %s\n' %sorted_sents_path) + else: + sort_sentences(inp_path, sorted_sents_path) ++ ++ total_sents = count_lines_fast(sorted_sents_path) ++ real_num_dev_sents = int(min(dev_sents, dev_portion*total_sents)) ++ np.random.seed(918) ++ dev_sent_ids = set(np.random.choice(total_sents, size=real_num_dev_sents, replace=False)) + + train_path = out_path + '.train.npz' + dev_path = out_path + '.dev.npz' +- if os.path.exists(train_path): +- sys.stderr.write('Result already exists: %s. 
Skipped.\n' %train_path) ++ shuffled_train_path = out_path + '-shuffled.train.npz' ++ if os.path.exists(shuffled_train_path): ++ sys.stderr.write('Result already exists: %s. Skipped.\n' %shuffled_train_path) + else: +- batches, dev_data, dev_lens = pad_batches(sorted_sents_path, word2id) ++ print("- Training set:") ++ batches = pad_batches(sorted_sents_path, word2id, None, dev_sent_ids) + np.savez(train_path, **batches) +- np.savez(dev_path, data=dev_data, lens=dev_lens) ++ print("- Development set:") ++ batches = pad_batches(sorted_sents_path, word2id, dev_sent_ids, None, 768) ++ np.savez(dev_path, **batches) ++ print("- Shuffled training set:") ++ batches = shuffle_and_pad_batches(sorted_sents_path, word2id, dev_sent_ids) ++ np.savez(shuffled_train_path, **batches) ++ ++ for percent in (1, 10, 25, 50, 75): ++ num_lines = int(percent / 100.0 * total_sents) ++ sampled_ids = set(np.random.choice(total_sents, size=num_lines, replace=False)) ++ pc_train_path = out_path + ('_%02d-pc.train.npz' %percent) ++ if os.path.exists(pc_train_path): ++ sys.stderr.write('%02d%% dataset already exists: %s. Skipped.\n' %pc_train_path) ++ else: ++ print("- Reduced training set (%02d%%):" %percent) ++ batches = pad_batches(sorted_sents_path, word2id, sampled_ids, dev_sent_ids) ++ np.savez(pc_train_path, **batches) ++ ++if __name__ == '__main__': ++ run() +diff --git a/process-gigaword.py b/process-gigaword.py +index e568001..7bf2b0d 100644 +--- a/process-gigaword.py ++++ b/process-gigaword.py +@@ -2,9 +2,17 @@ import os + import gzip + from bs4 import BeautifulSoup + import spacy +-nlp = spacy.load('en_default') ++from configs import gigaword_path, preprocessed_gigaword_path ++import codecs ++from utils import progress ++from version import version + import sys + ++def custom_pipeline(nlp): ++ return (nlp.tagger, nlp.parser) ++ ++nlp = spacy.load('en_default', create_pipeline=custom_pipeline) ++ + def iter_paragraphs(paths): + for path in paths: + with gzip.open(path) as f: +@@ -13,7 +21,6 @@ def iter_paragraphs(paths): + paras = soup.find_all('p') + for p in paras: yield p.text.strip() + +- + def iter_files(root_dir): + for root, dirs, files in os.walk(root_dir): + for fname in files: +@@ -21,21 +28,24 @@ def iter_files(root_dir): + yield os.path.join(root, fname) + + def iter_sents(paragraphs): +- for i, doc in enumerate(nlp.pipe(paragraphs, batch_size=10000, n_threads=32)): +- assert isinstance(doc, spacy.tokens.doc.Doc) and doc.is_parsed ++ for doc in nlp.pipe(paragraphs, batch_size=10000): + for sent in doc.sents: + yield [str(tok).strip() for tok in sent] +- if (i+1) % 10000 == 0: +- sys.stderr.write('%10d' %(i+1)) +- if (i+1) % 100000 == 0: +- sys.stderr.write('\n') + +-gigaword_path = 'data/gigaword' +-example_file = 'data/gigaword/gigaword_eng_5_d1/data/afp_eng/afp_eng_200112.gz' ++ ++# example_file = 'data/gigaword/gigaword_eng_5_d1/data/afp_eng/afp_eng_200112.gz' + + if __name__ == '__main__': +- for sent in iter_sents(iter_paragraphs(iter_files(gigaword_path))): +- for tok in sent: +- sys.stdout.write(tok) +- sys.stdout.write(' ') +- sys.stdout.write('\n') ++ dir_ = os.path.join('preprocessed-data', version) ++ os.makedirs(dir_, exist_ok=True) ++ preprocessed_gigaword_path = os.path.join(dir_, 'gigaword.txt') ++ sys.stderr.write('Writing to %s\n' %preprocessed_gigaword_path) ++ with codecs.open(preprocessed_gigaword_path, 'w', 'utf-8') as f: ++ paths = list(iter_files(gigaword_path)) ++ paths.sort() # remove difference between machines ++ paths = progress(paths, ticks=1, label='files', 
max_=len(paths)) ++ for sent in iter_sents(iter_paragraphs(paths)): ++ for tok in sent: ++ f.write(tok) ++ f.write(' ') ++ f.write('\n') +diff --git a/scripts/semcor_format2LSTM_input.py b/scripts/semcor_format2LSTM_input.py +index dfc369f..3990f55 100644 +--- a/scripts/semcor_format2LSTM_input.py ++++ b/scripts/semcor_format2LSTM_input.py +@@ -4,7 +4,7 @@ from nltk.corpus import wordnet as wn + from lxml import html, etree + from collections import defaultdict + import wn_utils +- ++from datetime import datetime + + def get_lemma_pos_of_sensekey(sense_key): + """ +@@ -77,6 +77,7 @@ def load_instance_id2offset(mapping_path, sensekey2offset, debug=False): + :return: instance_id -> offset + """ + instance_id2offset = dict() ++ instance_id2sensekeys = dict() + + more_than_one_offset = 0 + no_offsets = 0 +@@ -85,6 +86,8 @@ def load_instance_id2offset(mapping_path, sensekey2offset, debug=False): + for line in infile: + instance_id, *sensekeys = line.strip().split() + ++ instance_id2sensekeys[instance_id] = sensekeys ++ + offsets = {sensekey2offset[sensekey] + for sensekey in sensekeys + if sensekey in sensekey2offset} +@@ -104,18 +107,26 @@ def load_instance_id2offset(mapping_path, sensekey2offset, debug=False): + no_offsets += 1 + + +- return instance_id2offset ++ return instance_id2offset, instance_id2sensekeys + + + # experiment settings + wn_version = '30' + corpora_to_include = ['semcor', +- #'mun' ++ #'mun' + ] # semcor | mun + + accepted_pos = {'NOUN'} + entailment_setting = 'any_hdn' # lemma_hdn | any_hdn +-lemma2annotations = defaultdict(dict) ++#lemma2annotations = defaultdict(dict) ++ ++ ++ ++#path_wn20_to_wn30 = '/Users/marten/Downloads/mappings-upc-2007/mapping-20-30/wn20-30.noun' ++#path_wn20_to_domain = '/Users/marten/git/semantic_class_manager/resources/wn-domains-3.2/wn-domains-3.2-20070223' ++#wn30_domain, domain_wn30 = wn_utils.get_synset2domain(path_wn20_to_domain, ++# path_wn20_to_wn30) ++ + + if wn_version == '30': + path_to_wn_dict_folder = str(wn._get_root()) # change this for other wn versions +@@ -129,12 +140,15 @@ elif corpora_to_include == ['semcor']: + input_xml_path = '../data/WSD_Training_Corpora/SemCor/semcor.data.xml' + input_mapping_path = '../data/WSD_Training_Corpora/SemCor/semcor.gold.key.txt' + ++sensekey_output_path = 'sensekey-' + '_'.join(corpora_to_include) + '.txt' + synset_output_path = 'synset-' + '_'.join(corpora_to_include) + '.txt' + hdn_output_path = '-'.join(['hdn', + '_'.join(corpora_to_include), + '_'.join(accepted_pos), + entailment_setting]) + '.txt' + ++#domain_output_path = 'domain-' + '_'.join(corpora_to_include) + '.txt' ++#domain_mapping_path = domain_output_path + '.mapping' + + # precompute all hdns + lemma_pos2offsets = wn_utils.load_lemma_pos2offsets(path_to_wn_index_sense) +@@ -172,14 +186,18 @@ my_wn_reader = WordNetCorpusReader(path_to_wn_dict_folder, None) + sensekey2offset = load_mapping_sensekey2offset(path_to_wn_index_sense, + wn_version) + +-instance_id2offset = load_instance_id2offset(input_mapping_path, +- sensekey2offset, +- debug=False) ++instance_id2offset, instance_id2sensekeys = load_instance_id2offset(input_mapping_path, ++ sensekey2offset, ++ debug=False) + + my_html_tree = html.parse(input_xml_path) + +-hdn_outfile = open(hdn_output_path, 'w') ++sensekey_outfile = open(sensekey_output_path, 'w') + synset_outfile = open(synset_output_path, 'w') ++#domain_outfile = open(domain_output_path, 'w') ++hdn_outfile = open(hdn_output_path, 'w') ++ ++domain2freq = defaultdict(int) + + for corpus_node in 
my_html_tree.xpath('body/corpus'): + +@@ -191,11 +209,16 @@ for corpus_node in my_html_tree.xpath('body/corpus'): + for sent_node in corpus_node.xpath('text/sentence'): + + sentence_tokens = [] ++ sensekey_annotations = [] + synset_annotations = [] + hdn_annotations = [] ++ domain_annotations = [] + + for child_el in sent_node.getchildren(): + ++ if child_el.sourceline % 10000 == 0: ++ print(child_el.sourceline, datetime.now()) ++ + lemma = child_el.get('lemma') + token = child_el.text + pos = child_el.get('pos') +@@ -204,8 +227,11 @@ for corpus_node in my_html_tree.xpath('body/corpus'): + assert token is not None + + sentence_tokens.append(token) ++ ++ sent_sensekey_annotations = [] + sent_synset_annotations = [] + sent_hdn_annotations = [] ++ sent_domain_annotations = [] + + if all([child_el.tag == 'instance', + pos in accepted_pos]): +@@ -214,13 +240,20 @@ for corpus_node in my_html_tree.xpath('body/corpus'): + synset_id = instance_id2offset[instance_id] + + # update counter for logging purposes +- if synset_id not in lemma2annotations[lemma]: +- lemma2annotations[lemma][synset_id] = {'hdn': 0, 'synset': 0} ++ #if synset_id not in lemma2annotations[lemma]: ++ # lemma2annotations[lemma][synset_id] = {'hdn': 0, 'synset': 0} ++ #lemma2annotations[lemma][synset_id]['synset'] += 1 + +- lemma2annotations[lemma][synset_id]['synset'] += 1 ++ sent_synset_annotations.append(synset_id) + ++ sensekeys = instance_id2sensekeys[instance_id] ++ for sensekey in sensekeys: ++ sent_sensekey_annotations.append(sensekey) + +- sent_synset_annotations.append(synset_id) ++ #if synset_id in wn30_domain: ++ # domain = wn30_domain[synset_id] ++ # domain2freq[domain] += 1 ++ # sent_domain_annotations.append(domain) + + # option lemma-based hdn + if entailment_setting == 'lemma_hdn': +@@ -234,21 +267,26 @@ for corpus_node in my_html_tree.xpath('body/corpus'): + hdn = graph_info[synset_id]['under_lcs'] + + if hdn is not None: +- sent_hdn_annotations.append(hdn) +- +- lemma2annotations[lemma][synset_id]['hdn'] += 1 ++ sent_hdn_annotations.append('%s__%s' % (synset_id, hdn)) ++ #lemma2annotations[lemma][synset_id]['hdn'] += 1 + + + elif entailment_setting == 'any_hdn': + hypernyms = sy_id2hypernyms[synset_id] + for hypernym in hypernyms: + if hypernym in all_hdns: +- sent_hdn_annotations.append(hypernym) +- +- lemma2annotations[lemma][synset_id]['hdn'] += 1 ++ sent_hdn_annotations.append('%s_%s' % (synset_id, hypernym)) ++ #lemma2annotations[lemma][synset_id]['hdn'] += 1 + ++ sensekey_annotations.append(sent_sensekey_annotations) + synset_annotations.append(sent_synset_annotations) + hdn_annotations.append(sent_hdn_annotations) ++ #domain_annotations.append(sent_domain_annotations) ++ ++ ++ for sensekey_sentence in wn_utils.generate_training_instances(sentence_tokens, ++ sensekey_annotations): ++ sensekey_outfile.write(sensekey_sentence + '\n') + + for synset_sentence in wn_utils.generate_training_instances(sentence_tokens, + synset_annotations): +@@ -258,28 +296,39 @@ for corpus_node in my_html_tree.xpath('body/corpus'): + hdn_annotations): + hdn_outfile.write(hdn_sentence + '\n') + +-hdn_outfile.close() ++ #for domain_sentence in wn_utils.generate_training_instances(sentence_tokens, ++ # domain_annotations): ++ # domain_outfile.write(domain_sentence + '\n') ++ ++ ++sensekey_outfile.close() + synset_outfile.close() ++hdn_outfile.close() ++#domain_outfile.close() + + + per_lemma = [] +-per_synset = [] ++per_sensekey = [] + per_hdn = [] ++synset2freq = defaultdict(int) + meanings = set() + +-for lemma, info in 
lemma2annotations.items(): +- lemma_count = 0 +- for sy_id, sy_info in info.items(): +- lemma_count += sy_info['synset'] +- per_synset.append(sy_info['synset']) +- per_hdn.append(sy_info['hdn']) +- +- meanings.add(sy_id) +- +- per_lemma.append(lemma_count) +- +-print('number of unique lemmas: %s' % len(lemma2annotations)) +-print('number of unique meanings: %s' % len(meanings)) +-print('min avg max lemma', min(per_lemma), round(sum(per_lemma) / len(per_lemma), 2), max(per_lemma)) +-print('min avg max synset', min(per_synset), round(sum(per_synset) / len(per_synset), 2), max(per_synset)) +-print('min avg max hdn', min(per_hdn), round(sum(per_hdn) / len(per_hdn), 2), max(per_hdn)) ++#for lemma, info in lemma2annotations.items(): ++# lemma_count = 0 ++# for sy_id, sy_info in info.items(): ++# lemma_count += sy_info['synset'] ++# per_sensekey.append(sy_info['synset']) ++# per_hdn.append(sy_info['hdn']) ++# ++# synset2freq[sy_id] += sy_info['synset'] ++# meanings.add(sy_id) ++# ++# per_lemma.append(lemma_count) ++ ++#print('number of unique lemmas: %s' % len(lemma2annotations)) ++#print('number of unique meanings: %s' % len(meanings)) ++#print('# min avg max total lemma', len(per_lemma), min(per_lemma), round(sum(per_lemma) / len(per_lemma), 2), max(per_lemma), sum(per_lemma)) ++#print('# min avg max total sensekey', len(per_sensekey), min(per_sensekey), round(sum(per_sensekey) / len(per_sensekey), 2), max(per_sensekey), sum(per_sensekey)) ++#print('# min avg max total synset', len(synset2freq), min(synset2freq.values()), round(sum(synset2freq.values()) / len(synset2freq), 2), max(synset2freq.values()), sum(synset2freq.values())) ++#print('# min avg max total hdn', len(per_hdn), min(per_hdn), round(sum(per_hdn) / len(per_hdn), 2), max(per_hdn), sum(per_hdn)) ++#print('# min avg max total domain', len(domain2freq), min(domain2freq.values()), round(sum(domain2freq.values()) / len(domain2freq), 2), max(domain2freq.values()), sum(domain2freq.values())) +diff --git a/scripts/wn_utils.py b/scripts/wn_utils.py +index a3f69bf..8acdd47 100644 +--- a/scripts/wn_utils.py ++++ b/scripts/wn_utils.py +@@ -1,8 +1,154 @@ +-import nltk + import itertools + from collections import defaultdict + + ++def candidate_selection(wn, ++ token, ++ target_lemma, ++ pos, ++ use_case=False, ++ use_number=False, ++ gold_lexkeys=set(), ++ case_freq=None, ++ plural_freq=None, ++ debug=False): ++ """ ++ return candidate synsets of a token ++ ++ :param str targe_lemma: a token, e.g. 
Congress ++ :param str pos: supported: n ++ ++ :param bool use_case: if set to True, ++ only synsets are returned that contain the token in upper case ++ :param str gold_lexkeys: {'congress%1:14:00::'} ++ ++ :rtype: tuple ++ :return: (candidate_synsets, ++ new_candidate_synsets, ++ gold_in_candidates) ++ """ ++ # assertions on input arguments ++ if use_case: ++ assert case_freq is not None, 'case_freq should not be None' ++ ++ if use_number: ++ assert plural_freq is not None, 'plural_freq should not be None' ++ ++ apply_morph_strategy = True ++ ++ # check if candidate_synsets without morphological information is monosemous ++ candidate_synsets = wn.synsets(target_lemma, pos) ++ if len(candidate_synsets) == 1: ++ apply_morph_strategy = False ++ ++ new_candidate_synsets = [] ++ gold_in_candidates = False ++ ++ if debug: ++ print(candidate_synsets) ++ ++ for synset in candidate_synsets: ++ ++ add = False ++ ++ if all([use_number, ++ apply_morph_strategy]): ++ ++ key = (target_lemma.lower(), pos) ++ lemma_plural_freq = dict() ++ if key in plural_freq: ++ lemma_plural_freq = plural_freq[(target_lemma.lower(), pos)] ++ ++ plural_match = False ++ for lemma in synset.lemmas(): ++ if lemma.key() in lemma_plural_freq: ++ plural_match = True ++ ++ if plural_match: ++ add = True ++ ++ if all([use_case, ++ apply_morph_strategy]): ++ ++ # check synset_lemma ++ capital_lemma_match = any([lemma.name() == token ++ for lemma in synset.lemmas()]) ++ ++ # check sense annotated corpus ++ key = (target_lemma.lower(), pos) ++ lemma_case_freq = dict() ++ if key in case_freq: ++ lemma_case_freq = case_freq[(target_lemma.lower(), pos)] ++ ++ freq_match = False ++ for lemma in synset.lemmas(): ++ if lemma.key() in lemma_case_freq: ++ freq_match = True ++ ++ if any([capital_lemma_match, # whether lemma matches with token ++ freq_match]): # whether lemma of sensekey is used with capital ++ add = True ++ ++ if add: ++ new_candidate_synsets.append(synset) ++ ++ # check if gold in candidate ++ lexkeys = {lemma.key() for lemma in synset.lemmas()} ++ if any(gold_key in lexkeys ++ for gold_key in gold_lexkeys): ++ gold_in_candidates = True ++ ++ # if no synsets remain, use original ones ++ if not new_candidate_synsets: ++ new_candidate_synsets = candidate_synsets ++ ++ return candidate_synsets, new_candidate_synsets, gold_in_candidates ++ ++ ++ ++def get_synset2domain(path_wn20_to_domain, ++ path_wn20_to_wn30): ++ """ ++ create mapping between wn30 and domain and vice versa ++ ++ :param str path_wn20_to_domain: wn-domains-3.2-20070223 file ++ :param str path_wn20_to_wn30: wn20-30.noun file from upc mappings ++ ++ :rtype: tuple ++ :return: (wn30_domain, domain_wn30) ++ """ ++ wn30_domain = dict() ++ domain_wn30 = defaultdict(set) ++ ++ wn20_wn30 = dict() ++ with open(path_wn20_to_wn30) as infile: ++ for line in infile: ++ split = line.strip().split() ++ if len(split) == 3: ++ offset_20, *values = line.strip().split() ++ offset_30 = '' ++ conf = 0.0 ++ for index in range(0, len(values), 2): ++ an_offset = values[index] ++ a_conf = float(values[index + 1]) ++ if a_conf > conf: ++ offset_30 = an_offset ++ conf = a_conf ++ wn20_wn30[offset_20 + '-n'] = offset_30 + '-n' ++ ++ with open(path_wn20_to_domain) as infile: ++ for line in infile: ++ sy_id, domain = line.strip().split('\t') ++ if all([sy_id in wn20_wn30, ++ sy_id.endswith('n')]): ++ wn30 = wn20_wn30[sy_id] ++ ++ wn30_domain['eng-30-' + wn30] = domain ++ domain_wn30[domain].add('eng-30-' + wn30) ++ ++ return wn30_domain, domain_wn30 ++ ++ + def 
generate_training_instances(sentence_lemmas, annotations): + """ + given the lemmas in a sentence with its annotations (can be more than one) +@@ -37,6 +183,68 @@ def generate_training_instances(sentence_lemmas, annotations): + + return instances + ++ ++def generate_training_instances_v2(sentence_tokens, ++ sentence_lemmas, ++ sentence_pos, ++ annotations): ++ """ ++ given the lemmas in a sentence with its annotations (can be more than one) ++ generate all training instances for that sentence ++ ++ e.g. ++ sentence_tokens = ['the', 'man', 'meets', 'women'] ++ sentence_lemmas = ['the', 'man', 'meet', 'woman'] ++ sentence_pos = ['', 'n', 'v', 'n'] ++ annotations = [[], ['1', '2' ], ['4'], ['5', '6']] ++ ++ would result in ++ ('man', 'n', '1', ['the', 'man', 'meets', 'women'], 'the man---1 meets women', 1) ++ ('man', 'n', '2', ['the', 'man', 'meets', 'women'], 'the man---2 meets women', 1) ++ ('meet', 'v', '4', ['the', 'man', 'meets', 'women'], 'the man meets---4 women', 2) ++ ('woman', 'n', '5', ['the', 'man', 'meets', 'women'], 'the man meets women---5', 3) ++ ('woman', 'n', '6', ['the', 'man', 'meets', 'women'], 'the man meets women---6', 3) ++ ++ :param list sentence_tokens: see above ++ :param list sentence_lemmas: see above ++ :param list sentence_pos: see above ++ :param list annotations: see above ++ ++ :rtype: generator ++ :return: generator of (target_lemma, ++ target_pos, ++ token_annotation, ++ sentence_tokens, ++ training_example, ++ target_index) ++ """ ++ for target_index, token_annotations in enumerate(annotations): ++ ++ target_lemma = sentence_lemmas[target_index] ++ target_pos = sentence_pos[target_index] ++ ++ for token_annotation in token_annotations: ++ ++ if token_annotation is None: ++ continue ++ ++ a_sentence = [] ++ for index, token in enumerate(sentence_tokens): ++ ++ if index == target_index: ++ a_sentence.append(token + '---' + token_annotation) ++ else: ++ a_sentence.append(token) ++ ++ training_example = ' '.join(a_sentence) ++ ++ yield (target_lemma, ++ target_pos, ++ token_annotation, ++ sentence_tokens, ++ training_example, ++ target_index) ++ + def load_lemma_pos2offsets(path_to_index_sense): + ''' + given with index.sense from wordnet distributions such as +@@ -157,8 +365,11 @@ def synsets_graph_info(wn_instance, wn_version, lemma, pos): + synsets = set(synsets) + + if len(synsets) == 1: +- target_sy_iden = synset2identifier(synsets.pop(), wn_version) ++ sy_obj = synsets.pop() ++ target_sy_iden = synset2identifier(sy_obj, wn_version) + sy_id2under_lcs_info[target_sy_iden] = {'under_lcs': None, ++ 'under_lcs_obj': None, ++ 'sy_obj' : sy_obj, + 'path_to_under_lcs': []} + return sy_id2under_lcs_info + +@@ -199,6 +410,28 @@ def synsets_graph_info(wn_instance, wn_version, lemma, pos): + for synset in path_to_under_lcs] + + sy_id2under_lcs_info[target_sy_iden] = {'under_lcs': under_lcs_iden, ++ 'under_lcs_obj': under_lcs, ++ 'sy_obj' : sy1, + 'path_to_under_lcs': path_to_under_lcs_idens} + + return sy_id2under_lcs_info ++ ++ ++def get_synset2sensekeys(wn, target_lemma, pos): ++ """ ++ ++ :param str target_lemma: e.g. 
cat ++ :param str pos: n v a r ++ ++ :rtype: dict ++ :return: mapping from synset identifier -> sensekey ++ ++ """ ++ synset2sensekeys = dict() ++ for synset in wn.synsets(target_lemma, pos): ++ sy_id = synset2identifier(synset, '30') ++ for lemma in synset.lemmas(): ++ if lemma.key().startswith(target_lemma + '%'): ++ synset2sensekeys[sy_id] = lemma.key() ++ ++ return synset2sensekeys +diff --git a/test-lstm.py b/test-lstm.py +index 80d9033..85ab1c5 100644 +--- a/test-lstm.py ++++ b/test-lstm.py +@@ -3,17 +3,20 @@ import tensorflow as tf + from collections import defaultdict + import argparse + import pickle ++from datetime import datetime + + parser = argparse.ArgumentParser(description='Trains meaning embeddings based on precomputed LSTM model') + parser.add_argument('-m', dest='model_path', required=True, help='path to model trained LSTM model') +-# model_path = '/var/scratch/mcpostma/wsd-dynamic-sense-vector/output/lstm-wsd-small' ++# model_path = 'output/lstm-wsd-small' + parser.add_argument('-v', dest='vocab_path', required=True, help='path to LSTM vocabulary') +-#vocab_path = '/var/scratch/mcpostma/wsd-dynamic-sense-vector/output/gigaword.1m-sents-lstm-wsd.index.pkl' ++#vocab_path = 'gigaword.1m-sents-lstm-wsd.index.pkl' + parser.add_argument('-i', dest='input_path', required=True, help='input path with sense annotated sentences') + parser.add_argument('-o',dest='output_path', required=True, help='path where sense embeddings will be stored') + parser.add_argument('-t', dest='max_lines', required=True, help='maximum number of lines you want to train on') + args = parser.parse_args() + ++print('loaded arguments for training meaning embeddings') ++ + def ctx_embd_input(sentence): + """ + given a annotated sentence, return +@@ -39,21 +42,31 @@ def ctx_embd_input(sentence): + return tokens, annotation_indices + + vocab = np.load(args.vocab_path) ++print('loaded vocab') ++ + synset2context_embds = defaultdict(list) +- ++meaning_freqs = defaultdict(int) ++ + with tf.Session() as sess: # your session object + saver = tf.train.import_meta_graph(args.model_path + '.meta', clear_devices=True) + saver.restore(sess, args.model_path) + predicted_context_embs = sess.graph.get_tensor_by_name('Model/predicted_context_embs:0') +- x = sess.graph.get_tensor_by_name('Model/x:0') ++ x = sess.graph.get_tensor_by_name('Model/Placeholder:0') + + with open(args.input_path) as infile: + for counter, line in enumerate(infile): + if counter >= int(args.max_lines): + break ++ if counter % 1000 == 0: ++ print(counter, datetime.now()) + sentence = line.strip() + tokens, annotation_indices = ctx_embd_input(sentence) + for index, synset_id in annotation_indices: ++ ++ #if '_' in synset_id: ++ # base_synset, synset_id = synset_id.split('_') ++ ++ meaning_freqs[synset_id] += 1 + target_id = vocab[''] + sentence_as_ids = [vocab.get(w) or vocab[''] for w in tokens] + sentence_as_ids[index] = target_id +@@ -68,3 +81,6 @@ for synset, embeddings in synset2context_embds.items(): + + with open(args.output_path, 'wb') as outfile: + pickle.dump(synset2avg_embedding, outfile) ++ ++with open(args.output_path + '.freq', 'wb') as outfile: ++ pickle.dump(meaning_freqs, outfile) + + + + + + + + +diff --git a/das5/train-lstm-wsd-full-data-google-model.job b/das5/train-lstm-wsd-full-data-google-model.job +new file mode 100755 +index 0000000..3c35b70 +--- /dev/null ++++ b/das5/train-lstm-wsd-full-data-google-model.job +@@ -0,0 +1,17 @@ ++#!/bin/bash ++#SBATCH --time=72:00:00 ++#SBATCH -C TitanX ++#SBATCH --gres=gpu:1 ++ ++module load 
cuda80/toolkit ++module load cuda80/blas ++module load cuda80 ++module load cuDNN ++ ++echo -n 'Started: ' && date ++ ++python3 -u train-lstm-wsd.py --model google \ ++ --data_path output/gigaword-lstm-wsd \ ++ --save_path output/lstm-wsd-gigaword-google ++ ++echo -n 'Finished: ' && date +diff --git a/das5/train-lstm-wsd-full-data-large-model.job b/das5/train-lstm-wsd-full-data-large-model.job +new file mode 100755 +index 0000000..aca457d +--- /dev/null ++++ b/das5/train-lstm-wsd-full-data-large-model.job +@@ -0,0 +1,18 @@ ++#!/bin/bash ++#SBATCH --time=72:00:00 ++#SBATCH -C TitanX ++#SBATCH --gres=gpu:1 ++ ++module load cuda80/toolkit ++module load cuda80/blas ++module load cuda80 ++module load cuDNN ++ ++echo -n 'Started: ' && date ++ ++python3 -u train-lstm-wsd.py --model large \ ++ --data_path output/gigaword-lstm-wsd \ ++ --save_path output/lstm-wsd-gigaword-large ++ ++echo -n 'Finished: ' && date ++ +\ No newline at end of file + + + + + +diff --git a/train-lstm-wsd-full-data-google-model.job b/train-lstm-wsd-full-data-google-model.job +deleted file mode 100755 +index 3c35b70..0000000 +--- a/train-lstm-wsd-full-data-google-model.job ++++ /dev/null +@@ -1,17 +0,0 @@ +-#!/bin/bash +-#SBATCH --time=72:00:00 +-#SBATCH -C TitanX +-#SBATCH --gres=gpu:1 +- +-module load cuda80/toolkit +-module load cuda80/blas +-module load cuda80 +-module load cuDNN +- +-echo -n 'Started: ' && date +- +-python3 -u train-lstm-wsd.py --model google \ +- --data_path output/gigaword-lstm-wsd \ +- --save_path output/lstm-wsd-gigaword-google +- +-echo -n 'Finished: ' && date +diff --git a/train-lstm-wsd-full-data-large-model.job b/train-lstm-wsd-full-data-large-model.job +deleted file mode 100755 +index aca457d..0000000 +--- a/train-lstm-wsd-full-data-large-model.job ++++ /dev/null +@@ -1,18 +0,0 @@ +-#!/bin/bash +-#SBATCH --time=72:00:00 +-#SBATCH -C TitanX +-#SBATCH --gres=gpu:1 +- +-module load cuda80/toolkit +-module load cuda80/blas +-module load cuda80 +-module load cuDNN +- +-echo -n 'Started: ' && date +- +-python3 -u train-lstm-wsd.py --model large \ +- --data_path output/gigaword-lstm-wsd \ +- --save_path output/lstm-wsd-gigaword-large +- +-echo -n 'Finished: ' && date +- +\ No newline at end of file +diff --git a/train-lstm-wsd.py b/train-lstm-wsd.py +index f6871ee..6131dc9 100644 +--- a/train-lstm-wsd.py ++++ b/train-lstm-wsd.py +@@ -14,149 +14,47 @@ import numpy as np + import tensorflow as tf + from tensorflow.python.client import timeline + import sys +-from model import WSDModelTrain, WSDModelEvaluate, DummyModelTrain ++from model import WSDModel, train_model ++from configs import get_config ++import random + + flags = tf.flags + logging = tf.logging + ++flags.DEFINE_integer("seed", 192, ++ "A random seed to make sure the experiment is repeatable") + flags.DEFINE_string("model", "small", +- "A type of model. Possible options are: small, medium, large, google.") ++ "A type of model. 
Possible options are: small, medium, large, google.") + flags.DEFINE_string("data_path", None, +- "Where the training/test data is stored.") ++ "Where the training/valid data is stored.") ++flags.DEFINE_string("dev_path", '', ++ "Where the valid data is stored, if it cannot be inferred from data_path.") ++flags.DEFINE_string("vocab_path", '', ++ "Where the vocabulary is stored, if it cannot be inferred from data_path.") + flags.DEFINE_string("save_path", None, + "Model output directory.") +-flags.DEFINE_bool("use_fp16", False, +- "Train using 16-bit floats instead of 32bit floats") + flags.DEFINE_bool("trace_timeline", False, + "Trace execution time to find out bottlenecks.") + FLAGS = flags.FLAGS + +- +-def data_type(): +- return tf.float16 if FLAGS.use_fp16 else tf.float32 +- +- +-class SmallConfig(object): +- """Small config.""" +- init_scale = 0.1 +- learning_rate = 0.1 +- max_grad_norm = 5 +- hidden_size = 100 +- max_epoch = 100 +- emb_dims = 10 +- +- +-class MediumConfig(object): +- """Medium config.""" +- init_scale = 0.05 +- learning_rate = 0.1 +- max_grad_norm = 5 +- hidden_size = 200 +- max_epoch = 500 +- emb_dims = 100 +- +- +-class LargeConfig(object): +- """Large config.""" +- init_scale = 0.04 +- learning_rate = 0.1 +- max_grad_norm = 10 +- hidden_size = 512 +- max_epoch = 1000 +- emb_dims = 128 +- +- +-class GoogleConfig(object): +- """Large config.""" +- init_scale = 0.04 +- learning_rate = 0.1 +- max_grad_norm = 5 +- hidden_size = 2048 +- max_epoch = 2000 +- emb_dims = 512 +- +- +-class TestConfig(object): +- """Tiny config, for testing.""" +- init_scale = 0.1 +- learning_rate = 0.1 +- max_grad_norm = 1 +- hidden_size = 2 +- max_epoch = 1 +- batch_size = 20 +- +-def get_config(): +- if FLAGS.model == "small": +- return SmallConfig() +- elif FLAGS.model == "medium": +- return MediumConfig() +- elif FLAGS.model == "large": +- return LargeConfig() +- elif FLAGS.model == "google": +- return GoogleConfig() +- elif FLAGS.model == "test": +- return TestConfig() +- else: +- raise ValueError("Invalid model: %s", FLAGS.model) +- +-def load_data(): +- sys.stderr.write('Loading data...\n') +- full_vocab = np.load(FLAGS.data_path + '.index.pkl') +- train = np.load(FLAGS.data_path + '.train.npz') +- train_batches = [] +- num_batches = len(train.keys()) +- for i in range(num_batches): +- sentences = train['batch%d' %i] +- batch_vocab, inverse = np.unique(sentences, return_inverse=True) +- outputs = inverse.reshape(sentences.shape) +- sys.stderr.write('Batch %d of %d vocab size: %d (%.2f%% of original)\n' +- %(i, num_batches, batch_vocab.size, batch_vocab.size*100.0/len(full_vocab))) +- train_batches.append((sentences, outputs, batch_vocab)) +- dev = np.load(FLAGS.data_path + '.dev.npz') +- sys.stderr.write('Loading data... Done.\n') +- return full_vocab, train_batches, dev['data'], dev['lens'] +- + def main(_): ++ random.seed(FLAGS.seed) ++ np.random.seed(random.randint(0, 10**6)) ++ tf.set_random_seed(random.randint(0, 10**6)) + if not FLAGS.data_path: + raise ValueError("Must set --data_path to the base path of " + "prepared input (e.g. 
output/gigaword)") +- vocab, train_batches, dev_data, dev_lens = load_data() +- target_id = vocab[''] +- config = get_config() +- config.vocab_size = len(vocab) ++ config = get_config(FLAGS) + with tf.Graph().as_default(): + initializer = tf.random_uniform_initializer(-config.init_scale, + config.init_scale) + with tf.variable_scope("Model", reuse=None, initializer=initializer): +- m_train = WSDModelTrain(config, data_type()) +- with tf.variable_scope("Model", reuse=True, initializer=initializer): +- m_evaluate = WSDModelEvaluate(config, data_type()) +- m_train.print_device_placement() +- with tf.Session() as session: +- saver = tf.train.Saver() +- start_time = time.time() +- sys.stdout.write("Initializing variables.... ") +- session.run(tf.global_variables_initializer()) +- sys.stdout.write("Done.\n") +- best_cost = None +- for i in range(config.max_epoch): +- # only turn it on after 5 epochs because first epochs spend time +- # on GPU initialization routines +- if FLAGS.trace_timeline and i == 5: +- m_train.trace_timeline() # start tracing timeline +- print("Epoch #%d:" % (i + 1)) +-# train_cost = 0 # for debugging +- train_cost = m_train.train_epoch(session, train_batches, target_id, verbose=True) +- dev_cost, hit_at_100 = m_evaluate.measure_dev_cost(session, dev_data, dev_lens, target_id) +- print("Epoch #%d finished:" %(i + 1)) +- print("\tTrain cost: %.3f" %train_cost) +- print("\tDev cost: %.3f, hit@100: %.1f%%" %(dev_cost, hit_at_100)) +- if best_cost is None or dev_cost < best_cost: +- best_cost = dev_cost +-# save_start = time.time() +- print("\tSaved best model to %s" %saver.save(session, FLAGS.save_path)) +-# print("\tTime on saving: %f sec" %(time.time()-save_start)) +- print("\tElapsed time: %.1f minutes" %((time.time()-start_time)/60)) ++ m_train = WSDModel(config, optimized=True) ++ with tf.variable_scope("Model", reuse=True): ++ m_evaluate = WSDModel(config, reuse_variables=True) ++# m_train.print_device_placement() # for debugging ++ train_model(m_train, m_evaluate, FLAGS, config) ++ + if FLAGS.trace_timeline: + tl = timeline.Timeline(m_train.run_metadata.step_stats) + ctf = tl.generate_chrome_trace_format() \ No newline at end of file diff --git a/prepare-lstm-wsd.py b/prepare-lstm-wsd.py index 52db01c..2bb3fe6 100644 --- a/prepare-lstm-wsd.py +++ b/prepare-lstm-wsd.py @@ -24,7 +24,7 @@ from random import Random from collections import Counter from utils import progress, count_lines_fast -from configs import preprocessed_gigaword_path, output_dir +from configs import output_dir from version import version dev_sents = 20000 # absolute maximum @@ -34,12 +34,7 @@ vocab_size = 10**6 min_count = 5 -inp_path = preprocessed_gigaword_path -# inp_path = 'preprocessed-data/gigaword_1m-sents.txt' # for debugging -out_dir = os.path.join('preprocessed-data', version) -out_path = os.path.join(out_dir, 'gigaword-for-lstm-wsd') - -special_symbols = ['', '', ''] +special_symbols = ['', '', '', ''] def _build_vocab(filename): sys.stderr.write('Building vocabulary...\n') @@ -76,16 +71,17 @@ def lookup_and_iter_sents(filename, word2id, include_ids=None, exclude_ids=None) words = line.strip().split() yield [word2id.get(word) or unkn_id for word in words] -def pad(sents, max_len, pad_id): - arr = np.empty((len(sents), max_len), dtype=np.int32) +def pad(sents, max_len, pad_id, eos_id): + arr = np.empty((len(sents), max_len+1), dtype=np.int32) arr.fill(pad_id) for i, s in enumerate(sents): arr[i, :len(s)] = s + arr[i, len(s)] = eos_id return arr def pad_batches(inp_path, word2id, include_ids, 
exclude_ids, max_sents=-1): sys.stderr.write('Dividing and padding...\n') - pad_id = word2id[''] + eos_id, pad_id = word2id[''], word2id[''] batches = {} sent_lens = [] curr_max_len = 0 @@ -95,7 +91,7 @@ def pad_batches(inp_path, word2id, include_ids, exclude_ids, max_sents=-1): include_ids, exclude_ids)): new_size = (len(curr_batch)+1) * max(curr_max_len,len(sent)) if new_size > batch_size or (max_sents > 0 and len(curr_batch) >= max_sents): - batches['batch%d' %batch_id] = pad(curr_batch, curr_max_len, pad_id) + batches['batch%d' %batch_id] = pad(curr_batch, curr_max_len, pad_id, eos_id) batches['lens%d' %batch_id] = np.array([len(s) for s in curr_batch], dtype=np.int32) batch_id += 1 curr_max_len = 0 @@ -104,7 +100,7 @@ def pad_batches(inp_path, word2id, include_ids, exclude_ids, max_sents=-1): curr_batch.append(sent) sent_lens.append(len(sent)) if curr_batch: - batches['batch%d' %batch_id] = pad(curr_batch, curr_max_len, pad_id) + batches['batch%d' %batch_id] = pad(curr_batch, curr_max_len, pad_id, eos_id) batches['lens%d' %batch_id] = np.array([len(s) for s in curr_batch], dtype=np.int32) batch_id += 1 # important to count num batches correctly sent_lens = np.array(sent_lens, dtype=np.int32) @@ -150,7 +146,7 @@ def shuffle_and_pad_batches(inp_path, word2id, dev_sent_ids): new_size = (len(curr_batch_lens)+1) * max(curr_max_len,l) if new_size >= batch_size: batches['batch%d' %batch_id] = \ - np.empty((len(curr_batch_lens), max(curr_batch_lens)), dtype=np.int32) + np.empty((len(curr_batch_lens), max(curr_batch_lens)+1), dtype=np.int32) batches['lens%d' %batch_id] = np.array(curr_batch_lens, dtype=np.int32) batch_id += 1 curr_max_len = 0 @@ -160,13 +156,13 @@ def shuffle_and_pad_batches(inp_path, word2id, dev_sent_ids): sent2batch[sent_id] = 'batch%d' %batch_id if curr_batch_lens: batches['batch%d' %batch_id] = \ - np.empty((len(curr_batch_lens), max(curr_batch_lens)), dtype=np.int32) + np.empty((len(curr_batch_lens), max(curr_batch_lens)+1), dtype=np.int32) batches['lens%d' %batch_id] = np.array(curr_batch_lens, dtype=np.int32) batch_id += 1 # important to count num batches correctly sys.stderr.write('Calculating batch shapes... Done.\n') sys.stderr.write('Dividing and padding...\n') - pad_id = word2id[''] + eos_id, pad_id = word2id[''], word2id[''] for i in range(batch_id): batches['batch%d'%i].fill(pad_id) nonpad_count = 0 sent_counter = Counter() @@ -174,7 +170,8 @@ def shuffle_and_pad_batches(inp_path, word2id, dev_sent_ids): assert lens[sent_id] == len(sent) batch_name = sent2batch.get(sent_id) if batch_name is not None: # could be in dev set - batches[batch_name][sent_counter[batch_name],:len(sent)] = sent + batches[batch_name][sent_counter[batch_name], :len(sent)] = sent + batches[batch_name][sent_counter[batch_name], len(sent)] = eos_id nonpad_count += len(sent) sent_counter[batch_name] += 1 # check that we filled all arrays @@ -196,8 +193,7 @@ def shuffle_and_pad_batches(inp_path, word2id, dev_sent_ids): %(lens.mean(), lens.std())) return batches -def run(): - os.makedirs(out_dir, exist_ok=True) +def run(inp_path, out_path): index_path = out_path + '.index.pkl' if os.path.exists(index_path): sys.stderr.write('Reading vocabulary from %s... 
' %index_path) @@ -247,4 +243,9 @@ def run(): np.savez(pc_train_path, **batches) if __name__ == '__main__': - run() + inp_path = 'preprocessed-data/694cb4d/gigaword.txt' + #inp_path = 'preprocessed-data/694cb4d/gigaword_1m-sents.txt' # for debugging + out_dir = os.path.join('preprocessed-data', version) + os.makedirs(out_dir, exist_ok=True) + out_path = os.path.join(out_dir, 'gigaword-for-lstm-wsd') + run(inp_path, out_path) diff --git a/process-gigaword.py b/process-gigaword.py index 7bf2b0d..10416ff 100644 --- a/process-gigaword.py +++ b/process-gigaword.py @@ -2,7 +2,7 @@ import gzip from bs4 import BeautifulSoup import spacy -from configs import gigaword_path, preprocessed_gigaword_path +from configs import gigaword_path import codecs from utils import progress from version import version @@ -32,7 +32,6 @@ def iter_sents(paragraphs): for sent in doc.sents: yield [str(tok).strip() for tok in sent] - # example_file = 'data/gigaword/gigaword_eng_5_d1/data/afp_eng/afp_eng_200112.gz' if __name__ == '__main__': diff --git a/stats_gigaword.py b/stats_gigaword.py index 193b678..a3b7c49 100644 --- a/stats_gigaword.py +++ b/stats_gigaword.py @@ -1,4 +1,4 @@ -from configs import preprocessed_gigaword_path, output_dir +from configs import output_dir from collections import Counter from nltk.stem import WordNetLemmatizer import codecs @@ -9,6 +9,7 @@ token_count = Counter() lemma_count = Counter() wordnet_lemmatizer = WordNetLemmatizer() + preprocessed_gigaword_path = 'preprocessed-data/694cb4d/gigaword.txt' with codecs.open(preprocessed_gigaword_path, 'r', 'utf-8') as f: for line_no, line in enumerate(f): for tok in line.split():
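
A note on the prepare-lstm-wsd.py hunks above: `pad` and `shuffle_and_pad_batches` now allocate one extra column per batch row (`max_len + 1` and `max(curr_batch_lens) + 1`) so that an explicit end-of-sentence id (`eos_id`) is written immediately after the last real token, with the padding id filling the remainder. The sketch below reproduces that padding behaviour in isolation; the function name and the numeric ids are illustrative only (the real ids come from the generated `*.index.pkl` vocabulary), so treat it as a minimal sketch rather than the pipeline code itself.

```python
import numpy as np

def pad_with_eos(sents, max_len, pad_id, eos_id):
    """Pad a list of id-sequences into a (len(sents), max_len + 1) int32 array,
    writing eos_id right after each sentence and pad_id everywhere else."""
    arr = np.empty((len(sents), max_len + 1), dtype=np.int32)
    arr.fill(pad_id)
    for i, s in enumerate(sents):
        arr[i, :len(s)] = s          # the sentence itself
        arr[i, len(s)] = eos_id      # end-of-sentence id marks the true sentence end
    return arr

if __name__ == '__main__':
    PAD, EOS = 0, 1                  # hypothetical ids for the pad and eos symbols
    batch = pad_with_eos([[7, 8, 9], [5, 6]], max_len=3, pad_id=PAD, eos_id=EOS)
    print(batch)
    # [[7 8 9 1]
    #  [5 6 1 0]]
```

Reserving the extra column up front is what the `+1` in the `np.empty(...)` shapes buys: a sentence that exactly fills the old batch width still has room for its end-of-sentence marker, so every padded row is guaranteed to contain one.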