From 36b62464fb96b75745a52ed6e0cb19cc9645b427 Mon Sep 17 00:00:00 2001
From: Minh Le
Date: Fri, 19 Jan 2018 23:31:27 +0100
Subject: [PATCH] add to prepare-lstm-wsd

---
 README.md                 |   27 +-
 compile_results.py        |   31 +-
 configs.py                |    1 -
 das5/prepare-lstm-wsd.job |    4 +
 das5/process-gigaword.job |    1 -
 diary-minh2.md            |   37 +-
 difference-edited.txt     | 1719 +++++++++++++++++++++++++++++++++++++
 prepare-lstm-wsd.py       |   39 +-
 process-gigaword.py       |    3 +-
 stats_gigaword.py         |    3 +-
 10 files changed, 1824 insertions(+), 41 deletions(-)
 create mode 100755 das5/prepare-lstm-wsd.job
 mode change 100644 => 100755 das5/process-gigaword.job
 create mode 100644 difference-edited.txt

diff --git a/README.md b/README.md
index c5864cd..17b2fb3 100644
--- a/README.md
+++ b/README.md
@@ -104,11 +104,24 @@ This creates a development set for the label propagation:
 a) annotated corpus: pwgc
 b) unannotated corpus: omsti
 
-#### Reproduce variation experiment
+#### Model size experiments
 
-0. `git checkout a453bc1`
-1. Pre-process GigaWord into plain text: `sbatch cartesius/process-gigaword.job`
-2. More preprocessing to make binary files: `sbatch cartesius/prepare-lstm-wsd.job`
+Note that there was some uncertainty about the exact version that produced h2048p512
+and h512p128; see `difference-edited.txt` for a comparison with a recent version.
+
+1. h=2048, p=512: `git checkout 354acc1cfdd542142490afe40447cb6f40d2fd7c && ./train-lstm-wsd-full-data-google-model.job`
+2. h=512, p=128: `git checkout 354acc1cfdd542142490afe40447cb6f40d2fd7c && ./train-lstm-wsd-full-data-large-model.job`
+3. h=256, p=64: see `exp-h256p64.sh` in the "stability" section
+4. h=100, p=10: see `exp-variation*.job` in the "stability" section
+
+#### Reproduce variation/stability experiments
+
+These experiments measure how much performance is affected by the randomness
+in training: we train smaller models many times, each time with
+a different (but fixed) random seed.
+
+1. Pre-process GigaWord into plain text: `git checkout 2b0934c && sbatch cartesius/process-gigaword.job`
+2. More preprocessing to make binary files: `git checkout a453bc1 && sbatch cartesius/prepare-lstm-wsd.job`
 0. `git checkout ce8a024`
 1. Run at the same time: `sbatch cartesius/exp-variation1.job` and `sbatch cartesius/exp-variation2.job`
 0. `git checkout a74bda6`
@@ -118,18 +131,18 @@ b) unannotated corpus: omsti
 2. When everything finishes, do `git checkout 42bc700`
 3. Run `sbatch cartesius/exp-variation-score.job`
 
-#### Reproduce optimization experiment
+#### Reproduce (training speed) optimization experiment
 
+1. Pre-process GigaWord into plain text (if you haven't done so): `git checkout 2b0934c && sbatch cartesius/process-gigaword.job`
 0. `git checkout a74bda6`
-1. Pre-process GigaWord into plain text (if you haven't done so): `sbatch cartesius/process-gigaword.job`
 2. More preprocessing to make binary files: `sbatch cartesius/prepare-lstm-wsd.job`
 3. `git checkout e93fdb2`
 4. Run in parallel: `sbatch cartesius/exp-optimization{i}.job` where i=1,2,3,4
 
 #### Data size experiment
 
+1. Pre-process GigaWord into plain text (if you haven't done so): `git checkout 2b0934c && sbatch cartesius/process-gigaword.job`
 0. `git checkout a74bda6`
-1. Pre-process GigaWord into plain text (if you haven't done so): `sbatch cartesius/process-gigaword.job`
 2. More preprocessing to make binary files: `sbatch cartesius/prepare-lstm-wsd.job`
 3. `git checkout 4e4a04a`
 4. 
Run `sbatch cartesius/exp-data-size.job {i}` with i="01",10,25,50,75 diff --git a/compile_results.py b/compile_results.py index 71bcd95..528b9c3 100644 --- a/compile_results.py +++ b/compile_results.py @@ -11,6 +11,8 @@ import matplotlib.pyplot as plt from sklearn.linear_model.base import LinearRegression import configs +from configs import SmallConfig, H256P64, LargeConfig, GoogleConfig,\ + DefaultConfig ModelPerformance = namedtuple('ModelPerformance', ['name', 'semcor', 'mun']) @@ -118,17 +120,20 @@ def draw_data_size_vs_performance_chart(): print('Extrapolated data size:') print(lr.predict([[0.75], [0.8]])) +def compute_num_params(vocab_size, p, h): + return (vocab_size*p*2 + # input and output embeddings + p*h + h*h + h + # input gates + p*h + h*h + h + # candidate states + p*h + h*h + h + # forget gates + p*h + h*h + h*h + h + # output gates + p*h # context layer + ) + def draw_capacity_vs_performance_chart(): ''' Create figure for paper ''' df = pd.read_csv('output/capacity_vs_performance.csv') vocab_size = configs.DefaultConfig.vocab_size - df['num_params'] = (vocab_size*df['p']*2 + # input and output embeddings - df['p']*df['h'] + df['h']*df['h'] + df['h'] + # input gates - df['p']*df['h'] + df['h']*df['h'] + df['h'] + # candidate states - df['p']*df['h'] + df['h']*df['h'] + df['h'] + # forget gates - df['p']*df['h'] + df['h']*df['h'] + df['h']*df['h'] + df['h'] + # output gates - df['p']*df['h'] # context layer - ) + df['num_params'] = compute_num_params(vocab_size, df['p'], df['h']) print(df) with PdfPages('output/capacity_vs_performance.pdf') as pdf: semcor_handle, = plt.plot(df['num_params'], df['semcor'], label='SemEval13 (T: SemCor)') @@ -149,8 +154,18 @@ def draw_capacity_vs_performance_chart(): # print('Extrapolated data size:') # print(lr.predict([[0.75], [0.8]])) +def report_model_params(): + v = DefaultConfig.vocab_size + models = [SmallConfig, H256P64, LargeConfig, GoogleConfig] + table = [['%.0fM' %(v/10**6), m.emb_dims, m.hidden_size, + "%.0fM" %(compute_num_params(v, m.emb_dims, m.hidden_size)/10**6)] + for m in models] + df = pd.DataFrame(table, columns=['Vocab.', 'p', 'h', '#params']) + print(df.to_latex(index=False)) + if __name__ == '__main__': # report_wsd_performance_vs_data_size() # variation_experiment() # draw_data_size_vs_performance_chart() - draw_capacity_vs_performance_chart() \ No newline at end of file +# draw_capacity_vs_performance_chart() + report_model_params() \ No newline at end of file diff --git a/configs.py b/configs.py index fecd147..89cd7cf 100644 --- a/configs.py +++ b/configs.py @@ -5,7 +5,6 @@ os.makedirs(output_dir, exist_ok=True) gigaword_path = 'data/gigaword' -preprocessed_gigaword_path = os.path.join('preprocessed-data', 'gigaword.txt') class DefaultConfig(object): vocab_size = 10**6 + 3 diff --git a/das5/prepare-lstm-wsd.job b/das5/prepare-lstm-wsd.job new file mode 100755 index 0000000..0b21127 --- /dev/null +++ b/das5/prepare-lstm-wsd.job @@ -0,0 +1,4 @@ +#!/bin/bash +#SBATCH --time=24:00:00 + +python3 -u prepare-lstm-wsd.py \ No newline at end of file diff --git a/das5/process-gigaword.job b/das5/process-gigaword.job old mode 100644 new mode 100755 index 8827673..b434649 --- a/das5/process-gigaword.job +++ b/das5/process-gigaword.job @@ -2,4 +2,3 @@ #SBATCH --time=24:00:00 python3 -u process-gigaword.py -python3 -u prepare-lstm-wsd.py \ No newline at end of file diff --git a/diary-minh2.md b/diary-minh2.md index 2456893..a097ede 100644 --- a/diary-minh2.md +++ b/diary-minh2.md @@ -260,6 +260,39 @@ job: [minhle@gcn40 
wsd-dynamic-sense-vector]$ tail -f output/`python3 version.py`/exp-variation-score.job.out
     ...
+## Thu 7 Dec
+
+Worked on the paper. The 25% data-size experiment has finished. Tried to run the
+newest evaluation script on it but no GPU machine is available yet.
+
+    42bc700..0a0d02b  master -> origin/master
+    First, rewinding head to replay your work on top of it...
+    Fast-forwarded master to 0a0d02b4538dcf7322742e32e367a90ec1055899.
+    [minhle@int2 wsd-dynamic-sense-vector]$ sbatch cartesius/eval-data-size.job
+    Submitted batch job 3820439
+
+## Fri 19 Dec
+
+Meeting with Jacopo+Marten. Jacopo would like to retrain everything with
+token. Checked everything again. There doesn't seem to be any big difference (that
+I don't know of) between the version that produced the currently reported results
+and a more recent version. Let's try.
+
+Added `` to the preparation script.
+
+I'll also need to add it to the evaluation scripts.
+
+    >>> from collections import Counter
+    >>> c = Counter()
+    >>> with open('preprocessed-data/694cb4d/gigaword.txt') as f:
+            for sent in f:
+                c[sent.strip().split()[-1]] += 1
+    >>> c.most_common(10)
+    [('.', 141537114), ("''", 7066432), ('"', 7015844), (')', 2214057), ('_', 1964897), (':', 1605763), ('?', 1486728), ('--', 774285), ("'", 648803), ('...', 434971)]
+    >>> total = sum(c.values())
+    >>> [(tok, cnt/total) for tok, cnt in c.most_common(10)]
+    [('.', 0.8052320716307731), ("''", 0.04020230113211145), ('"', 0.039914496196088396), (')', 0.012596199360251295), ('_', 0.01117867983270516), (':', 0.00913549690604858), ('?', 0.008458283721904037), ('--', 0.004405057422483782), ("'", 0.00369116600590189), ('...', 0.002474634316970099)]
+
 
 TODO: docker image
@@ -271,8 +304,8 @@ TODO: docker image
 5. [x] for 25 Oct: list of all experiments for the reproduction paper
 6. [x] save models of every epoch (instead of only the best one)
 6. [x] Read more about label propagation (Zhou et al. 2004)
-7. [ ] Hyperparameter tuning of label propagation
+7. [x] Hyperparameter tuning of label propagation
 8. [ ] Training creates a lot of models, how to reduce it?
 9. [ ] Send code+data to Jacopo to run
-10. [ ] Polish the paper
+10. [x] Polish the arXiv paper
 11. [x] Use the same dev set for different sizes of the data.
\ No newline at end of file
diff --git a/difference-edited.txt b/difference-edited.txt
new file mode 100644
index 0000000..d39f15b
--- /dev/null
+++ b/difference-edited.txt
@@ -0,0 +1,1719 @@
+************************************************************************************
+
+This is an ***edited*** patch file that compares revision 354acc1cfdd542142490afe40447cb6f40d2fd7c (Jul 6, 2017),
+which produced our first h=2048,p=512 (code name "google") model, to a more recent
+revision, 3a24bb0560b41e435bae5215c5c5556d5542134f (Dec 6, 2017).
+
+The purpose is to identify any difference that could affect performance, should it
+go down.
+
+************************************************************************************
+
+
+
+
+
+
+
+diff --git a/model.py b/model.py
+index 85f627a..b751d0c 100644
+--- a/model.py
++++ b/model.py
+@@ -1,15 +1,19 @@
+ import numpy as np
+ import tensorflow as tf
++import time
++import sys
++
++float_dtype = tf.float32
+ 
+ class DummyModelTrain(object):
+     '''
+     This is for testing GPU usage only. This model runs very trivial operations
+-    on GPU therefore its running time is mostly on CPU. Compared to WSDModelTrain,
++    on GPU therefore its running time is mostly on CPU. 
Compared to WSDModel, + this model should run much faster, otherwise you're spending too much time + on CPU. + ''' + +- def __init__(self, config, float_dtype): ++ def __init__(self, config): + self._x = tf.placeholder(tf.int32, shape=[None, None], name='x') + self._y = tf.placeholder(tf.int32, shape=[None], name='y') + self._subvocab = tf.placeholder(tf.int32, shape=[None], name='subvocab') +@@ -39,42 +43,77 @@ class DummyModelTrain(object): + def print_device_placement(self): + pass + +-class WSDModelTrain(object): ++class WSDModel(object): + """A LSTM WSD model designed for fast training.""" + ++ def _build_inputs(self): ++ # the names are for later reference when the model is loaded ++ # they might be used or not, doesn't hurt ++ self._lens = tf.placeholder(tf.int32, shape=[None], name='lens') + ++ def _build_word_embeddings(self): + E_words = tf.get_variable("word_embedding", +- [config.vocab_size, config.emb_dims], dtype=float_dtype) +- outputs, _ = tf.nn.dynamic_rnn(cell, word_embs, dtype=float_dtype) ++ [self.config.vocab_size, self.config.emb_dims], dtype=float_dtype) ++ ++ def _build_lstm_output(self): ++ if self.optimized and self.config.assume_same_lengths: ++ outputs, _ = tf.nn.dynamic_rnn(cell, self._word_embs, ++ dtype=float_dtype) ++ self._lstm_output = outputs[:,-1] ++ else: ++ outputs, _ = tf.nn.dynamic_rnn(cell, self._word_embs, ++ sequence_length=self._lens, ++ dtype=float_dtype) ++ last_output_indices = tf.stack([tf.range(tf.shape(self._x)[0]), self._lens-1], axis=1) ++ self._lstm_output = tf.gather_nd(outputs, last_output_indices) ++ self._initial_state = cell.zero_state(tf.shape(self._x)[0], float_dtype) ++ ++ def _build_context_embs(self): + context_layer_weights = tf.get_variable("context_layer_weights", +- [config.hidden_size, config.emb_dims], dtype=float_dtype) +- self._predicted_context_embs = tf.matmul(outputs[:,-1], context_layer_weights, ++ [self.config.hidden_size, self.config.emb_dims], dtype=float_dtype) ++ self._predicted_context_embs = tf.matmul(self._lstm_output, context_layer_weights, + name='predicted_context_embs') ++ ++ def _build_logits(self): + E_contexts = tf.get_variable("context_embedding", +- [config.vocab_size, config.emb_dims], dtype=float_dtype) +- subcontexts = tf.nn.embedding_lookup(E_contexts, self._subvocab) +- pre_probs = tf.matmul(self._predicted_context_embs, tf.transpose(subcontexts)) +- ++ [self.config.vocab_size, self.config.emb_dims], dtype=float_dtype) ++ if self.optimized and self.config.sampled_softmax: ++ subcontexts = tf.nn.embedding_lookup(E_contexts, self._subvocab) ++ self._logits = tf.matmul(self._predicted_context_embs, tf.transpose(subcontexts)) ++ else: ++ self._logits = tf.matmul(self._predicted_context_embs, tf.transpose(E_contexts)) ++ ++ def _build_cost(self): + self._cost = tf.reduce_mean( + tf.nn.sparse_softmax_cross_entropy_with_logits( +- logits=pre_probs, labels=self._y)) +- ++ logits=self._logits, labels=self._y)) ++ self._hit_at_100 = tf.reduce_mean(tf.cast( ++ tf.nn.in_top_k(self._logits, self._y, 100), float_dtype)) + tvars = tf.trainable_variables() + grads, _ = tf.clip_by_global_norm(tf.gradients(self._cost, tvars), +- config.max_grad_norm) +- optimizer = tf.train.AdagradOptimizer(config.learning_rate) ++ self.config.max_grad_norm) ++ optimizer = tf.train.AdagradOptimizer(self.config.learning_rate) ++ self._global_step = tf.contrib.framework.get_or_create_global_step() + self._train_op = optimizer.apply_gradients(zip(grads, tvars), +- global_step=tf.contrib.framework.get_or_create_global_step()) +- 
self._initial_state = cell.zero_state(tf.shape(self._x)[0], float_dtype) +- +- self.run_options = self.run_metadata = None ++ global_step=self._global_step) + + def trace_timeline(self): + self.run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) +@@ -87,17 +126,20 @@ class WSDModelTrain(object): + + # resample the batches so that each token has equal chance to become target + # another effect is to randomize the order of batches +- sentence_lens = np.array([x.shape[1] for x, _, _, in data]) +- samples = np.random.choice(len(data), size=len(data), +- p=sentence_lens/sentence_lens.sum()) ++ if self.config.optimized_batches: ++ sentence_lens = np.array([x.shape[1] for x, _, _, _ in data]) ++ samples = np.random.choice(len(data), size=len(data), ++ p=sentence_lens/sentence_lens.sum()) ++ else: ++ samples = np.random.choice(len(data), size=len(data)) + for batch_no, batch_id in enumerate(samples): +- x, y_all, subvocab = data[batch_id] ++ x, y_all, subvocab, lens = data[batch_id] + i = np.random.randint(x.shape[1]) + y = y_all[:,i] +- old_xi = x[:,i].copy() ++ old_xi = x[:,i].copy() # old_xi might be different from y because of subvocab + x[:,i] = target_id + +- feed_dict = {self._x: x, self._y: y, self._subvocab: subvocab} ++ feed_dict = {self._x: x, self._y: y, self._subvocab: subvocab, self._lens: lens} + state = session.run(self._initial_state, feed_dict) + c, h = self._initial_state + feed_dict[c] = state.c +@@ -130,64 +172,18 @@ class WSDModelTrain(object): + sess.run(self._train_op, feed_dict) + print("******** End of device placement ********") + + + + + + +diff --git a/perform_wsd.py b/perform_wsd.py +index 6e14dc4..2a7a027 100644 +--- a/perform_wsd.py ++++ b/perform_wsd.py +@@ -1,21 +1,85 @@ + import numpy as np ++import os + import tensorflow as tf ++import json + import argparse + import pickle + import pandas + from nltk.corpus import wordnet as wn ++from nltk.corpus.reader.wordnet import WordNetCorpusReader + from scipy import spatial ++import morpho_utils ++import tensor_utils as utils + + parser = argparse.ArgumentParser(description='Perform WSD using LSTM model') + parser.add_argument('-m', dest='model_path', required=True, help='path to model trained LSTM model') +-# model_path = '/var/scratch/mcpostma/wsd-dynamic-sense-vector/output/lstm-wsd-small' + parser.add_argument('-v', dest='vocab_path', required=True, help='path to LSTM vocabulary') +-# vocab_path = '/var/scratch/mcpostma/wsd-dynamic-sense-vector/output/gigaword.1m-sents-lstm-wsd.index.pkl' + parser.add_argument('-c', dest='wsd_df_path', required=True, help='input path to dataframe wsd competition') ++parser.add_argument('-l', dest='log_path', required=True, help='path where exp settings are stored') + parser.add_argument('-s', dest='sense_embeddings_path', required=True, help='path where sense embeddings are stored') + parser.add_argument('-o', dest='output_path', required=True, help='path where output wsd will be stored') ++parser.add_argument('-r', dest='results', required=True, help='path where accuracy will be reported') ++parser.add_argument('-g', dest='gran', required=True, help='sensekey | synset') ++parser.add_argument('-f', dest='mfs_fallback', required=True, help='True or False') ++parser.add_argument('-t', dest='path_case_freq', help='path to pickle with case freq') ++parser.add_argument('-a', dest='use_case_strategy', help='set to True to use morphological strategy case') ++parser.add_argument('-p', dest='path_plural_freq', help='path to pickle with plural freq') ++parser.add_argument('-b', 
dest='use_number_strategy', help='set to True to use morphological strategy number') ++parser.add_argument('-y', dest='path_lp', help='path to lp output') ++parser.add_argument('-z', dest='use_lp', help='set to True to use label propagation') ++ ++ + args = parser.parse_args() ++args.mfs_fallback = args.mfs_fallback == 'True' ++case_strategy = args.use_case_strategy == 'True' ++number_strategy = args.use_number_strategy == 'True' ++lp_strategy = args.use_lp == 'True' ++ ++case_freq = pickle.load(open(args.path_case_freq, 'rb')) ++plural_freq = pickle.load(open(args.path_plural_freq, 'rb')) ++lp_info = dict() ++ ++the_wn_version = '30' ++# load relevant wordnet ++if '171' in args.wsd_df_path: ++ the_wn_version = '171' ++ cwd = os.path.dirname(os.path.realpath(__file__)) ++ path_to_wn_dict_folder = os.path.join(cwd, 'scripts', 'wordnets', '171', 'WordNet-1.7.1', 'dict') ++ wn = WordNetCorpusReader(path_to_wn_dict_folder, None) ++ ++ ++with open(args.sense_embeddings_path + '.freq', 'rb') as infile: ++ meaning_freqs = pickle.load(infile) ++ ++with open(args.log_path, 'w') as outfile: ++ json.dump(args.__dict__, outfile) ++ ++ ++def lp_output(row, lp_info, candidate_synsets, debug=False): ++ target_lemma = row['target_lemma'] ++ target_pos = row['pos'] + ++ key = (target_lemma, target_pos) ++ ++ if key not in lp_info: ++ if debug: ++ print(target_lemma, target_pos, 'not in lp_info') ++ return None ++ ++ lp_index = row['lp_index'] ++ if lp_index is None: ++ print('lp_index is None') ++ return None ++ ++ sensekey = lp_info[(target_lemma, target_pos)][lp_index] ++ synset_identifier = None ++ ++ for synset in candidate_synsets: ++ if any([lemma.key() == sensekey ++ for lemma in synset.lemmas()]): ++ synset_identifier = synset2identifier(synset, '30') ++ ++ return synset_identifier + + def synset2identifier(synset, wn_version): + """ +@@ -33,7 +97,7 @@ def synset2identifier(synset, wn_version): + offset_8_char = offset.zfill(8) + + pos = synset.pos() +- if pos == 'j': ++ if pos in {'s', 'j'}: + pos = 'a' + + identifier = 'eng-{wn_version}-{offset_8_char}-{pos}'.format_map(locals()) +@@ -64,14 +128,14 @@ def extract_sentence_wsd_competition(row): + sentence_tokens.append(sentence_token.text) + + assert len(sentence_tokens) >= 2 +- assert pos is not None +- assert lemma is not None +- assert target_index is not None ++ #assert pos is not None # only needed for sem2013-aw ++ #assert lemma is not None, (lemma, pos) ++ #assert target_index is not None + + return target_index, sentence_tokens, lemma, pos + + +-def score_synsets(target_embedding, candidate_synsets, sense_embeddings, instance_id, lemma, pos): ++def score_synsets(target_embedding, candidate_synsets, sense_embeddings, instance_id, lemma, pos, gran, synset2higher_level): + """ + perform wsd + +@@ -85,30 +149,46 @@ def score_synsets(target_embedding, candidate_synsets, sense_embeddings, instanc + """ + highest_synsets = [] + highest_conf = 0.0 ++ candidate_freq = dict() ++ strategy = 'lstm' ++ ++ for synset in candidate_synsets: ++ if gran == 'synset': ++ candidate = synset ++ candidate_freq[synset] = meaning_freqs[candidate] ++ elif gran in {'sensekey', 'blc20', 'direct_hypernym'}: ++ candidate = None ++ if synset in synset2higher_level: ++ candidate = synset2higher_level[synset] ++ candidate_freq[synset] = meaning_freqs[candidate] ++ candidate_freq[synset] = meaning_freqs[candidate] + +- for candidate in candidate_synsets: + if candidate not in sense_embeddings: +- print('%s %s %s: candidate %s missing in sense embeddings' % (instance_id, 
lemma, pos, candidate)) ++ #print('%s %s %s: candidate %s missing in sense embeddings' % (instance_id, lemma, pos, candidate)) + continue + + cand_embedding = sense_embeddings[candidate] + sim = 1 - spatial.distance.cosine(cand_embedding, target_embedding) + + if sim == highest_conf: +- highest_synsets.append(candidate) ++ highest_synsets.append(synset) + elif sim > highest_conf: +- highest_synsets = [candidate] ++ highest_synsets = [synset] + highest_conf = sim + + if len(highest_synsets) == 1: + highest_synset = highest_synsets[0] + elif len(highest_synsets) >= 2: + highest_synset = highest_synsets[0] +- print('%s %s %s: 2> synsets with same conf %s: %s' % (instance_id, lemma, pos, highest_conf, highest_synsets)) ++ #print('%s %s %s: 2> synsets with same conf %s: %s' % (instance_id, lemma, pos, highest_conf, highest_synsets)) + else: +- highest_synset = None +- print('%s: no highest synset' % instance_id) +- return highest_synset ++ if args.mfs_fallback: ++ highest_synset = candidate_synsets[0] ++ #print('%s: no highest synset -> mfs' % instance_id) ++ strategy = 'mfs_fallback' ++ else: ++ highest_synset = None ++ return highest_synset, candidate_freq, strategy + + + # load wsd competition dataframe +@@ -117,7 +197,11 @@ wsd_df = pandas.read_pickle(args.wsd_df_path) + # add output column + wsd_df['lstm_output'] = [None for _ in range(len(wsd_df))] + wsd_df['lstm_acc'] = [None for _ in range(len(wsd_df))] +- ++wsd_df['emb_freq'] = [None for _ in range(len(wsd_df))] ++wsd_df['#_cand_synsets'] = [None for _ in range(len(wsd_df))] ++wsd_df['#_new_cand_synsets'] = [None for _ in range(len(wsd_df))] ++wsd_df['gold_in_new_cand_synsets'] = [None for _ in range(len(wsd_df))] ++wsd_df['wsd_strategy'] = [None for _ in range(len(wsd_df))] + + # load sense embeddings + with open(args.sense_embeddings_path, 'rb') as infile: +@@ -130,8 +214,9 @@ vocab = np.load(args.vocab_path) + with tf.Session() as sess: # your session object + saver = tf.train.import_meta_graph(args.model_path + '.meta', clear_devices=True) + saver.restore(sess, args.model_path) +- predicted_context_embs = sess.graph.get_tensor_by_name('Model/predicted_context_embs:0') +- x = sess.graph.get_tensor_by_name('Model/x:0') ++ x, predicted_context_embs, lens = utils.load_tensors(sess) ++ #predicted_context_embs = sess.graph.get_tensor_by_name('Model/predicted_context_embs:0') ++ #x = sess.graph.get_tensor_by_name('Model/Placeholder:0') + + for row_index, row in wsd_df.iterrows(): + target_index, sentence_tokens, lemma, pos = extract_sentence_wsd_competition(row) +@@ -139,25 +224,100 @@ with tf.Session() as sess: # your session object + target_id = vocab[''] + sentence_as_ids = [vocab.get(w) or vocab[''] for w in sentence_tokens] + sentence_as_ids[target_index] = target_id +- target_embedding = sess.run(predicted_context_embs, {x: [sentence_as_ids]})[0] + +- # load candidate synsets +- synsets = wn.synsets(lemma, pos=pos) +- candidate_synsets = {synset2identifier(synset, wn_version='30') +- for synset in synsets} ++ target_embeddings = sess.run(predicted_context_embs, {x: [sentence_as_ids], ++ lens: [len(sentence_as_ids)]}) ++ for target_embedding in target_embeddings: ++ break ++ ++ #target_embedding = sess.run(predicted_context_embs, {x: [sentence_as_ids]})[0] ++ ++ # load token object ++ token_obj = row['tokens'][0] ++ ++ # morphology reduced polysemy ++ pos = row['pos'] ++ if the_wn_version in {'171'}: ++ pos = None ++ ++ ++ candidate_synsets, \ ++ new_candidate_synsets, \ ++ gold_in_candidates = morpho_utils.candidate_selection(wn, ++ 
token=token_obj.text, ++ target_lemma=row['target_lemma'], ++ pos=row['pos'], ++ morphofeat=token_obj.morphofeat, ++ use_case=case_strategy, ++ use_number=number_strategy, ++ gold_lexkeys=row['lexkeys'], ++ case_freq=case_freq, ++ plural_freq=plural_freq, ++ debug=False) ++ ++ the_chosen_candidates = [synset2identifier(synset, wn_version=the_wn_version) ++ for synset in new_candidate_synsets] ++ ++ print() ++ print(the_chosen_candidates, gold_in_candidates) ++ # get mapping to higher abstraction level ++ synset2higher_level = dict() ++ if args.gran in {'sensekey', 'blc20', 'direct_hypernym'}: ++ label = 'synset2%s' % args.gran ++ synset2higher_level = row[label] ++ ++ # determine wsd strategy used ++ if len(candidate_synsets) == 1: ++ wsd_strategy = 'monosemous' ++ elif len(new_candidate_synsets) == 1: ++ wsd_strategy = 'morphology_solved' ++ elif len(candidate_synsets) == len(new_candidate_synsets): ++ wsd_strategy = 'lstm' ++ elif len(new_candidate_synsets) < len(candidate_synsets): ++ wsd_strategy = 'morphology+lstm' ++ ++ # possibly include label propagation strategy ++ if lp_strategy: ++ lp_result = lp_output(row, lp_info, new_candidate_synsets, debug=False) ++ ++ if lp_result: ++ the_chosen_candidates = [lp_result] ++ wsd_strategy = 'lp' + + # perform wsd +- if len(candidate_synsets) >= 2: +- chosen_synset = score_synsets(target_embedding, candidate_synsets, sense_embeddings, instance_id, lemma, pos) ++ if len(the_chosen_candidates) >= 2: ++ chosen_synset, \ ++ candidate_freq, \ ++ strategy = score_synsets(target_embedding, ++ the_chosen_candidates, ++ sense_embeddings, ++ instance_id, ++ lemma, ++ pos, ++ args.gran, ++ synset2higher_level) ++ ++ #if strategy == 'mfs_fallback': ++ # wsd_strategy = 'mfs_fallback' ++ + else: +- chosen_synset = candidate_synsets.pop() ++ chosen_synset = None ++ if the_chosen_candidates: ++ chosen_synset = the_chosen_candidates[0] ++ candidate_freq = dict() + + # add to dataframe + wsd_df.set_value(row_index, col='lstm_output', value=chosen_synset) ++ wsd_df.set_value(row_index, col='#_cand_synsets', value=len(candidate_synsets)) ++ wsd_df.set_value(row_index, col='#_new_cand_synsets', value=len(new_candidate_synsets)) ++ wsd_df.set_value(row_index, col='gold_in_new_cand_synsets', value=gold_in_candidates) ++ wsd_df.set_value(row_index, col='wsd_strategy', value=wsd_strategy) + + # score it +- lstm_acc = chosen_synset in row['wn30_engs'] ++ print(chosen_synset, row['source_wn_engs']) ++ lstm_acc = chosen_synset in row['source_wn_engs'] # used to be wn30_engs + wsd_df.set_value(row_index, col='lstm_acc', value=lstm_acc) ++ wsd_df.set_value(row_index, col='emb_freq', value=candidate_freq) + + if lstm_acc: + num_correct += 1 +@@ -167,6 +327,9 @@ print(num_correct) + # save it + wsd_df.to_pickle(args.output_path) + ++with open(args.results, 'w') as outfile: ++ outfile.write('%s' % num_correct) ++ + + + +diff --git a/prepare-lstm-wsd.py b/prepare-lstm-wsd.py +index f3e4d1b..52db01c 100644 +--- a/prepare-lstm-wsd.py ++++ b/prepare-lstm-wsd.py +@@ -7,6 +7,7 @@ Read a simple text file (one sentence per line) and produce these files: + - .train.npz: training batches (each batch contains roughly the same + number of tokens but differing number of sentences depends on sentence length) + - .dev.npz: development dataset (as big as one epoch) ++- + + @author: Minh Le + ''' +@@ -20,23 +21,25 @@ import pickle + import re + import numpy as np + import subprocess +-from tensorflow.contrib.labeled_tensor import batch ++from random import Random ++from collections import 
Counter ++from utils import progress, count_lines_fast ++from configs import preprocessed_gigaword_path, output_dir ++from version import version + + dev_sents = 20000 # absolute maximum + dev_portion = 0.01 # relative maximum +-batch_size = 128000 # words ++# if you get OOM (out of memory) error, reduce this number ++batch_size = 60000 # words + vocab_size = 10**6 + min_count = 5 + +-special_symbols = ['', '', ''] ++inp_path = preprocessed_gigaword_path ++# inp_path = 'preprocessed-data/gigaword_1m-sents.txt' # for debugging ++out_dir = os.path.join('preprocessed-data', version) ++out_path = os.path.join(out_dir, 'gigaword-for-lstm-wsd') + +-def progress(it): +- start = time() +- for i, val in enumerate(it): +- yield(val) +- if (i+1) % 1000000 == 0: +- sys.stderr.write('processed %d items, elapsed time: %.1f minutes...\n' +- %(i+1, (time()-start)/60)) ++special_symbols = ['', '', ''] + + def _build_vocab(filename): + sys.stderr.write('Building vocabulary...\n') +@@ -55,87 +58,146 @@ def _build_vocab(filename): + return word2id, words + + def sort_sentences(inp_path, out_path): ++ start = time() + cmd = ('cat %s | python3 scripts/sentlen.py --min 6 --max 100 ' +- '| sort -T output -k1,1g -k2 | uniq > %s' +- %(inp_path, out_path)) ++ '| sort -T %s -k1,1g -k2 | uniq > %s' ++ %(inp_path, output_dir, out_path)) + sys.stderr.write('%s\n' %cmd) + status = subprocess.call(cmd, shell=True) ++ sys.stderr.write('sorting finished after %.1f minutes...\n' %((time()-start)/60)) + assert status == 0 + +-def lookup_and_iter_sents(filename, word_to_id): ++def lookup_and_iter_sents(filename, word2id, include_ids=None, exclude_ids=None): + unkn_id = word2id[''] + with codecs.open(filename, 'r', 'utf-8') as f: +- for line in f: +- words = line.strip().split() +- yield [word_to_id.get(word) or unkn_id for word in words] ++ for sent_id, line in enumerate(f): ++ if ((include_ids is None or sent_id in include_ids) and ++ (exclude_ids is None or sent_id not in exclude_ids)): ++ words = line.strip().split() ++ yield [word2id.get(word) or unkn_id for word in words] + +-class PadFunc(object): +- +- dry_run=False +- +- def __init__(self): +- self.total = 0 +- self.pads = 0 +- def __call__(self, sents, max_len, pad_id): +- if self.dry_run: +- arr = np.empty(0) +- value_count = sum(1 for s in sents for _ in s) +- size = len(sents) * max_len +- else: +- arr = np.zeros((len(sents), max_len), dtype=np.int32) +- size = arr.size +- arr.fill(pad_id) +- value_count = 0 +- for i, s in enumerate(sents): +- for j, v in enumerate(s): +- arr[i,j] = v +- value_count += 1 +- self.pads += (size - value_count) +- self.total += size +- return arr +- +-def pad_batches(inp_path, word2id): ++def pad(sents, max_len, pad_id): ++ arr = np.empty((len(sents), max_len), dtype=np.int32) ++ arr.fill(pad_id) ++ for i, s in enumerate(sents): ++ arr[i, :len(s)] = s ++ return arr ++ ++def pad_batches(inp_path, word2id, include_ids, exclude_ids, max_sents=-1): + sys.stderr.write('Dividing and padding...\n') +- pad = PadFunc() + pad_id = word2id[''] +- dev = [] + batches = {} +- last_max_len = 0 +- last_batch = [] +- with open(inp_path) as f: total_sents = sum(1 for line in f) +- for sent in progress(lookup_and_iter_sents(inp_path, word2id)): +- if (len(dev) < dev_sents and len(dev) < dev_portion*total_sents +- and np.random.rand() < 0.01): +- dev.append(sent) +- else: +- last_max_len = max(last_max_len, len(sent)) +- last_batch.append(sent) +- if len(last_batch)*last_max_len >= batch_size: +- batches['batch%d' %len(batches)] = pad(last_batch, 
last_max_len, pad_id) +- last_max_len = 0 +- last_batch = [] +- if last_max_len > 0: +- batches['batch%d' %len(batches)] = pad(last_batch, last_max_len, pad_id) +- dev_lens = np.array([len(s) for s in dev], dtype=np.int32) +- dev_padded = PadFunc()(dev, max(dev_lens), pad_id) ++ sent_lens = [] ++ curr_max_len = 0 ++ curr_batch = [] ++ batch_id = 0 ++ for sent in progress(lookup_and_iter_sents(inp_path, word2id, ++ include_ids, exclude_ids)): ++ new_size = (len(curr_batch)+1) * max(curr_max_len,len(sent)) ++ if new_size > batch_size or (max_sents > 0 and len(curr_batch) >= max_sents): ++ batches['batch%d' %batch_id] = pad(curr_batch, curr_max_len, pad_id) ++ batches['lens%d' %batch_id] = np.array([len(s) for s in curr_batch], dtype=np.int32) ++ batch_id += 1 ++ curr_max_len = 0 ++ curr_batch = [] ++ curr_max_len = max(curr_max_len, len(sent)) ++ curr_batch.append(sent) ++ sent_lens.append(len(sent)) ++ if curr_batch: ++ batches['batch%d' %batch_id] = pad(curr_batch, curr_max_len, pad_id) ++ batches['lens%d' %batch_id] = np.array([len(s) for s in curr_batch], dtype=np.int32) ++ batch_id += 1 # important to count num batches correctly ++ sent_lens = np.array(sent_lens, dtype=np.int32) + sys.stderr.write('Dividing and padding... Done.\n') +- sizes = np.array([b.size for b in batches.values()]) +- if len(batches) >= 2: ++ sizes = np.array([batches['batch%d'%i].size for i in range(batch_id)]) ++ if batch_id >= 2: + sys.stderr.write('Divided into %d batches (%d elements each, std=%d, ' + 'except last batch of %d).\n' +- %(len(batches), sizes[:-1].mean(), sizes[:-1].std(), sizes[-1])) ++ %(batch_id, sizes[:-1].mean(), sizes[:-1].std(), sizes[-1])) + else: +- assert len(batches) == 1 ++ assert batch_id == 1 + sys.stderr.write('Created 1 batch of %d elements.\n' %sizes[0]) +- sys.stderr.write('Added %d elements as padding (%.2f%%).\n' +- %(pad.pads, pad.pads*100.0/pad.total)) +- sys.stderr.write('Consumed roughly %.2f GiB.\n' +- %(pad.total*4/float(2**30))) +- return batches, dev_padded, dev_lens ++ sys.stderr.write('Sentence lengths: %.5f (std=%.5f)\n' ++ %(sent_lens.mean(), sent_lens.std())) ++ return batches + +-if __name__ == '__main__': +- inp_path, out_path = sys.argv[1:] ++ ++def shuffle_and_pad_batches(inp_path, word2id, dev_sent_ids): ++ sys.stderr.write('Reading lengths...\n') ++ lens = [] ++ with codecs.open(inp_path, 'r', 'utf-8') as f: ++ for line in progress(f, label='sentences'): ++ # this is different from counting the blank spaces because some words ++ # are separated by double spaces and there might be an additional ++ # whitespace at the end of a line ++ lens.append(len(line.strip().split())) ++ lens = np.array(lens, dtype=np.int32) ++ sys.stderr.write('Reading lengths... 
Done.\n') ++ ++ sys.stderr.write('Calculating batch shapes...\n') ++ indices = list(range(len(lens))) ++ rng = Random(29) ++ rng.shuffle(indices) ++ total_sents = len(lens) ++ batches = {} ++ curr_max_len = 0 ++ curr_batch_lens = [] ++ sent2batch = {} ++ batch_id = 0 ++ for sent_id in progress(indices, label='sentences'): ++ l = lens[sent_id] ++ if sent_id not in dev_sent_ids: ++ new_size = (len(curr_batch_lens)+1) * max(curr_max_len,l) ++ if new_size >= batch_size: ++ batches['batch%d' %batch_id] = \ ++ np.empty((len(curr_batch_lens), max(curr_batch_lens)), dtype=np.int32) ++ batches['lens%d' %batch_id] = np.array(curr_batch_lens, dtype=np.int32) ++ batch_id += 1 ++ curr_max_len = 0 ++ curr_batch_lens = [] ++ curr_max_len = max(curr_max_len, l) ++ curr_batch_lens.append(l) ++ sent2batch[sent_id] = 'batch%d' %batch_id ++ if curr_batch_lens: ++ batches['batch%d' %batch_id] = \ ++ np.empty((len(curr_batch_lens), max(curr_batch_lens)), dtype=np.int32) ++ batches['lens%d' %batch_id] = np.array(curr_batch_lens, dtype=np.int32) ++ batch_id += 1 # important to count num batches correctly ++ sys.stderr.write('Calculating batch shapes... Done.\n') + ++ sys.stderr.write('Dividing and padding...\n') ++ pad_id = word2id[''] ++ for i in range(batch_id): batches['batch%d'%i].fill(pad_id) ++ nonpad_count = 0 ++ sent_counter = Counter() ++ for sent_id, sent in progress(enumerate(lookup_and_iter_sents(inp_path, word2id)), label='sentences'): ++ assert lens[sent_id] == len(sent) ++ batch_name = sent2batch.get(sent_id) ++ if batch_name is not None: # could be in dev set ++ batches[batch_name][sent_counter[batch_name],:len(sent)] = sent ++ nonpad_count += len(sent) ++ sent_counter[batch_name] += 1 ++ # check that we filled all arrays ++ for batch_name in sent_counter: ++ assert sent_counter[batch_name] == batches[batch_name].shape[0] ++ sys.stderr.write('Dividing and padding... Done.\n') ++ ++ sizes = np.array([batches['batch%d'%i].size for i in range(batch_id)]) ++ if batch_id >= 2: ++ sys.stderr.write('Divided into %d batches (%d elements each, std=%d, ' ++ 'except last batch of %d).\n' ++ %(batch_id, sizes[:-1].mean(), sizes[:-1].std(), sizes[-1])) ++ else: ++ assert batch_id == 1 ++ sys.stderr.write('Created 1 batch of %d elements.\n' %sizes[0]) ++ total = sum(sizes) ++ pad_count = total - nonpad_count ++ sys.stderr.write('Sentence lengths: %.5f (std=%.5f)\n' ++ %(lens.mean(), lens.std())) ++ return batches ++ ++def run(): ++ os.makedirs(out_dir, exist_ok=True) + index_path = out_path + '.index.pkl' + if os.path.exists(index_path): + sys.stderr.write('Reading vocabulary from %s... ' %index_path) +@@ -146,17 +208,43 @@ if __name__ == '__main__': + word2id, words = _build_vocab(inp_path) + with open(index_path, 'wb') as f: pickle.dump(word2id, f) + +- sorted_sents_path = inp_path + '.sorted' ++ sorted_sents_path = out_path + '.sorted' + if os.path.exists(sorted_sents_path): + sys.stderr.write('Sentences are already sorted at %s\n' %sorted_sents_path) + else: + sort_sentences(inp_path, sorted_sents_path) ++ ++ total_sents = count_lines_fast(sorted_sents_path) ++ real_num_dev_sents = int(min(dev_sents, dev_portion*total_sents)) ++ np.random.seed(918) ++ dev_sent_ids = set(np.random.choice(total_sents, size=real_num_dev_sents, replace=False)) + + train_path = out_path + '.train.npz' + dev_path = out_path + '.dev.npz' +- if os.path.exists(train_path): +- sys.stderr.write('Result already exists: %s. 
Skipped.\n' %train_path) ++ shuffled_train_path = out_path + '-shuffled.train.npz' ++ if os.path.exists(shuffled_train_path): ++ sys.stderr.write('Result already exists: %s. Skipped.\n' %shuffled_train_path) + else: +- batches, dev_data, dev_lens = pad_batches(sorted_sents_path, word2id) ++ print("- Training set:") ++ batches = pad_batches(sorted_sents_path, word2id, None, dev_sent_ids) + np.savez(train_path, **batches) +- np.savez(dev_path, data=dev_data, lens=dev_lens) ++ print("- Development set:") ++ batches = pad_batches(sorted_sents_path, word2id, dev_sent_ids, None, 768) ++ np.savez(dev_path, **batches) ++ print("- Shuffled training set:") ++ batches = shuffle_and_pad_batches(sorted_sents_path, word2id, dev_sent_ids) ++ np.savez(shuffled_train_path, **batches) ++ ++ for percent in (1, 10, 25, 50, 75): ++ num_lines = int(percent / 100.0 * total_sents) ++ sampled_ids = set(np.random.choice(total_sents, size=num_lines, replace=False)) ++ pc_train_path = out_path + ('_%02d-pc.train.npz' %percent) ++ if os.path.exists(pc_train_path): ++ sys.stderr.write('%02d%% dataset already exists: %s. Skipped.\n' %pc_train_path) ++ else: ++ print("- Reduced training set (%02d%%):" %percent) ++ batches = pad_batches(sorted_sents_path, word2id, sampled_ids, dev_sent_ids) ++ np.savez(pc_train_path, **batches) ++ ++if __name__ == '__main__': ++ run() +diff --git a/process-gigaword.py b/process-gigaword.py +index e568001..7bf2b0d 100644 +--- a/process-gigaword.py ++++ b/process-gigaword.py +@@ -2,9 +2,17 @@ import os + import gzip + from bs4 import BeautifulSoup + import spacy +-nlp = spacy.load('en_default') ++from configs import gigaword_path, preprocessed_gigaword_path ++import codecs ++from utils import progress ++from version import version + import sys + ++def custom_pipeline(nlp): ++ return (nlp.tagger, nlp.parser) ++ ++nlp = spacy.load('en_default', create_pipeline=custom_pipeline) ++ + def iter_paragraphs(paths): + for path in paths: + with gzip.open(path) as f: +@@ -13,7 +21,6 @@ def iter_paragraphs(paths): + paras = soup.find_all('p') + for p in paras: yield p.text.strip() + +- + def iter_files(root_dir): + for root, dirs, files in os.walk(root_dir): + for fname in files: +@@ -21,21 +28,24 @@ def iter_files(root_dir): + yield os.path.join(root, fname) + + def iter_sents(paragraphs): +- for i, doc in enumerate(nlp.pipe(paragraphs, batch_size=10000, n_threads=32)): +- assert isinstance(doc, spacy.tokens.doc.Doc) and doc.is_parsed ++ for doc in nlp.pipe(paragraphs, batch_size=10000): + for sent in doc.sents: + yield [str(tok).strip() for tok in sent] +- if (i+1) % 10000 == 0: +- sys.stderr.write('%10d' %(i+1)) +- if (i+1) % 100000 == 0: +- sys.stderr.write('\n') + +-gigaword_path = 'data/gigaword' +-example_file = 'data/gigaword/gigaword_eng_5_d1/data/afp_eng/afp_eng_200112.gz' ++ ++# example_file = 'data/gigaword/gigaword_eng_5_d1/data/afp_eng/afp_eng_200112.gz' + + if __name__ == '__main__': +- for sent in iter_sents(iter_paragraphs(iter_files(gigaword_path))): +- for tok in sent: +- sys.stdout.write(tok) +- sys.stdout.write(' ') +- sys.stdout.write('\n') ++ dir_ = os.path.join('preprocessed-data', version) ++ os.makedirs(dir_, exist_ok=True) ++ preprocessed_gigaword_path = os.path.join(dir_, 'gigaword.txt') ++ sys.stderr.write('Writing to %s\n' %preprocessed_gigaword_path) ++ with codecs.open(preprocessed_gigaword_path, 'w', 'utf-8') as f: ++ paths = list(iter_files(gigaword_path)) ++ paths.sort() # remove difference between machines ++ paths = progress(paths, ticks=1, label='files', 
max_=len(paths)) ++ for sent in iter_sents(iter_paragraphs(paths)): ++ for tok in sent: ++ f.write(tok) ++ f.write(' ') ++ f.write('\n') +diff --git a/scripts/semcor_format2LSTM_input.py b/scripts/semcor_format2LSTM_input.py +index dfc369f..3990f55 100644 +--- a/scripts/semcor_format2LSTM_input.py ++++ b/scripts/semcor_format2LSTM_input.py +@@ -4,7 +4,7 @@ from nltk.corpus import wordnet as wn + from lxml import html, etree + from collections import defaultdict + import wn_utils +- ++from datetime import datetime + + def get_lemma_pos_of_sensekey(sense_key): + """ +@@ -77,6 +77,7 @@ def load_instance_id2offset(mapping_path, sensekey2offset, debug=False): + :return: instance_id -> offset + """ + instance_id2offset = dict() ++ instance_id2sensekeys = dict() + + more_than_one_offset = 0 + no_offsets = 0 +@@ -85,6 +86,8 @@ def load_instance_id2offset(mapping_path, sensekey2offset, debug=False): + for line in infile: + instance_id, *sensekeys = line.strip().split() + ++ instance_id2sensekeys[instance_id] = sensekeys ++ + offsets = {sensekey2offset[sensekey] + for sensekey in sensekeys + if sensekey in sensekey2offset} +@@ -104,18 +107,26 @@ def load_instance_id2offset(mapping_path, sensekey2offset, debug=False): + no_offsets += 1 + + +- return instance_id2offset ++ return instance_id2offset, instance_id2sensekeys + + + # experiment settings + wn_version = '30' + corpora_to_include = ['semcor', +- #'mun' ++ #'mun' + ] # semcor | mun + + accepted_pos = {'NOUN'} + entailment_setting = 'any_hdn' # lemma_hdn | any_hdn +-lemma2annotations = defaultdict(dict) ++#lemma2annotations = defaultdict(dict) ++ ++ ++ ++#path_wn20_to_wn30 = '/Users/marten/Downloads/mappings-upc-2007/mapping-20-30/wn20-30.noun' ++#path_wn20_to_domain = '/Users/marten/git/semantic_class_manager/resources/wn-domains-3.2/wn-domains-3.2-20070223' ++#wn30_domain, domain_wn30 = wn_utils.get_synset2domain(path_wn20_to_domain, ++# path_wn20_to_wn30) ++ + + if wn_version == '30': + path_to_wn_dict_folder = str(wn._get_root()) # change this for other wn versions +@@ -129,12 +140,15 @@ elif corpora_to_include == ['semcor']: + input_xml_path = '../data/WSD_Training_Corpora/SemCor/semcor.data.xml' + input_mapping_path = '../data/WSD_Training_Corpora/SemCor/semcor.gold.key.txt' + ++sensekey_output_path = 'sensekey-' + '_'.join(corpora_to_include) + '.txt' + synset_output_path = 'synset-' + '_'.join(corpora_to_include) + '.txt' + hdn_output_path = '-'.join(['hdn', + '_'.join(corpora_to_include), + '_'.join(accepted_pos), + entailment_setting]) + '.txt' + ++#domain_output_path = 'domain-' + '_'.join(corpora_to_include) + '.txt' ++#domain_mapping_path = domain_output_path + '.mapping' + + # precompute all hdns + lemma_pos2offsets = wn_utils.load_lemma_pos2offsets(path_to_wn_index_sense) +@@ -172,14 +186,18 @@ my_wn_reader = WordNetCorpusReader(path_to_wn_dict_folder, None) + sensekey2offset = load_mapping_sensekey2offset(path_to_wn_index_sense, + wn_version) + +-instance_id2offset = load_instance_id2offset(input_mapping_path, +- sensekey2offset, +- debug=False) ++instance_id2offset, instance_id2sensekeys = load_instance_id2offset(input_mapping_path, ++ sensekey2offset, ++ debug=False) + + my_html_tree = html.parse(input_xml_path) + +-hdn_outfile = open(hdn_output_path, 'w') ++sensekey_outfile = open(sensekey_output_path, 'w') + synset_outfile = open(synset_output_path, 'w') ++#domain_outfile = open(domain_output_path, 'w') ++hdn_outfile = open(hdn_output_path, 'w') ++ ++domain2freq = defaultdict(int) + + for corpus_node in 
my_html_tree.xpath('body/corpus'): + +@@ -191,11 +209,16 @@ for corpus_node in my_html_tree.xpath('body/corpus'): + for sent_node in corpus_node.xpath('text/sentence'): + + sentence_tokens = [] ++ sensekey_annotations = [] + synset_annotations = [] + hdn_annotations = [] ++ domain_annotations = [] + + for child_el in sent_node.getchildren(): + ++ if child_el.sourceline % 10000 == 0: ++ print(child_el.sourceline, datetime.now()) ++ + lemma = child_el.get('lemma') + token = child_el.text + pos = child_el.get('pos') +@@ -204,8 +227,11 @@ for corpus_node in my_html_tree.xpath('body/corpus'): + assert token is not None + + sentence_tokens.append(token) ++ ++ sent_sensekey_annotations = [] + sent_synset_annotations = [] + sent_hdn_annotations = [] ++ sent_domain_annotations = [] + + if all([child_el.tag == 'instance', + pos in accepted_pos]): +@@ -214,13 +240,20 @@ for corpus_node in my_html_tree.xpath('body/corpus'): + synset_id = instance_id2offset[instance_id] + + # update counter for logging purposes +- if synset_id not in lemma2annotations[lemma]: +- lemma2annotations[lemma][synset_id] = {'hdn': 0, 'synset': 0} ++ #if synset_id not in lemma2annotations[lemma]: ++ # lemma2annotations[lemma][synset_id] = {'hdn': 0, 'synset': 0} ++ #lemma2annotations[lemma][synset_id]['synset'] += 1 + +- lemma2annotations[lemma][synset_id]['synset'] += 1 ++ sent_synset_annotations.append(synset_id) + ++ sensekeys = instance_id2sensekeys[instance_id] ++ for sensekey in sensekeys: ++ sent_sensekey_annotations.append(sensekey) + +- sent_synset_annotations.append(synset_id) ++ #if synset_id in wn30_domain: ++ # domain = wn30_domain[synset_id] ++ # domain2freq[domain] += 1 ++ # sent_domain_annotations.append(domain) + + # option lemma-based hdn + if entailment_setting == 'lemma_hdn': +@@ -234,21 +267,26 @@ for corpus_node in my_html_tree.xpath('body/corpus'): + hdn = graph_info[synset_id]['under_lcs'] + + if hdn is not None: +- sent_hdn_annotations.append(hdn) +- +- lemma2annotations[lemma][synset_id]['hdn'] += 1 ++ sent_hdn_annotations.append('%s__%s' % (synset_id, hdn)) ++ #lemma2annotations[lemma][synset_id]['hdn'] += 1 + + + elif entailment_setting == 'any_hdn': + hypernyms = sy_id2hypernyms[synset_id] + for hypernym in hypernyms: + if hypernym in all_hdns: +- sent_hdn_annotations.append(hypernym) +- +- lemma2annotations[lemma][synset_id]['hdn'] += 1 ++ sent_hdn_annotations.append('%s_%s' % (synset_id, hypernym)) ++ #lemma2annotations[lemma][synset_id]['hdn'] += 1 + ++ sensekey_annotations.append(sent_sensekey_annotations) + synset_annotations.append(sent_synset_annotations) + hdn_annotations.append(sent_hdn_annotations) ++ #domain_annotations.append(sent_domain_annotations) ++ ++ ++ for sensekey_sentence in wn_utils.generate_training_instances(sentence_tokens, ++ sensekey_annotations): ++ sensekey_outfile.write(sensekey_sentence + '\n') + + for synset_sentence in wn_utils.generate_training_instances(sentence_tokens, + synset_annotations): +@@ -258,28 +296,39 @@ for corpus_node in my_html_tree.xpath('body/corpus'): + hdn_annotations): + hdn_outfile.write(hdn_sentence + '\n') + +-hdn_outfile.close() ++ #for domain_sentence in wn_utils.generate_training_instances(sentence_tokens, ++ # domain_annotations): ++ # domain_outfile.write(domain_sentence + '\n') ++ ++ ++sensekey_outfile.close() + synset_outfile.close() ++hdn_outfile.close() ++#domain_outfile.close() + + + per_lemma = [] +-per_synset = [] ++per_sensekey = [] + per_hdn = [] ++synset2freq = defaultdict(int) + meanings = set() + +-for lemma, info in 
lemma2annotations.items(): +- lemma_count = 0 +- for sy_id, sy_info in info.items(): +- lemma_count += sy_info['synset'] +- per_synset.append(sy_info['synset']) +- per_hdn.append(sy_info['hdn']) +- +- meanings.add(sy_id) +- +- per_lemma.append(lemma_count) +- +-print('number of unique lemmas: %s' % len(lemma2annotations)) +-print('number of unique meanings: %s' % len(meanings)) +-print('min avg max lemma', min(per_lemma), round(sum(per_lemma) / len(per_lemma), 2), max(per_lemma)) +-print('min avg max synset', min(per_synset), round(sum(per_synset) / len(per_synset), 2), max(per_synset)) +-print('min avg max hdn', min(per_hdn), round(sum(per_hdn) / len(per_hdn), 2), max(per_hdn)) ++#for lemma, info in lemma2annotations.items(): ++# lemma_count = 0 ++# for sy_id, sy_info in info.items(): ++# lemma_count += sy_info['synset'] ++# per_sensekey.append(sy_info['synset']) ++# per_hdn.append(sy_info['hdn']) ++# ++# synset2freq[sy_id] += sy_info['synset'] ++# meanings.add(sy_id) ++# ++# per_lemma.append(lemma_count) ++ ++#print('number of unique lemmas: %s' % len(lemma2annotations)) ++#print('number of unique meanings: %s' % len(meanings)) ++#print('# min avg max total lemma', len(per_lemma), min(per_lemma), round(sum(per_lemma) / len(per_lemma), 2), max(per_lemma), sum(per_lemma)) ++#print('# min avg max total sensekey', len(per_sensekey), min(per_sensekey), round(sum(per_sensekey) / len(per_sensekey), 2), max(per_sensekey), sum(per_sensekey)) ++#print('# min avg max total synset', len(synset2freq), min(synset2freq.values()), round(sum(synset2freq.values()) / len(synset2freq), 2), max(synset2freq.values()), sum(synset2freq.values())) ++#print('# min avg max total hdn', len(per_hdn), min(per_hdn), round(sum(per_hdn) / len(per_hdn), 2), max(per_hdn), sum(per_hdn)) ++#print('# min avg max total domain', len(domain2freq), min(domain2freq.values()), round(sum(domain2freq.values()) / len(domain2freq), 2), max(domain2freq.values()), sum(domain2freq.values())) +diff --git a/scripts/wn_utils.py b/scripts/wn_utils.py +index a3f69bf..8acdd47 100644 +--- a/scripts/wn_utils.py ++++ b/scripts/wn_utils.py +@@ -1,8 +1,154 @@ +-import nltk + import itertools + from collections import defaultdict + + ++def candidate_selection(wn, ++ token, ++ target_lemma, ++ pos, ++ use_case=False, ++ use_number=False, ++ gold_lexkeys=set(), ++ case_freq=None, ++ plural_freq=None, ++ debug=False): ++ """ ++ return candidate synsets of a token ++ ++ :param str targe_lemma: a token, e.g. 
Congress ++ :param str pos: supported: n ++ ++ :param bool use_case: if set to True, ++ only synsets are returned that contain the token in upper case ++ :param str gold_lexkeys: {'congress%1:14:00::'} ++ ++ :rtype: tuple ++ :return: (candidate_synsets, ++ new_candidate_synsets, ++ gold_in_candidates) ++ """ ++ # assertions on input arguments ++ if use_case: ++ assert case_freq is not None, 'case_freq should not be None' ++ ++ if use_number: ++ assert plural_freq is not None, 'plural_freq should not be None' ++ ++ apply_morph_strategy = True ++ ++ # check if candidate_synsets without morphological information is monosemous ++ candidate_synsets = wn.synsets(target_lemma, pos) ++ if len(candidate_synsets) == 1: ++ apply_morph_strategy = False ++ ++ new_candidate_synsets = [] ++ gold_in_candidates = False ++ ++ if debug: ++ print(candidate_synsets) ++ ++ for synset in candidate_synsets: ++ ++ add = False ++ ++ if all([use_number, ++ apply_morph_strategy]): ++ ++ key = (target_lemma.lower(), pos) ++ lemma_plural_freq = dict() ++ if key in plural_freq: ++ lemma_plural_freq = plural_freq[(target_lemma.lower(), pos)] ++ ++ plural_match = False ++ for lemma in synset.lemmas(): ++ if lemma.key() in lemma_plural_freq: ++ plural_match = True ++ ++ if plural_match: ++ add = True ++ ++ if all([use_case, ++ apply_morph_strategy]): ++ ++ # check synset_lemma ++ capital_lemma_match = any([lemma.name() == token ++ for lemma in synset.lemmas()]) ++ ++ # check sense annotated corpus ++ key = (target_lemma.lower(), pos) ++ lemma_case_freq = dict() ++ if key in case_freq: ++ lemma_case_freq = case_freq[(target_lemma.lower(), pos)] ++ ++ freq_match = False ++ for lemma in synset.lemmas(): ++ if lemma.key() in lemma_case_freq: ++ freq_match = True ++ ++ if any([capital_lemma_match, # whether lemma matches with token ++ freq_match]): # whether lemma of sensekey is used with capital ++ add = True ++ ++ if add: ++ new_candidate_synsets.append(synset) ++ ++ # check if gold in candidate ++ lexkeys = {lemma.key() for lemma in synset.lemmas()} ++ if any(gold_key in lexkeys ++ for gold_key in gold_lexkeys): ++ gold_in_candidates = True ++ ++ # if no synsets remain, use original ones ++ if not new_candidate_synsets: ++ new_candidate_synsets = candidate_synsets ++ ++ return candidate_synsets, new_candidate_synsets, gold_in_candidates ++ ++ ++ ++def get_synset2domain(path_wn20_to_domain, ++ path_wn20_to_wn30): ++ """ ++ create mapping between wn30 and domain and vice versa ++ ++ :param str path_wn20_to_domain: wn-domains-3.2-20070223 file ++ :param str path_wn20_to_wn30: wn20-30.noun file from upc mappings ++ ++ :rtype: tuple ++ :return: (wn30_domain, domain_wn30) ++ """ ++ wn30_domain = dict() ++ domain_wn30 = defaultdict(set) ++ ++ wn20_wn30 = dict() ++ with open(path_wn20_to_wn30) as infile: ++ for line in infile: ++ split = line.strip().split() ++ if len(split) == 3: ++ offset_20, *values = line.strip().split() ++ offset_30 = '' ++ conf = 0.0 ++ for index in range(0, len(values), 2): ++ an_offset = values[index] ++ a_conf = float(values[index + 1]) ++ if a_conf > conf: ++ offset_30 = an_offset ++ conf = a_conf ++ wn20_wn30[offset_20 + '-n'] = offset_30 + '-n' ++ ++ with open(path_wn20_to_domain) as infile: ++ for line in infile: ++ sy_id, domain = line.strip().split('\t') ++ if all([sy_id in wn20_wn30, ++ sy_id.endswith('n')]): ++ wn30 = wn20_wn30[sy_id] ++ ++ wn30_domain['eng-30-' + wn30] = domain ++ domain_wn30[domain].add('eng-30-' + wn30) ++ ++ return wn30_domain, domain_wn30 ++ ++ + def 
generate_training_instances(sentence_lemmas, annotations): + """ + given the lemmas in a sentence with its annotations (can be more than one) +@@ -37,6 +183,68 @@ def generate_training_instances(sentence_lemmas, annotations): + + return instances + ++ ++def generate_training_instances_v2(sentence_tokens, ++ sentence_lemmas, ++ sentence_pos, ++ annotations): ++ """ ++ given the lemmas in a sentence with its annotations (can be more than one) ++ generate all training instances for that sentence ++ ++ e.g. ++ sentence_tokens = ['the', 'man', 'meets', 'women'] ++ sentence_lemmas = ['the', 'man', 'meet', 'woman'] ++ sentence_pos = ['', 'n', 'v', 'n'] ++ annotations = [[], ['1', '2' ], ['4'], ['5', '6']] ++ ++ would result in ++ ('man', 'n', '1', ['the', 'man', 'meets', 'women'], 'the man---1 meets women', 1) ++ ('man', 'n', '2', ['the', 'man', 'meets', 'women'], 'the man---2 meets women', 1) ++ ('meet', 'v', '4', ['the', 'man', 'meets', 'women'], 'the man meets---4 women', 2) ++ ('woman', 'n', '5', ['the', 'man', 'meets', 'women'], 'the man meets women---5', 3) ++ ('woman', 'n', '6', ['the', 'man', 'meets', 'women'], 'the man meets women---6', 3) ++ ++ :param list sentence_tokens: see above ++ :param list sentence_lemmas: see above ++ :param list sentence_pos: see above ++ :param list annotations: see above ++ ++ :rtype: generator ++ :return: generator of (target_lemma, ++ target_pos, ++ token_annotation, ++ sentence_tokens, ++ training_example, ++ target_index) ++ """ ++ for target_index, token_annotations in enumerate(annotations): ++ ++ target_lemma = sentence_lemmas[target_index] ++ target_pos = sentence_pos[target_index] ++ ++ for token_annotation in token_annotations: ++ ++ if token_annotation is None: ++ continue ++ ++ a_sentence = [] ++ for index, token in enumerate(sentence_tokens): ++ ++ if index == target_index: ++ a_sentence.append(token + '---' + token_annotation) ++ else: ++ a_sentence.append(token) ++ ++ training_example = ' '.join(a_sentence) ++ ++ yield (target_lemma, ++ target_pos, ++ token_annotation, ++ sentence_tokens, ++ training_example, ++ target_index) ++ + def load_lemma_pos2offsets(path_to_index_sense): + ''' + given with index.sense from wordnet distributions such as +@@ -157,8 +365,11 @@ def synsets_graph_info(wn_instance, wn_version, lemma, pos): + synsets = set(synsets) + + if len(synsets) == 1: +- target_sy_iden = synset2identifier(synsets.pop(), wn_version) ++ sy_obj = synsets.pop() ++ target_sy_iden = synset2identifier(sy_obj, wn_version) + sy_id2under_lcs_info[target_sy_iden] = {'under_lcs': None, ++ 'under_lcs_obj': None, ++ 'sy_obj' : sy_obj, + 'path_to_under_lcs': []} + return sy_id2under_lcs_info + +@@ -199,6 +410,28 @@ def synsets_graph_info(wn_instance, wn_version, lemma, pos): + for synset in path_to_under_lcs] + + sy_id2under_lcs_info[target_sy_iden] = {'under_lcs': under_lcs_iden, ++ 'under_lcs_obj': under_lcs, ++ 'sy_obj' : sy1, + 'path_to_under_lcs': path_to_under_lcs_idens} + + return sy_id2under_lcs_info ++ ++ ++def get_synset2sensekeys(wn, target_lemma, pos): ++ """ ++ ++ :param str target_lemma: e.g. 
cat ++ :param str pos: n v a r ++ ++ :rtype: dict ++ :return: mapping from synset identifier -> sensekey ++ ++ """ ++ synset2sensekeys = dict() ++ for synset in wn.synsets(target_lemma, pos): ++ sy_id = synset2identifier(synset, '30') ++ for lemma in synset.lemmas(): ++ if lemma.key().startswith(target_lemma + '%'): ++ synset2sensekeys[sy_id] = lemma.key() ++ ++ return synset2sensekeys +diff --git a/test-lstm.py b/test-lstm.py +index 80d9033..85ab1c5 100644 +--- a/test-lstm.py ++++ b/test-lstm.py +@@ -3,17 +3,20 @@ import tensorflow as tf + from collections import defaultdict + import argparse + import pickle ++from datetime import datetime + + parser = argparse.ArgumentParser(description='Trains meaning embeddings based on precomputed LSTM model') + parser.add_argument('-m', dest='model_path', required=True, help='path to model trained LSTM model') +-# model_path = '/var/scratch/mcpostma/wsd-dynamic-sense-vector/output/lstm-wsd-small' ++# model_path = 'output/lstm-wsd-small' + parser.add_argument('-v', dest='vocab_path', required=True, help='path to LSTM vocabulary') +-#vocab_path = '/var/scratch/mcpostma/wsd-dynamic-sense-vector/output/gigaword.1m-sents-lstm-wsd.index.pkl' ++#vocab_path = 'gigaword.1m-sents-lstm-wsd.index.pkl' + parser.add_argument('-i', dest='input_path', required=True, help='input path with sense annotated sentences') + parser.add_argument('-o',dest='output_path', required=True, help='path where sense embeddings will be stored') + parser.add_argument('-t', dest='max_lines', required=True, help='maximum number of lines you want to train on') + args = parser.parse_args() + ++print('loaded arguments for training meaning embeddings') ++ + def ctx_embd_input(sentence): + """ + given a annotated sentence, return +@@ -39,21 +42,31 @@ def ctx_embd_input(sentence): + return tokens, annotation_indices + + vocab = np.load(args.vocab_path) ++print('loaded vocab') ++ + synset2context_embds = defaultdict(list) +- ++meaning_freqs = defaultdict(int) ++ + with tf.Session() as sess: # your session object + saver = tf.train.import_meta_graph(args.model_path + '.meta', clear_devices=True) + saver.restore(sess, args.model_path) + predicted_context_embs = sess.graph.get_tensor_by_name('Model/predicted_context_embs:0') +- x = sess.graph.get_tensor_by_name('Model/x:0') ++ x = sess.graph.get_tensor_by_name('Model/Placeholder:0') + + with open(args.input_path) as infile: + for counter, line in enumerate(infile): + if counter >= int(args.max_lines): + break ++ if counter % 1000 == 0: ++ print(counter, datetime.now()) + sentence = line.strip() + tokens, annotation_indices = ctx_embd_input(sentence) + for index, synset_id in annotation_indices: ++ ++ #if '_' in synset_id: ++ # base_synset, synset_id = synset_id.split('_') ++ ++ meaning_freqs[synset_id] += 1 + target_id = vocab[''] + sentence_as_ids = [vocab.get(w) or vocab[''] for w in tokens] + sentence_as_ids[index] = target_id +@@ -68,3 +81,6 @@ for synset, embeddings in synset2context_embds.items(): + + with open(args.output_path, 'wb') as outfile: + pickle.dump(synset2avg_embedding, outfile) ++ ++with open(args.output_path + '.freq', 'wb') as outfile: ++ pickle.dump(meaning_freqs, outfile) + + + + + + + + +diff --git a/das5/train-lstm-wsd-full-data-google-model.job b/das5/train-lstm-wsd-full-data-google-model.job +new file mode 100755 +index 0000000..3c35b70 +--- /dev/null ++++ b/das5/train-lstm-wsd-full-data-google-model.job +@@ -0,0 +1,17 @@ ++#!/bin/bash ++#SBATCH --time=72:00:00 ++#SBATCH -C TitanX ++#SBATCH --gres=gpu:1 ++ ++module load 
cuda80/toolkit ++module load cuda80/blas ++module load cuda80 ++module load cuDNN ++ ++echo -n 'Started: ' && date ++ ++python3 -u train-lstm-wsd.py --model google \ ++ --data_path output/gigaword-lstm-wsd \ ++ --save_path output/lstm-wsd-gigaword-google ++ ++echo -n 'Finished: ' && date +diff --git a/das5/train-lstm-wsd-full-data-large-model.job b/das5/train-lstm-wsd-full-data-large-model.job +new file mode 100755 +index 0000000..aca457d +--- /dev/null ++++ b/das5/train-lstm-wsd-full-data-large-model.job +@@ -0,0 +1,18 @@ ++#!/bin/bash ++#SBATCH --time=72:00:00 ++#SBATCH -C TitanX ++#SBATCH --gres=gpu:1 ++ ++module load cuda80/toolkit ++module load cuda80/blas ++module load cuda80 ++module load cuDNN ++ ++echo -n 'Started: ' && date ++ ++python3 -u train-lstm-wsd.py --model large \ ++ --data_path output/gigaword-lstm-wsd \ ++ --save_path output/lstm-wsd-gigaword-large ++ ++echo -n 'Finished: ' && date ++ +\ No newline at end of file + + + + + +diff --git a/train-lstm-wsd-full-data-google-model.job b/train-lstm-wsd-full-data-google-model.job +deleted file mode 100755 +index 3c35b70..0000000 +--- a/train-lstm-wsd-full-data-google-model.job ++++ /dev/null +@@ -1,17 +0,0 @@ +-#!/bin/bash +-#SBATCH --time=72:00:00 +-#SBATCH -C TitanX +-#SBATCH --gres=gpu:1 +- +-module load cuda80/toolkit +-module load cuda80/blas +-module load cuda80 +-module load cuDNN +- +-echo -n 'Started: ' && date +- +-python3 -u train-lstm-wsd.py --model google \ +- --data_path output/gigaword-lstm-wsd \ +- --save_path output/lstm-wsd-gigaword-google +- +-echo -n 'Finished: ' && date +diff --git a/train-lstm-wsd-full-data-large-model.job b/train-lstm-wsd-full-data-large-model.job +deleted file mode 100755 +index aca457d..0000000 +--- a/train-lstm-wsd-full-data-large-model.job ++++ /dev/null +@@ -1,18 +0,0 @@ +-#!/bin/bash +-#SBATCH --time=72:00:00 +-#SBATCH -C TitanX +-#SBATCH --gres=gpu:1 +- +-module load cuda80/toolkit +-module load cuda80/blas +-module load cuda80 +-module load cuDNN +- +-echo -n 'Started: ' && date +- +-python3 -u train-lstm-wsd.py --model large \ +- --data_path output/gigaword-lstm-wsd \ +- --save_path output/lstm-wsd-gigaword-large +- +-echo -n 'Finished: ' && date +- +\ No newline at end of file +diff --git a/train-lstm-wsd.py b/train-lstm-wsd.py +index f6871ee..6131dc9 100644 +--- a/train-lstm-wsd.py ++++ b/train-lstm-wsd.py +@@ -14,149 +14,47 @@ import numpy as np + import tensorflow as tf + from tensorflow.python.client import timeline + import sys +-from model import WSDModelTrain, WSDModelEvaluate, DummyModelTrain ++from model import WSDModel, train_model ++from configs import get_config ++import random + + flags = tf.flags + logging = tf.logging + ++flags.DEFINE_integer("seed", 192, ++ "A random seed to make sure the experiment is repeatable") + flags.DEFINE_string("model", "small", +- "A type of model. Possible options are: small, medium, large, google.") ++ "A type of model. 
Possible options are: small, medium, large, google.") + flags.DEFINE_string("data_path", None, +- "Where the training/test data is stored.") ++ "Where the training/valid data is stored.") ++flags.DEFINE_string("dev_path", '', ++ "Where the valid data is stored, if it cannot be inferred from data_path.") ++flags.DEFINE_string("vocab_path", '', ++ "Where the vocabulary is stored, if it cannot be inferred from data_path.") + flags.DEFINE_string("save_path", None, + "Model output directory.") +-flags.DEFINE_bool("use_fp16", False, +- "Train using 16-bit floats instead of 32bit floats") + flags.DEFINE_bool("trace_timeline", False, + "Trace execution time to find out bottlenecks.") + FLAGS = flags.FLAGS + +- +-def data_type(): +- return tf.float16 if FLAGS.use_fp16 else tf.float32 +- +- +-class SmallConfig(object): +- """Small config.""" +- init_scale = 0.1 +- learning_rate = 0.1 +- max_grad_norm = 5 +- hidden_size = 100 +- max_epoch = 100 +- emb_dims = 10 +- +- +-class MediumConfig(object): +- """Medium config.""" +- init_scale = 0.05 +- learning_rate = 0.1 +- max_grad_norm = 5 +- hidden_size = 200 +- max_epoch = 500 +- emb_dims = 100 +- +- +-class LargeConfig(object): +- """Large config.""" +- init_scale = 0.04 +- learning_rate = 0.1 +- max_grad_norm = 10 +- hidden_size = 512 +- max_epoch = 1000 +- emb_dims = 128 +- +- +-class GoogleConfig(object): +- """Large config.""" +- init_scale = 0.04 +- learning_rate = 0.1 +- max_grad_norm = 5 +- hidden_size = 2048 +- max_epoch = 2000 +- emb_dims = 512 +- +- +-class TestConfig(object): +- """Tiny config, for testing.""" +- init_scale = 0.1 +- learning_rate = 0.1 +- max_grad_norm = 1 +- hidden_size = 2 +- max_epoch = 1 +- batch_size = 20 +- +-def get_config(): +- if FLAGS.model == "small": +- return SmallConfig() +- elif FLAGS.model == "medium": +- return MediumConfig() +- elif FLAGS.model == "large": +- return LargeConfig() +- elif FLAGS.model == "google": +- return GoogleConfig() +- elif FLAGS.model == "test": +- return TestConfig() +- else: +- raise ValueError("Invalid model: %s", FLAGS.model) +- +-def load_data(): +- sys.stderr.write('Loading data...\n') +- full_vocab = np.load(FLAGS.data_path + '.index.pkl') +- train = np.load(FLAGS.data_path + '.train.npz') +- train_batches = [] +- num_batches = len(train.keys()) +- for i in range(num_batches): +- sentences = train['batch%d' %i] +- batch_vocab, inverse = np.unique(sentences, return_inverse=True) +- outputs = inverse.reshape(sentences.shape) +- sys.stderr.write('Batch %d of %d vocab size: %d (%.2f%% of original)\n' +- %(i, num_batches, batch_vocab.size, batch_vocab.size*100.0/len(full_vocab))) +- train_batches.append((sentences, outputs, batch_vocab)) +- dev = np.load(FLAGS.data_path + '.dev.npz') +- sys.stderr.write('Loading data... Done.\n') +- return full_vocab, train_batches, dev['data'], dev['lens'] +- + def main(_): ++ random.seed(FLAGS.seed) ++ np.random.seed(random.randint(0, 10**6)) ++ tf.set_random_seed(random.randint(0, 10**6)) + if not FLAGS.data_path: + raise ValueError("Must set --data_path to the base path of " + "prepared input (e.g. 
output/gigaword)") +- vocab, train_batches, dev_data, dev_lens = load_data() +- target_id = vocab[''] +- config = get_config() +- config.vocab_size = len(vocab) ++ config = get_config(FLAGS) + with tf.Graph().as_default(): + initializer = tf.random_uniform_initializer(-config.init_scale, + config.init_scale) + with tf.variable_scope("Model", reuse=None, initializer=initializer): +- m_train = WSDModelTrain(config, data_type()) +- with tf.variable_scope("Model", reuse=True, initializer=initializer): +- m_evaluate = WSDModelEvaluate(config, data_type()) +- m_train.print_device_placement() +- with tf.Session() as session: +- saver = tf.train.Saver() +- start_time = time.time() +- sys.stdout.write("Initializing variables.... ") +- session.run(tf.global_variables_initializer()) +- sys.stdout.write("Done.\n") +- best_cost = None +- for i in range(config.max_epoch): +- # only turn it on after 5 epochs because first epochs spend time +- # on GPU initialization routines +- if FLAGS.trace_timeline and i == 5: +- m_train.trace_timeline() # start tracing timeline +- print("Epoch #%d:" % (i + 1)) +-# train_cost = 0 # for debugging +- train_cost = m_train.train_epoch(session, train_batches, target_id, verbose=True) +- dev_cost, hit_at_100 = m_evaluate.measure_dev_cost(session, dev_data, dev_lens, target_id) +- print("Epoch #%d finished:" %(i + 1)) +- print("\tTrain cost: %.3f" %train_cost) +- print("\tDev cost: %.3f, hit@100: %.1f%%" %(dev_cost, hit_at_100)) +- if best_cost is None or dev_cost < best_cost: +- best_cost = dev_cost +-# save_start = time.time() +- print("\tSaved best model to %s" %saver.save(session, FLAGS.save_path)) +-# print("\tTime on saving: %f sec" %(time.time()-save_start)) +- print("\tElapsed time: %.1f minutes" %((time.time()-start_time)/60)) ++ m_train = WSDModel(config, optimized=True) ++ with tf.variable_scope("Model", reuse=True): ++ m_evaluate = WSDModel(config, reuse_variables=True) ++# m_train.print_device_placement() # for debugging ++ train_model(m_train, m_evaluate, FLAGS, config) ++ + if FLAGS.trace_timeline: + tl = timeline.Timeline(m_train.run_metadata.step_stats) + ctf = tl.generate_chrome_trace_format() \ No newline at end of file diff --git a/prepare-lstm-wsd.py b/prepare-lstm-wsd.py index 52db01c..2bb3fe6 100644 --- a/prepare-lstm-wsd.py +++ b/prepare-lstm-wsd.py @@ -24,7 +24,7 @@ from random import Random from collections import Counter from utils import progress, count_lines_fast -from configs import preprocessed_gigaword_path, output_dir +from configs import output_dir from version import version dev_sents = 20000 # absolute maximum @@ -34,12 +34,7 @@ vocab_size = 10**6 min_count = 5 -inp_path = preprocessed_gigaword_path -# inp_path = 'preprocessed-data/gigaword_1m-sents.txt' # for debugging -out_dir = os.path.join('preprocessed-data', version) -out_path = os.path.join(out_dir, 'gigaword-for-lstm-wsd') - -special_symbols = ['', '', ''] +special_symbols = ['', '', '', ''] def _build_vocab(filename): sys.stderr.write('Building vocabulary...\n') @@ -76,16 +71,17 @@ def lookup_and_iter_sents(filename, word2id, include_ids=None, exclude_ids=None) words = line.strip().split() yield [word2id.get(word) or unkn_id for word in words] -def pad(sents, max_len, pad_id): - arr = np.empty((len(sents), max_len), dtype=np.int32) +def pad(sents, max_len, pad_id, eos_id): + arr = np.empty((len(sents), max_len+1), dtype=np.int32) arr.fill(pad_id) for i, s in enumerate(sents): arr[i, :len(s)] = s + arr[i, len(s)] = eos_id return arr def pad_batches(inp_path, word2id, include_ids, 
exclude_ids, max_sents=-1): sys.stderr.write('Dividing and padding...\n') - pad_id = word2id[''] + eos_id, pad_id = word2id[''], word2id[''] batches = {} sent_lens = [] curr_max_len = 0 @@ -95,7 +91,7 @@ def pad_batches(inp_path, word2id, include_ids, exclude_ids, max_sents=-1): include_ids, exclude_ids)): new_size = (len(curr_batch)+1) * max(curr_max_len,len(sent)) if new_size > batch_size or (max_sents > 0 and len(curr_batch) >= max_sents): - batches['batch%d' %batch_id] = pad(curr_batch, curr_max_len, pad_id) + batches['batch%d' %batch_id] = pad(curr_batch, curr_max_len, pad_id, eos_id) batches['lens%d' %batch_id] = np.array([len(s) for s in curr_batch], dtype=np.int32) batch_id += 1 curr_max_len = 0 @@ -104,7 +100,7 @@ def pad_batches(inp_path, word2id, include_ids, exclude_ids, max_sents=-1): curr_batch.append(sent) sent_lens.append(len(sent)) if curr_batch: - batches['batch%d' %batch_id] = pad(curr_batch, curr_max_len, pad_id) + batches['batch%d' %batch_id] = pad(curr_batch, curr_max_len, pad_id, eos_id) batches['lens%d' %batch_id] = np.array([len(s) for s in curr_batch], dtype=np.int32) batch_id += 1 # important to count num batches correctly sent_lens = np.array(sent_lens, dtype=np.int32) @@ -150,7 +146,7 @@ def shuffle_and_pad_batches(inp_path, word2id, dev_sent_ids): new_size = (len(curr_batch_lens)+1) * max(curr_max_len,l) if new_size >= batch_size: batches['batch%d' %batch_id] = \ - np.empty((len(curr_batch_lens), max(curr_batch_lens)), dtype=np.int32) + np.empty((len(curr_batch_lens), max(curr_batch_lens)+1), dtype=np.int32) batches['lens%d' %batch_id] = np.array(curr_batch_lens, dtype=np.int32) batch_id += 1 curr_max_len = 0 @@ -160,13 +156,13 @@ def shuffle_and_pad_batches(inp_path, word2id, dev_sent_ids): sent2batch[sent_id] = 'batch%d' %batch_id if curr_batch_lens: batches['batch%d' %batch_id] = \ - np.empty((len(curr_batch_lens), max(curr_batch_lens)), dtype=np.int32) + np.empty((len(curr_batch_lens), max(curr_batch_lens)+1), dtype=np.int32) batches['lens%d' %batch_id] = np.array(curr_batch_lens, dtype=np.int32) batch_id += 1 # important to count num batches correctly sys.stderr.write('Calculating batch shapes... Done.\n') sys.stderr.write('Dividing and padding...\n') - pad_id = word2id[''] + eos_id, pad_id = word2id[''], word2id[''] for i in range(batch_id): batches['batch%d'%i].fill(pad_id) nonpad_count = 0 sent_counter = Counter() @@ -174,7 +170,8 @@ def shuffle_and_pad_batches(inp_path, word2id, dev_sent_ids): assert lens[sent_id] == len(sent) batch_name = sent2batch.get(sent_id) if batch_name is not None: # could be in dev set - batches[batch_name][sent_counter[batch_name],:len(sent)] = sent + batches[batch_name][sent_counter[batch_name], :len(sent)] = sent + batches[batch_name][sent_counter[batch_name], len(sent)] = eos_id nonpad_count += len(sent) sent_counter[batch_name] += 1 # check that we filled all arrays @@ -196,8 +193,7 @@ def shuffle_and_pad_batches(inp_path, word2id, dev_sent_ids): %(lens.mean(), lens.std())) return batches -def run(): - os.makedirs(out_dir, exist_ok=True) +def run(inp_path, out_path): index_path = out_path + '.index.pkl' if os.path.exists(index_path): sys.stderr.write('Reading vocabulary from %s... 
' %index_path) @@ -247,4 +243,9 @@ def run(): np.savez(pc_train_path, **batches) if __name__ == '__main__': - run() + inp_path = 'preprocessed-data/694cb4d/gigaword.txt' + #inp_path = 'preprocessed-data/694cb4d/gigaword_1m-sents.txt' # for debugging + out_dir = os.path.join('preprocessed-data', version) + os.makedirs(out_dir, exist_ok=True) + out_path = os.path.join(out_dir, 'gigaword-for-lstm-wsd') + run(inp_path, out_path) diff --git a/process-gigaword.py b/process-gigaword.py index 7bf2b0d..10416ff 100644 --- a/process-gigaword.py +++ b/process-gigaword.py @@ -2,7 +2,7 @@ import gzip from bs4 import BeautifulSoup import spacy -from configs import gigaword_path, preprocessed_gigaword_path +from configs import gigaword_path import codecs from utils import progress from version import version @@ -32,7 +32,6 @@ def iter_sents(paragraphs): for sent in doc.sents: yield [str(tok).strip() for tok in sent] - # example_file = 'data/gigaword/gigaword_eng_5_d1/data/afp_eng/afp_eng_200112.gz' if __name__ == '__main__': diff --git a/stats_gigaword.py b/stats_gigaword.py index 193b678..a3b7c49 100644 --- a/stats_gigaword.py +++ b/stats_gigaword.py @@ -1,4 +1,4 @@ -from configs import preprocessed_gigaword_path, output_dir +from configs import output_dir from collections import Counter from nltk.stem import WordNetLemmatizer import codecs @@ -9,6 +9,7 @@ token_count = Counter() lemma_count = Counter() wordnet_lemmatizer = WordNetLemmatizer() + preprocessed_gigaword_path = 'preprocessed-data/694cb4d/gigaword.txt' with codecs.open(preprocessed_gigaword_path, 'r', 'utf-8') as f: for line_no, line in enumerate(f): for tok in line.split():
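
A note on the prepare-lstm-wsd.py hunks above: `pad` and `shuffle_and_pad_batches` now allocate one extra column per batch row (`max_len + 1` and `max(curr_batch_lens) + 1`) so that an explicit end-of-sentence id (`eos_id`) is written immediately after the last real token, with the padding id filling the remainder. The sketch below reproduces that padding behaviour in isolation; the function name and the numeric ids are illustrative only (the real ids come from the generated `*.index.pkl` vocabulary), so treat it as a minimal sketch rather than the pipeline code itself.

```python
import numpy as np

def pad_with_eos(sents, max_len, pad_id, eos_id):
    """Pad a list of id-sequences into a (len(sents), max_len + 1) int32 array,
    writing eos_id right after each sentence and pad_id everywhere else."""
    arr = np.empty((len(sents), max_len + 1), dtype=np.int32)
    arr.fill(pad_id)
    for i, s in enumerate(sents):
        arr[i, :len(s)] = s          # the sentence itself
        arr[i, len(s)] = eos_id      # end-of-sentence id marks the true sentence end
    return arr

if __name__ == '__main__':
    PAD, EOS = 0, 1                  # hypothetical ids for the pad and eos symbols
    batch = pad_with_eos([[7, 8, 9], [5, 6]], max_len=3, pad_id=PAD, eos_id=EOS)
    print(batch)
    # [[7 8 9 1]
    #  [5 6 1 0]]
```

Reserving the extra column up front is what the `+1` in the `np.empty(...)` shapes buys: a sentence that exactly fills the old batch width still has room for its end-of-sentence marker, so every padded row is guaranteed to contain one.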