
Commit

add <eos> to prepare-lstm-wsd
Minh Le committed Jan 19, 2018
1 parent 1b491b2 commit 36b6246
Showing 10 changed files with 1,824 additions and 41 deletions.
27 changes: 20 additions & 7 deletions README.md
@@ -104,11 +104,24 @@ This creates a development set for the label propagation:
a) annotated corpus: pwgc
b) unannotated corpus: omsti

#### Reproduce variation experiment
#### Model size experiments

0. `git checkout a453bc1`
1. Pre-process GigaWord into plain text: `sbatch cartesius/process-gigaword.job`
2. More preprocessing to make binary files: `sbatch cartesius/prepare-lstm-wsd.job`
Note that there is some uncertainty about the exact version that produced h2048p512
and h512p128; see `difference-edited.txt` for a comparison with a recent version.

1. h=2048, p=512: `git checkout 354acc1cfdd542142490afe40447cb6f40d2fd7c && ./train-lstm-wsd-full-data-google-model.job`
2. h=512, p=128: `git checkout 354acc1cfdd542142490afe40447cb6f40d2fd7c && ./train-lstm-wsd-full-data-large-model.job`
3. h=256, p=64: see `exp-h256p64.sh` in "stability" section
4. h=100, p=10: see `exp-variation*.job` in "stability" section
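
For a rough sense of how these settings translate into model size, here is a minimal sketch using the parameter-count formula that `compile_results.py` gains in this commit (assuming `DefaultConfig.vocab_size = 10**6 + 3` from `configs.py`; the numbers are computed at run time, not taken from the paper):

```python
# Parameter-count formula from compile_results.py (this commit).
def compute_num_params(vocab_size, p, h):
    return (vocab_size*p*2 +          # input and output embeddings
            p*h + h*h + h +           # input gates
            p*h + h*h + h +           # candidate states
            p*h + h*h + h +           # forget gates
            p*h + h*h + h*h + h +     # output gates
            p*h)                      # context layer

vocab_size = 10**6 + 3                # DefaultConfig.vocab_size
for h, p in [(2048, 512), (512, 128), (256, 64), (100, 10)]:
    print('h=%4d p=%3d: %.0fM parameters'
          % (h, p, compute_num_params(vocab_size, p, h) / 10**6))
```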

#### Reproduce variation/stability experiments

These experiments measure how much performance is affected by the randomness
in training: we train smaller models many times, each time with a different
(but fixed) random seed.
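
Purely as an illustration of what "a different (but fixed) random seed" means here (a hypothetical sketch, not the repository's training code), each run seeds all sources of randomness from its run index:

```python
# Hypothetical sketch: make run i reproducible but distinct from run j.
import random
import numpy as np
import tensorflow as tf

def set_run_seed(run_index, base_seed=42):
    seed = base_seed + run_index
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)  # graph-level seed (TensorFlow 1.x)
    return seed
```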

1. Pre-process GigaWord into plain text: `git checkout 2b0934c && sbatch cartesius/process-gigaword.job`
2. More preprocessing to make binary files: `git checkout a453bc1 && sbatch cartesius/prepare-lstm-wsd.job`
0. `git checkout ce8a024`
1. Run at the same time: `sbatch cartesius/exp-variation1.job` and `sbatch cartesius/exp-variation2.job`
0. `git checkout a74bda6`
@@ -118,18 +131,18 @@ b) unannotated corpus: omsti
2. When everything finishes, do `git checkout 42bc700`
3. Run `sbatch cartesius/exp-variation-score.job`

#### Reproduce optimization experiment
#### Reproduce (training speed) optimization experiment

1. Pre-process GigaWord into plain text (if you haven't done so): `git checkout 2b0934c && sbatch cartesius/process-gigaword.job`
0. `git checkout a74bda6`
1. Pre-process GigaWord into plain text (if you haven't done so): `sbatch cartesius/process-gigaword.job`
2. More preprocessing to make binary files: `sbatch cartesius/prepare-lstm-wsd.job`
3. `git checkout e93fdb2`
4. Run in parallel: `sbatch cartesius/exp-optimization{i}.job` where i=1,2,3,4

#### Data size experiment

1. Pre-process GigaWord into plain text (if you haven't done so): `git checkout 2b0934c && sbatch cartesius/process-gigaword.job`
0. `git checkout a74bda6`
1. Pre-process GigaWord into plain text (if you haven't done so): `sbatch cartesius/process-gigaword.job`
2. More preprocessing to make binary files: `sbatch cartesius/prepare-lstm-wsd.job`
3. `git checkout 4e4a04a`
4. Run `sbatch cartesius/exp-data-size.job {i}` with i="01",10,25,50,75
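
How the subsets are constructed is determined by `exp-data-size.job`; purely as a hypothetical illustration (not the job script's actual logic), a percentage subset of the preprocessed corpus could be cut like this:

```python
# Hypothetical sketch: keep the first `percent` % of sentences of the
# plain-text corpus as a smaller training set.
def make_subset(src_path, dst_path, percent):
    with open(src_path) as src:
        sentences = src.readlines()
    n_keep = int(len(sentences) * percent / 100)
    with open(dst_path, 'w') as dst:
        dst.writelines(sentences[:n_keep])

# e.g. make_subset('preprocessed-data/gigaword.txt', 'gigaword-25.txt', 25)
```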
31 changes: 23 additions & 8 deletions compile_results.py
@@ -11,6 +11,8 @@
import matplotlib.pyplot as plt
from sklearn.linear_model.base import LinearRegression
import configs
from configs import SmallConfig, H256P64, LargeConfig, GoogleConfig,\
DefaultConfig

ModelPerformance = namedtuple('ModelPerformance', ['name', 'semcor', 'mun'])

@@ -118,17 +120,20 @@ def draw_data_size_vs_performance_chart():
print('Extrapolated data size:')
print(lr.predict([[0.75], [0.8]]))

def compute_num_params(vocab_size, p, h):
return (vocab_size*p*2 + # input and output embeddings
p*h + h*h + h + # input gates
p*h + h*h + h + # candidate states
p*h + h*h + h + # forget gates
p*h + h*h + h*h + h + # output gates
p*h # context layer
)

def draw_capacity_vs_performance_chart():
''' Create figure for paper '''
df = pd.read_csv('output/capacity_vs_performance.csv')
vocab_size = configs.DefaultConfig.vocab_size
df['num_params'] = (vocab_size*df['p']*2 + # input and output embeddings
df['p']*df['h'] + df['h']*df['h'] + df['h'] + # input gates
df['p']*df['h'] + df['h']*df['h'] + df['h'] + # candidate states
df['p']*df['h'] + df['h']*df['h'] + df['h'] + # forget gates
df['p']*df['h'] + df['h']*df['h'] + df['h']*df['h'] + df['h'] + # output gates
df['p']*df['h'] # context layer
)
df['num_params'] = compute_num_params(vocab_size, df['p'], df['h'])
print(df)
with PdfPages('output/capacity_vs_performance.pdf') as pdf:
semcor_handle, = plt.plot(df['num_params'], df['semcor'], label='SemEval13 (T: SemCor)')
@@ -149,8 +154,18 @@ def draw_capacity_vs_performance_chart():
# print('Extrapolated data size:')
# print(lr.predict([[0.75], [0.8]]))

def report_model_params():
v = DefaultConfig.vocab_size
models = [SmallConfig, H256P64, LargeConfig, GoogleConfig]
table = [['%.0fM' %(v/10**6), m.emb_dims, m.hidden_size,
"%.0fM" %(compute_num_params(v, m.emb_dims, m.hidden_size)/10**6)]
for m in models]
df = pd.DataFrame(table, columns=['Vocab.', 'p', 'h', '#params'])
print(df.to_latex(index=False))

if __name__ == '__main__':
# report_wsd_performance_vs_data_size()
# variation_experiment()
# draw_data_size_vs_performance_chart()
draw_capacity_vs_performance_chart()
# draw_capacity_vs_performance_chart()
report_model_params()
1 change: 0 additions & 1 deletion configs.py
@@ -5,7 +5,6 @@
os.makedirs(output_dir, exist_ok=True)

gigaword_path = 'data/gigaword'
preprocessed_gigaword_path = os.path.join('preprocessed-data', 'gigaword.txt')

class DefaultConfig(object):
vocab_size = 10**6 + 3
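
For reference, the model classes that `compile_results.py` imports (`SmallConfig`, `H256P64`, `LargeConfig`, `GoogleConfig`) presumably subclass `DefaultConfig` and carry the `emb_dims`/`hidden_size` attributes used in `report_model_params`. A hedged sketch, with sizes inferred from the README rather than read from `configs.py` itself:

```python
# Hypothetical sketch: sizes inferred from the README, not verified
# against the actual configs.py in this commit.
class GoogleConfig(DefaultConfig):
    emb_dims = 512      # p
    hidden_size = 2048  # h

class LargeConfig(DefaultConfig):
    emb_dims = 128
    hidden_size = 512
```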
4 changes: 4 additions & 0 deletions das5/prepare-lstm-wsd.job
@@ -0,0 +1,4 @@
#!/bin/bash
#SBATCH --time=24:00:00

python3 -u prepare-lstm-wsd.py
1 change: 0 additions & 1 deletion das5/process-gigaword.job 100644 → 100755
@@ -2,4 +2,3 @@
#SBATCH --time=24:00:00

python3 -u process-gigaword.py
python3 -u prepare-lstm-wsd.py
37 changes: 35 additions & 2 deletions diary-minh2.md
@@ -260,6 +260,39 @@ job:
[minhle@gcn40 wsd-dynamic-sense-vector]$ tail -f output/`python3 version.py`/exp-variation-score.job.out
...

## Thu 7 Dec

Worked on the paper. Data size 25% experiment has finished. Tried to run the
newest evaluation script on it but no GPU machine is available yet.

42bc700..0a0d02b master -> origin/master
First, rewinding head to replay your work on top of it...
Fast-forwarded master to 0a0d02b4538dcf7322742e32e367a90ec1055899.
[minhle@int2 wsd-dynamic-sense-vector]$ sbatch cartesius/eval-data-size.job
Submitted batch job 3820439

## Fri 19 Jan

Meeting with Jacopo+Marten. Jacopo would like to retrain everything with an `<eos>`
token. Checked everything again. There doesn't seem to be any big difference (that
I don't know of) between the version that produced the currently reported results
and a more recent version. Let's try.

Added `<eos>` to the preparation script.

I'll also need to add it to the evaluation scripts.

>>> from collections import Counter
>>> c = Counter()
>>> with open('preprocessed-data/694cb4d/gigaword.txt') as f:
...     for sent in f:
...         c[sent.strip().split()[-1]] += 1
>>> c.most_common(10)
[('.', 141537114), ("''", 7066432), ('"', 7015844), (')', 2214057), ('_', 1964897), (':', 1605763), ('?', 1486728), ('--', 774285), ("'", 648803), ('...', 434971)]
>>> total = sum(c.values())
>>> [(tok, cnt/total) for tok, cnt in c.most_common(10)]
[('.', 0.8052320716307731), ("''", 0.04020230113211145), ('"', 0.039914496196088396), (')', 0.012596199360251295), ('_', 0.01117867983270516), (':', 0.00913549690604858), ('?', 0.008458283721904037), ('--', 0.004405057422483782), ("'", 0.00369116600590189), ('...', 0.002474634316970099)]
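
For context, a minimal sketch (hypothetical; not the actual change to `prepare-lstm-wsd.py`) of what appending an `<eos>` marker to every sentence amounts to:

```python
# Hypothetical sketch: append an explicit end-of-sentence token, since the
# count above shows only ~80% of sentences end with '.'.
EOS = '<eos>'

def add_eos(in_path, out_path):
    with open(in_path) as src, open(out_path, 'w') as dst:
        for sent in src:
            dst.write(sent.rstrip('\n') + ' ' + EOS + '\n')
```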


TODO: docker image

@@ -271,8 +304,8 @@
5. [x] for 25 Oct: list of all experiments for the reproduction paper
6. [x] save models of every epoch (instead of only the best one)
6. [x] Read more about label propagation (Zhou et al. 2004)
7. [ ] Hyperparameter tuning of label propagation
7. [x] Hyperparameter tuning of label propagation
8. [ ] Training creates a lot of models, how to reduce it?
9. [ ] Send code+data to Jacopo to run
10. [ ] Polish the paper
10. [x] Polish the arxiv paper
11. [x] Use the same dev set for different sizes of the data.
