
First commit
Dinghan authored and Dinghan committed May 24, 2018
1 parent bec6539 commit 7c4c3b1
Showing 42 changed files with 13,113 additions and 0 deletions.
Binary file added .DS_Store
Binary file not shown.
43 changes: 43 additions & 0 deletions README.md.txt
@@ -0,0 +1,43 @@
# SWEM (Simple Word-Embedding-based Models)

This repository contains the source code necessary to reproduce the results presented in the ACL 2018 paper *Baseline Needs More Love: On Simple Word-Embedding-Based Models and Associated Pooling Mechanisms*.

## Prerequisites
* CUDA, cuDNN
* TensorFlow (version > 1.0); we used TensorFlow 1.5.
* Run `pip install -r requirements.txt` to install the remaining Python requirements.


## Run
* Run: `python eval_dbpedia_emb.py` for ontology classification on the DBpedia dataset
* Run: `python eval_snli_emb.py` for natural language inference on the SNLI dataset
* Run: `python eval_yahoo_emb.py` for topic categorization on the Yahoo! Answers dataset
* Options: hyperparameters can be set by editing the `option` class in any of the three files above (see the sketch at the end of this section):

- `opt.emb_size`: word embedding dimension.
- `opt.drop_rate`: keep rate of the dropout layer.
- `opt.lr`: learning rate.
- `opt.batch_size`: batch size.
- `opt.H_dis`: dimension of the last hidden layer.

* On a K80 GPU machine, training takes roughly 3 minutes per epoch and about 5 epochs to converge on DBpedia, about 50 seconds per epoch and 20 epochs on SNLI, and about 4 minutes per epoch and 5 epochs on the Yahoo dataset.
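
A minimal sketch of what such an `option` class might look like (the field names come from the list above; the values shown are illustrative placeholders, not the settings used in the paper):

```python
class Options(object):
    """Illustrative defaults only; edit the real `option` class in the eval_*.py scripts."""
    emb_size = 300    # word embedding dimension
    drop_rate = 0.8   # keep rate of the dropout layer
    lr = 1e-3         # learning rate
    batch_size = 128  # batch size
    H_dis = 300       # dimension of the last hidden layer

opt = Options()
print(opt.emb_size, opt.lr)  # 300 0.001
```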

## Data
* Download the datasets from the links below and put them into a `data` folder:
* Ontology classification: [DBpedia (591MB)](https://drive.google.com/open?id=1EBmMise0LQu0QpO7T4a32WMFuTxAb6T0)
* Natural language inference: [SNLI (101MB)](https://drive.google.com/open?id=1M13UswHThZYt-ARrHg6sN7Dlel-d6BB3)
* Topic categorization: [Yahoo (1.7GB)](https://drive.google.com/open?id=1Dorz_CWZkHHpojVS4K4YUEhhczVLQgRc)


## Citation
Please cite our paper if it helps with your research:

```latex
@inproceedings{Shen2018Baseline,
title={Baseline Needs More Love: On Simple Word-Embedding-Based Models and Associated Pooling Mechanisms},
author={Shen, Dinghan and Wang, Guoyin and Wang, Wenlin and Renqiang Min, Martin and Su, Qinliang and Zhang, Yizhe and Li, Chunyuan and Henao, Ricardo and Carin, Lawrence},
booktitle={ACL},
year={2018}
}
```
For any questions or suggestions, feel free to contact dinghan.shen@duke.edu.
325 changes: 325 additions & 0 deletions data_utils.py
@@ -0,0 +1,325 @@
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Utilities for downloading data from WMT, tokenizing, vocabularies."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import gzip
import os
import re
import tarfile
import pdb

from six.moves import urllib

from tensorflow.python.platform import gfile
import tensorflow as tf

# Special vocabulary symbols - we always put them at the start.
_PAD = b"_PAD"
_GO = b"_GO"
_EOS = b"_EOS"
_UNK = b"_UNK"
_START_VOCAB = [_PAD, _GO, _EOS, _UNK]

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

# Regular expressions used to tokenize.
_WORD_SPLIT = re.compile(b"([.,!?\"':;)(])")
_DIGIT_RE = re.compile(br"\d")

# URLs for WMT data.
_WMT_ENFR_TRAIN_URL = "http://www.statmt.org/wmt10/training-giga-fren.tar"
_WMT_ENFR_DEV_URL = "http://www.statmt.org/wmt15/dev-v2.tgz"


def maybe_download(directory, filename, url):
"""Download filename from url unless it's already in directory."""
if not os.path.exists(directory):
print("Creating directory %s" % directory)
os.mkdir(directory)
filepath = os.path.join(directory, filename)
if not os.path.exists(filepath):
print("Downloading %s to %s" % (url, filepath))
filepath, _ = urllib.request.urlretrieve(url, filepath)
statinfo = os.stat(filepath)
print("Successfully downloaded", filename, statinfo.st_size, "bytes")
return filepath
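
# Illustrative usage (a sketch; the "data" directory and the archive name are only
# examples, and _WMT_ENFR_DEV_URL is the URL constant defined above):
#   dev_archive = maybe_download("data", "dev-v2.tgz", _WMT_ENFR_DEV_URL)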


def gunzip_file(gz_path, new_path):
"""Unzips from gz_path into new_path."""
print("Unpacking %s to %s" % (gz_path, new_path))
with gzip.open(gz_path, "rb") as gz_file:
with open(new_path, "wb") as new_file:
for line in gz_file:
new_file.write(line)


def get_wmt_enfr_train_set(directory):
"""Download the WMT en-fr training corpus to directory unless it's there."""
train_path = os.path.join(directory, "giga-fren.release2.fixed")
if not (gfile.Exists(train_path +".fr") and gfile.Exists(train_path +".en")):
corpus_file = maybe_download(directory, "training-giga-fren.tar",
_WMT_ENFR_TRAIN_URL)
print("Extracting tar file %s" % corpus_file)
with tarfile.open(corpus_file, "r") as corpus_tar:
corpus_tar.extractall(directory)
gunzip_file(train_path + ".fr.gz", train_path + ".fr")
gunzip_file(train_path + ".en.gz", train_path + ".en")
return train_path


def get_wmt_enfr_dev_set(directory):
"""Download the WMT en-fr training corpus to directory unless it's there."""
dev_name = "newstest2013"
dev_path = os.path.join(directory, dev_name)
if not (gfile.Exists(dev_path + ".fr") and gfile.Exists(dev_path + ".en")):
dev_file = maybe_download(directory, "dev-v2.tgz", _WMT_ENFR_DEV_URL)
print("Extracting tgz file %s" % dev_file)
with tarfile.open(dev_file, "r:gz") as dev_tar:
fr_dev_file = dev_tar.getmember("dev/" + dev_name + ".fr")
en_dev_file = dev_tar.getmember("dev/" + dev_name + ".en")
fr_dev_file.name = dev_name + ".fr" # Extract without "dev/" prefix.
en_dev_file.name = dev_name + ".en"
dev_tar.extract(fr_dev_file, directory)
dev_tar.extract(en_dev_file, directory)
return dev_path


def basic_tokenizer(sentence):
"""Very basic tokenizer: split the sentence into a list of tokens."""
words = []
for space_separated_fragment in sentence.strip().split():
words.extend(_WORD_SPLIT.split(space_separated_fragment))
return [w for w in words if w]
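
# Illustrative example: basic_tokenizer splits on whitespace and then on the
# punctuation captured by _WORD_SPLIT, keeping the punctuation as separate tokens:
#   basic_tokenizer(b"I have a dog.")  ->  [b"I", b"have", b"a", b"dog", b"."]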


def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
tokenizer=None, normalize_digits=True):
"""Create vocabulary file (if it does not exist yet) from data file.
Data file is assumed to contain one sentence per line. Each sentence is
tokenized and digits are normalized (if normalize_digits is set).
Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
We write it to vocabulary_path in a one-token-per-line format, so that later
the token on the first line gets id=0, the token on the second line gets id=1, and so on.
Args:
vocabulary_path: path where the vocabulary will be created.
data_path: data file that will be used to create vocabulary.
max_vocabulary_size: limit on the size of the created vocabulary.
tokenizer: a function to use to tokenize each data sentence;
if None, basic_tokenizer will be used.
normalize_digits: Boolean; if true, all digits are replaced by 0s.
"""
if not gfile.Exists(vocabulary_path):
print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
vocab = {}
#pdb.set_trace()
with gfile.GFile(data_path, mode="rb") as f:
counter = 0
for line in f:
counter += 1
if counter % 100000 == 0:
print(" processing line %d" % counter)
line = tf.compat.as_bytes(line)
tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
for w in tokens:
word = _DIGIT_RE.sub(b"0", w) if normalize_digits else w
if word in vocab:
vocab[word] += 1
else:
vocab[word] = 1
vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
if len(vocab_list) > max_vocabulary_size:
vocab_list = vocab_list[:max_vocabulary_size]
with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
for w in vocab_list:
vocab_file.write(w + b"\n")
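
# Illustrative usage (hypothetical paths): build a 10,000-token vocabulary from a
# one-sentence-per-line corpus; the file is only written if it does not already exist.
#   create_vocabulary("data/vocab10000.txt", "data/train.txt", 10000)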


def initialize_vocabulary(vocabulary_path):
"""Initialize vocabulary from file.
We assume the vocabulary is stored one-item-per-line, so a file:
dog
cat
will result in a vocabulary {b"dog": 0, b"cat": 1}, and this function will
also return the reversed vocabulary {0: b"dog", 1: b"cat"}.
Args:
vocabulary_path: path to the file containing the vocabulary.
Returns:
a pair: the vocabulary (a dictionary mapping tokens to integer ids), and
the reversed vocabulary (a dictionary mapping integer ids back to tokens).
Raises:
ValueError: if the provided vocabulary_path does not exist.
"""
if gfile.Exists(vocabulary_path):
rev_vocab = []
with gfile.GFile(vocabulary_path, mode="rb") as f:
rev_vocab.extend(f.readlines())
rev_vocab = [tf.compat.as_bytes(line.strip()) for line in rev_vocab]
vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
idtoword = dict([(y, x) for (y, x) in enumerate(rev_vocab)])
return vocab, idtoword
else:
raise ValueError("Vocabulary file %s not found.", vocabulary_path)


def sentence_to_token_ids(sentence, vocabulary,
tokenizer=None, normalize_digits=True):
"""Convert a string to list of integers representing token-ids.
For example, a sentence "I have a dog" may become tokenized into
["I", "have", "a", "dog"] and with vocabulary {"I": 1, "have": 2,
"a": 4, "dog": 7"} this function will return [1, 2, 4, 7].
Args:
sentence: the sentence in bytes format to convert to token-ids.
vocabulary: a dictionary mapping tokens to integers.
tokenizer: a function to use to tokenize each sentence;
if None, basic_tokenizer will be used.
normalize_digits: Boolean; if true, all digits are replaced by 0s.
Returns:
a list of integers, the token-ids for the sentence.
"""

if tokenizer:
words = tokenizer(sentence)
else:
words = basic_tokenizer(sentence)
if not normalize_digits:
return [vocabulary.get(w, UNK_ID) for w in words]
# Normalize digits by 0 before looking words up in the vocabulary.
return [vocabulary.get(_DIGIT_RE.sub(b"0", w), UNK_ID) for w in words]


def data_to_token_ids(data_path, target_path, vocabulary_path,
tokenizer=None, normalize_digits=True):
"""Tokenize data file and turn into token-ids using given vocabulary file.
This function loads data line-by-line from data_path, calls the above
sentence_to_token_ids, and saves the result to target_path. See comment
for sentence_to_token_ids on the details of token-ids format.
Args:
data_path: path to the data file in one-sentence-per-line format.
target_path: path where the file with token-ids will be created.
vocabulary_path: path to the vocabulary file.
tokenizer: a function to use to tokenize each sentence;
if None, basic_tokenizer will be used.
normalize_digits: Boolean; if true, all digits are replaced by 0s.
"""
if not gfile.Exists(target_path):
print("Tokenizing data in %s" % data_path)
vocab, _ = initialize_vocabulary(vocabulary_path)
with gfile.GFile(data_path, mode="rb") as data_file:
with gfile.GFile(target_path, mode="w") as tokens_file:
counter = 0
for line in data_file:
counter += 1
if counter % 100000 == 0:
print(" tokenizing line %d" % counter)
token_ids = sentence_to_token_ids(tf.compat.as_bytes(line), vocab,
tokenizer, normalize_digits)
tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")


def prepare_wmt_data(data_dir, en_vocabulary_size, fr_vocabulary_size, tokenizer=None):
"""Get WMT data into data_dir, create vocabularies and tokenize data.
Args:
data_dir: directory in which the data sets will be stored.
en_vocabulary_size: size of the English vocabulary to create and use.
fr_vocabulary_size: size of the French vocabulary to create and use.
tokenizer: a function to use to tokenize each data sentence;
if None, basic_tokenizer will be used.
Returns:
A tuple of 6 elements:
(1) path to the token-ids for English training data-set,
(2) path to the token-ids for French training data-set,
(3) path to the token-ids for English development data-set,
(4) path to the token-ids for French development data-set,
(5) path to the English vocabulary file,
(6) path to the French vocabulary file.
"""
# Get wmt data to the specified directory.
train_path = get_wmt_enfr_train_set(data_dir)
dev_path = get_wmt_enfr_dev_set(data_dir)

from_train_path = train_path + ".en"
to_train_path = train_path + ".fr"
from_dev_path = dev_path + ".en"
to_dev_path = dev_path + ".fr"
return prepare_data(data_dir, from_train_path, to_train_path, from_dev_path, to_dev_path, en_vocabulary_size,
fr_vocabulary_size, tokenizer)


def prepare_data(data_dir, from_train_path, to_train_path, from_dev_path, to_dev_path, from_vocabulary_size,
to_vocabulary_size, tokenizer=None):
"""Preapre all necessary files that are required for the training.
Args:
data_dir: directory in which the data sets will be stored.
from_train_path: path to the file that includes "from" training samples.
to_train_path: path to the file that includes "to" training samples.
from_dev_path: path to the file that includes "from" dev samples.
to_dev_path: path to the file that includes "to" dev samples.
from_vocabulary_size: size of the "from language" vocabulary to create and use.
to_vocabulary_size: size of the "to language" vocabulary to create and use.
tokenizer: a function to use to tokenize each data sentence;
if None, basic_tokenizer will be used.
Returns:
A tuple of 6 elements:
(1) path to the token-ids for "from language" training data-set,
(2) path to the token-ids for "to language" training data-set,
(3) path to the token-ids for "from language" development data-set,
(4) path to the token-ids for "to language" development data-set,
(5) path to the "from language" vocabulary file,
(6) path to the "to language" vocabulary file.
"""
# Create vocabularies of the appropriate sizes.
to_vocab_path = os.path.join(data_dir, "vocab%d.to" % to_vocabulary_size)
from_vocab_path = os.path.join(data_dir, "vocab%d.from" % from_vocabulary_size)
create_vocabulary(to_vocab_path, to_train_path, to_vocabulary_size, tokenizer)
create_vocabulary(from_vocab_path, from_train_path, from_vocabulary_size, tokenizer)

# Create token ids for the training data.
to_train_ids_path = to_train_path + (".ids%d" % to_vocabulary_size)
from_train_ids_path = from_train_path + (".ids%d" % from_vocabulary_size)
data_to_token_ids(to_train_path, to_train_ids_path, to_vocab_path, tokenizer)
data_to_token_ids(from_train_path, from_train_ids_path, from_vocab_path, tokenizer)

# Create token ids for the development data.
to_dev_ids_path = to_dev_path + (".ids%d" % to_vocabulary_size)
from_dev_ids_path = from_dev_path + (".ids%d" % from_vocabulary_size)
data_to_token_ids(to_dev_path, to_dev_ids_path, to_vocab_path, tokenizer)
data_to_token_ids(from_dev_path, from_dev_ids_path, from_vocab_path, tokenizer)

return (from_train_ids_path, to_train_ids_path,
from_dev_ids_path, to_dev_ids_path,
from_vocab_path, to_vocab_path)
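
# Illustrative end-to-end usage (hypothetical paths and vocabulary sizes): given
# parallel "from"/"to" training and dev files, build both vocabularies and the
# corresponding token-id files, and get back all six output paths.
#   (from_train_ids, to_train_ids, from_dev_ids, to_dev_ids,
#    from_vocab, to_vocab) = prepare_data("data", "data/train.en", "data/train.fr",
#                                         "data/dev.en", "data/dev.fr", 40000, 40000)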
