From 166448c0aa6cb54657f677fbba8b1572bb744c09 Mon Sep 17 00:00:00 2001
From: Jiangzhou He
Date: Thu, 24 Jul 2025 18:42:51 -0700
Subject: [PATCH 1/9] example: setup example `custom_output_files` w/o custom target yet

---
 examples/custom_output_files/.env | 2 +
 examples/custom_output_files/README.md | 62 ++
 .../custom_output_files/files/1706.03762v7.md | 354 ++++++++++++
 .../custom_output_files/files/1810.04805v2.md | 530 ++++++++++++++++++
 examples/custom_output_files/files/rfc8259.md | 362 ++++++++++++
 examples/custom_output_files/main.py | 29 +
 examples/custom_output_files/pyproject.toml | 13 +
 7 files changed, 1352 insertions(+)
 create mode 100644 examples/custom_output_files/.env
 create mode 100644 examples/custom_output_files/README.md
 create mode 100644 examples/custom_output_files/files/1706.03762v7.md
 create mode 100644 examples/custom_output_files/files/1810.04805v2.md
 create mode 100644 examples/custom_output_files/files/rfc8259.md
 create mode 100644 examples/custom_output_files/main.py
 create mode 100644 examples/custom_output_files/pyproject.toml

diff --git a/examples/custom_output_files/.env b/examples/custom_output_files/.env
new file mode 100644
index 00000000..335f3060
--- /dev/null
+++ b/examples/custom_output_files/.env
@@ -0,0 +1,2 @@
+# Postgres database address for cocoindex
+COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex
diff --git a/examples/custom_output_files/README.md b/examples/custom_output_files/README.md
new file mode 100644
index 00000000..dcdf6e0c
--- /dev/null
+++ b/examples/custom_output_files/README.md
@@ -0,0 +1,62 @@
+# Build text embedding and semantic search 🔍
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/cocoindex-io/cocoindex/blob/main/examples/text_embedding/Text_Embedding.ipynb)
+[![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex)
+
+In this example, we build an indexing flow that embeds text from local markdown files, and then query the index.
+
+We appreciate a star ⭐ at [CocoIndex GitHub](https://github.com/cocoindex-io/cocoindex) if this is helpful.
+
+## Steps
+đŸŒ± A detailed step-by-step tutorial can be found here: [Get Started Documentation](https://cocoindex.io/docs/getting_started/quickstart)
+
+### Indexing Flow
+
+1. Ingest a list of local files.
+2. For each file, perform chunking (recursive split) and then embedding.
+3. Save the embeddings and the metadata in Postgres with PGVector.
+
+### Query
+We match user-provided text against the index with a SQL query, reusing the embedding operation from the indexing flow. A conceptual sketch of the whole flow is included at the end of this README.
+
+## Prerequisite
+
+[Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one.
+
+## Run
+
+Install dependencies:
+
+```bash
+pip install -e .
+```
+
+Setup:
+
+```bash
+cocoindex setup main.py
+```
+
+Update index:
+
+```bash
+cocoindex update main.py
+```
+
+Run:
+
+```bash
+python main.py
+```
+
+## CocoInsight
+
+You can use CocoInsight (currently in free beta) to troubleshoot the index generation and understand the data lineage of the pipeline.
+It connects to your local CocoIndex server with zero pipeline data retention. Run the following command to start CocoInsight:
+
+```bash
+cocoindex server -ci main.py
+```
+
+Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight).
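+
+## Conceptual sketch
+
+The snippet below is **not** this example's `main.py` and does not use the CocoIndex API; it is a minimal, self-contained illustration of what the flow does conceptually (the embedding model name and the chunking heuristic are assumptions made for this sketch): chunk the local markdown files, embed each chunk, and answer a query by vector similarity. In the real flow, CocoIndex stores the embeddings in Postgres with PGVector and keeps them incrementally up to date.
+
+```python
+# Conceptual sketch only: chunk -> embed -> query by cosine similarity, all in memory.
+from pathlib import Path
+
+import numpy as np
+from sentence_transformers import SentenceTransformer  # assumed available for this sketch
+
+model = SentenceTransformer("all-MiniLM-L6-v2")  # illustrative model choice
+
+
+def chunk(text: str, max_chars: int = 1200) -> list[str]:
+    """Naive split on blank lines, merging paragraphs up to max_chars per chunk."""
+    chunks, current = [], ""
+    for para in text.split("\n\n"):
+        if current and len(current) + len(para) > max_chars:
+            chunks.append(current)
+            current = ""
+        current += para + "\n\n"
+    if current.strip():
+        chunks.append(current)
+    return chunks
+
+
+# 1. Ingest local markdown files; 2. chunk and embed each file.
+texts = [c for path in Path("files").glob("*.md") for c in chunk(path.read_text())]
+embeddings = model.encode(texts, normalize_embeddings=True)
+
+# 3. The real flow exports these vectors to Postgres/PGVector; here we query in memory.
+query = model.encode(["What is scaled dot-product attention?"], normalize_embeddings=True)[0]
+scores = embeddings @ query
+for i in np.argsort(-scores)[:3]:
+    print(f"{scores[i]:.3f}  {texts[i][:80]!r}")
+```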
diff --git a/examples/custom_output_files/files/1706.03762v7.md b/examples/custom_output_files/files/1706.03762v7.md new file mode 100644 index 00000000..665a1972 --- /dev/null +++ b/examples/custom_output_files/files/1706.03762v7.md @@ -0,0 +1,354 @@ +Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works. + +# Attention Is All You Need + +Ashish Vaswani∗ Google Brain avaswani@google.com + +Llion Jones∗ Google Research llion@google.com + +Noam Shazeer∗ Google Brain noam@google.com + +Aidan N. Gomez∗ † University of Toronto aidan@cs.toronto.edu + +Niki Parmar∗ Google Research nikip@google.com + +Jakob Uszkoreit∗ Google Research usz@google.com + +Ɓukasz Kaiser∗ Google Brain lukaszkaiser@google.com + +Illia Polosukhin∗ ‡ illia.polosukhin@gmail.com + +### Abstract + +The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 Englishto-German translation task, improving over the existing best results, including ensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data. + +∗Equal contribution. Listing order is random. Jakob proposed replacing RNNs with self-attention and started the effort to evaluate this idea. Ashish, with Illia, designed and implemented the first Transformer models and has been crucially involved in every aspect of this work. Noam proposed scaled dot-product attention, multi-head attention and the parameter-free position representation and became the other person involved in nearly every detail. Niki designed, implemented, tuned and evaluated countless model variants in our original codebase and tensor2tensor. Llion also experimented with novel model variants, was responsible for our initial codebase, and efficient inference and visualizations. Lukasz and Aidan spent countless long days designing various parts of and implementing tensor2tensor, replacing our earlier codebase, greatly improving results and massively accelerating our research. + +†Work performed while at Google Brain. + +‡Work performed while at Google Research. + +### 1 Introduction + +Recurrent neural networks, long short-term memory [\[13\]](#page-10-0) and gated recurrent [\[7\]](#page-10-1) neural networks in particular, have been firmly established as state of the art approaches in sequence modeling and transduction problems such as language modeling and machine translation [\[35,](#page-11-0) [2,](#page-9-0) [5\]](#page-10-2). 
Numerous efforts have since continued to push the boundaries of recurrent language models and encoder-decoder architectures [\[38,](#page-11-1) [24,](#page-10-3) [15\]](#page-10-4). + +Recurrent models typically factor computation along the symbol positions of the input and output sequences. Aligning the positions to steps in computation time, they generate a sequence of hidden states ht, as a function of the previous hidden state ht−1 and the input for position t. This inherently sequential nature precludes parallelization within training examples, which becomes critical at longer sequence lengths, as memory constraints limit batching across examples. Recent work has achieved significant improvements in computational efficiency through factorization tricks [\[21\]](#page-10-5) and conditional computation [\[32\]](#page-11-2), while also improving model performance in case of the latter. The fundamental constraint of sequential computation, however, remains. + +Attention mechanisms have become an integral part of compelling sequence modeling and transduction models in various tasks, allowing modeling of dependencies without regard to their distance in the input or output sequences [\[2,](#page-9-0) [19\]](#page-10-6). In all but a few cases [\[27\]](#page-11-3), however, such attention mechanisms are used in conjunction with a recurrent network. + +In this work we propose the Transformer, a model architecture eschewing recurrence and instead relying entirely on an attention mechanism to draw global dependencies between input and output. The Transformer allows for significantly more parallelization and can reach a new state of the art in translation quality after being trained for as little as twelve hours on eight P100 GPUs. + +## 2 Background + +The goal of reducing sequential computation also forms the foundation of the Extended Neural GPU [\[16\]](#page-10-7), ByteNet [\[18\]](#page-10-8) and ConvS2S [\[9\]](#page-10-9), all of which use convolutional neural networks as basic building block, computing hidden representations in parallel for all input and output positions. In these models, the number of operations required to relate signals from two arbitrary input or output positions grows in the distance between positions, linearly for ConvS2S and logarithmically for ByteNet. This makes it more difficult to learn dependencies between distant positions [\[12\]](#page-10-10). In the Transformer this is reduced to a constant number of operations, albeit at the cost of reduced effective resolution due to averaging attention-weighted positions, an effect we counteract with Multi-Head Attention as described in section [3.2.](#page-2-0) + +Self-attention, sometimes called intra-attention is an attention mechanism relating different positions of a single sequence in order to compute a representation of the sequence. Self-attention has been used successfully in a variety of tasks including reading comprehension, abstractive summarization, textual entailment and learning task-independent sentence representations [\[4,](#page-9-1) [27,](#page-11-3) [28,](#page-11-4) [22\]](#page-10-11). + +End-to-end memory networks are based on a recurrent attention mechanism instead of sequencealigned recurrence and have been shown to perform well on simple-language question answering and language modeling tasks [\[34\]](#page-11-5). 
+ +To the best of our knowledge, however, the Transformer is the first transduction model relying entirely on self-attention to compute representations of its input and output without using sequencealigned RNNs or convolution. In the following sections, we will describe the Transformer, motivate self-attention and discuss its advantages over models such as [\[17,](#page-10-12) [18\]](#page-10-8) and [\[9\]](#page-10-9). + +### 3 Model Architecture + +Most competitive neural sequence transduction models have an encoder-decoder structure [\[5,](#page-10-2) [2,](#page-9-0) [35\]](#page-11-0). Here, the encoder maps an input sequence of symbol representations (x1, ..., xn) to a sequence of continuous representations z = (z1, ..., zn). Given z, the decoder then generates an output sequence (y1, ..., ym) of symbols one element at a time. At each step the model is auto-regressive [\[10\]](#page-10-13), consuming the previously generated symbols as additional input when generating the next. + +Figure 1: The Transformer - model architecture. + +The Transformer follows this overall architecture using stacked self-attention and point-wise, fully connected layers for both the encoder and decoder, shown in the left and right halves of Figure [1,](#page-2-1) respectively. + +### 3.1 Encoder and Decoder Stacks + +Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, positionwise fully connected feed-forward network. We employ a residual connection [\[11\]](#page-10-14) around each of the two sub-layers, followed by layer normalization [\[1\]](#page-9-2). That is, the output of each sub-layer is LayerNorm(x + Sublayer(x)), where Sublayer(x) is the function implemented by the sub-layer itself. To facilitate these residual connections, all sub-layers in the model, as well as the embedding layers, produce outputs of dimension dmodel = 512. + +Decoder: The decoder is also composed of a stack of N = 6 identical layers. In addition to the two sub-layers in each encoder layer, the decoder inserts a third sub-layer, which performs multi-head attention over the output of the encoder stack. Similar to the encoder, we employ residual connections around each of the sub-layers, followed by layer normalization. We also modify the self-attention sub-layer in the decoder stack to prevent positions from attending to subsequent positions. This masking, combined with fact that the output embeddings are offset by one position, ensures that the predictions for position i can depend only on the known outputs at positions less than i. + +### 3.2 Attention + +An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is computed as a weighted sum + +Figure 2: (left) Scaled Dot-Product Attention. (right) Multi-Head Attention consists of several attention layers running in parallel. + +of the values, where the weight assigned to each value is computed by a compatibility function of the query with the corresponding key. + +### 3.2.1 Scaled Dot-Product Attention + +We call our particular attention "Scaled Dot-Product Attention" (Figure [2)](#page-3-0). The input consists of queries and keys of dimension dk, and values of dimension dv. We compute the dot products of the query with all keys, divide each by √ dk, and apply a softmax function to obtain the weights on the values. 
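+
+As a minimal NumPy sketch of the computation just described (scaling the query–key dot products by √dk, applying a softmax, and taking a weighted sum of the values; the function name, array shapes, and toy inputs are illustrative assumptions, not code from the paper):
+
+```python
+# Illustrative scaled dot-product attention; mirrors Equation (1) below.
+import numpy as np
+
+
+def scaled_dot_product_attention(Q, K, V):
+    """Q: (n_q, d_k), K: (n_k, d_k), V: (n_k, d_v) -> output: (n_q, d_v)."""
+    d_k = Q.shape[-1]
+    scores = Q @ K.T / np.sqrt(d_k)                  # query/key compatibilities
+    scores -= scores.max(axis=-1, keepdims=True)     # numerical stability
+    weights = np.exp(scores)
+    weights /= weights.sum(axis=-1, keepdims=True)   # softmax over the keys
+    return weights @ V                               # weighted sum of the values
+
+
+rng = np.random.default_rng(0)
+Q, K, V = rng.normal(size=(2, 64)), rng.normal(size=(5, 64)), rng.normal(size=(5, 64))
+print(scaled_dot_product_attention(Q, K, V).shape)   # (2, 64)
+```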
+ +In practice, we compute the attention function on a set of queries simultaneously, packed together into a matrix Q. The keys and values are also packed together into matrices K and V . We compute the matrix of outputs as: + +$$\text{Attention}(Q, K, V) = \text{softmax}(\frac{QK^T}{\sqrt{d_k}})V \tag{l}$$ + +The two most commonly used attention functions are additive attention [\[2\]](#page-9-0), and dot-product (multiplicative) attention. Dot-product attention is identical to our algorithm, except for the scaling factor of √ 1 dk . Additive attention computes the compatibility function using a feed-forward network with a single hidden layer. While the two are similar in theoretical complexity, dot-product attention is much faster and more space-efficient in practice, since it can be implemented using highly optimized matrix multiplication code. + +While for small values of dk the two mechanisms perform similarly, additive attention outperforms dot product attention without scaling for larger values of dk [\[3\]](#page-9-3). We suspect that for large values of dk, the dot products grow large in magnitude, pushing the softmax function into regions where it has extremely small gradients [4](#page-3-1) . To counteract this effect, we scale the dot products by √ 1 dk . + +### 3.2.2 Multi-Head Attention + +Instead of performing a single attention function with dmodel-dimensional keys, values and queries, we found it beneficial to linearly project the queries, keys and values h times with different, learned linear projections to dk, dk and dv dimensions, respectively. On each of these projected versions of queries, keys and values we then perform the attention function in parallel, yielding dv-dimensional + +4To illustrate why the dot products get large, assume that the components of q and k are independent random variables with mean 0 and variance 1. Then their dot product, q · k = Pdk i=1 qiki, has mean 0 and variance dk. + +output values. These are concatenated and once again projected, resulting in the final values, as depicted in Figure [2.](#page-3-0) + +Multi-head attention allows the model to jointly attend to information from different representation subspaces at different positions. With a single attention head, averaging inhibits this. + +$$\begin{aligned} \text{MultiHead}(Q, K, V) &= \text{Concat}(\text{head}_1, \dots, \text{head}_h) W^O \\ \text{where } \text{head}_i &= \text{Attention}(QW_i^Q, KW_i^K, VW_i^V) \end{aligned}$$ + +Where the projections are parameter matrices W Q i ∈ R dmodel×dk , W K i ∈ R dmodel×dk , WV i ∈ R dmodel×dv and WO ∈ R hdv×dmodel . + +In this work we employ h = 8 parallel attention layers, or heads. For each of these we use dk = dv = dmodel/h = 64. Due to the reduced dimension of each head, the total computational cost is similar to that of single-head attention with full dimensionality. + +### 3.2.3 Applications of Attention in our Model + +The Transformer uses multi-head attention in three different ways: + +- In "encoder-decoder attention" layers, the queries come from the previous decoder layer, and the memory keys and values come from the output of the encoder. This allows every position in the decoder to attend over all positions in the input sequence. This mimics the typical encoder-decoder attention mechanisms in sequence-to-sequence models such as [\[38,](#page-11-1) [2,](#page-9-0) [9\]](#page-10-9). +- The encoder contains self-attention layers. 
In a self-attention layer all of the keys, values and queries come from the same place, in this case, the output of the previous layer in the encoder. Each position in the encoder can attend to all positions in the previous layer of the encoder.
+- Similarly, self-attention layers in the decoder allow each position in the decoder to attend to all positions in the decoder up to and including that position. We need to prevent leftward information flow in the decoder to preserve the auto-regressive property. We implement this inside of scaled dot-product attention by masking out (setting to −∞) all values in the input of the softmax which correspond to illegal connections. See Figure [2.](#page-3-0)
+
+### 3.3 Position-wise Feed-Forward Networks
+
+In addition to attention sub-layers, each of the layers in our encoder and decoder contains a fully connected feed-forward network, which is applied to each position separately and identically. This consists of two linear transformations with a ReLU activation in between.
+
+$$\text{FFN}(x) = \max(0, xW_1 + b_1)W_2 + b_2 \tag{2}$$
+
+While the linear transformations are the same across different positions, they use different parameters from layer to layer. Another way of describing this is as two convolutions with kernel size 1. The dimensionality of input and output is dmodel = 512, and the inner-layer has dimensionality dff = 2048.
+
+### 3.4 Embeddings and Softmax
+
+Similarly to other sequence transduction models, we use learned embeddings to convert the input tokens and output tokens to vectors of dimension dmodel. We also use the usual learned linear transformation and softmax function to convert the decoder output to predicted next-token probabilities. In our model, we share the same weight matrix between the two embedding layers and the pre-softmax linear transformation, similar to [\[30\]](#page-11-6). In the embedding layers, we multiply those weights by √ dmodel.
+
+Table 1: Maximum path lengths, per-layer complexity and minimum number of sequential operations for different layer types. n is the sequence length, d is the representation dimension, k is the kernel size of convolutions and r the size of the neighborhood in restricted self-attention.
+
+| Layer Type | Complexity per Layer | Sequential Operations | Maximum Path Length |
+|-----------------------------|----------------------|-----------------------|---------------------|
+| Self-Attention | O(nÂČ · d) | O(1) | O(1) |
+| Recurrent | O(n · dÂČ) | O(n) | O(n) |
+| Convolutional | O(k · n · dÂČ) | O(1) | O(logk(n)) |
+| Self-Attention (restricted) | O(r · n · d) | O(1) | O(n/r) |
+
+### 3.5 Positional Encoding
+
+Since our model contains no recurrence and no convolution, in order for the model to make use of the order of the sequence, we must inject some information about the relative or absolute position of the tokens in the sequence. To this end, we add "positional encodings" to the input embeddings at the bottoms of the encoder and decoder stacks. The positional encodings have the same dimension dmodel as the embeddings, so that the two can be summed. There are many choices of positional encodings, learned and fixed [\[9\]](#page-10-9).
+
+In this work, we use sine and cosine functions of different frequencies:
+
+$$PE_{(pos,2i)} = \sin(pos/10000^{2i/d_{\text{model}}})$$
+
+$$PE_{(pos,2i+1)} = \cos(pos/10000^{2i/d_{\text{model}}})$$
+
+where pos is the position and i is the dimension. That is, each dimension of the positional encoding corresponds to a sinusoid. The wavelengths form a geometric progression from 2π to 10000 · 2π. We chose this function because we hypothesized it would allow the model to easily learn to attend by relative positions, since for any fixed offset k, PEpos+k can be represented as a linear function of PEpos.
+
+We also experimented with using learned positional embeddings [\[9\]](#page-10-9) instead, and found that the two versions produced nearly identical results (see Table [3](#page-8-0) row (E)). We chose the sinusoidal version because it may allow the model to extrapolate to sequence lengths longer than the ones encountered during training.
+
+### 4 Why Self-Attention
+
+In this section we compare various aspects of self-attention layers to the recurrent and convolutional layers commonly used for mapping one variable-length sequence of symbol representations (x1, ..., xn) to another sequence of equal length (z1, ..., zn), with xi, zi ∈ R d, such as a hidden layer in a typical sequence transduction encoder or decoder. Motivating our use of self-attention we consider three desiderata.
+
+One is the total computational complexity per layer. Another is the amount of computation that can be parallelized, as measured by the minimum number of sequential operations required.
+
+The third is the path length between long-range dependencies in the network. Learning long-range dependencies is a key challenge in many sequence transduction tasks. One key factor affecting the ability to learn such dependencies is the length of the paths forward and backward signals have to traverse in the network. The shorter these paths between any combination of positions in the input and output sequences, the easier it is to learn long-range dependencies [\[12\]](#page-10-10). Hence we also compare the maximum path length between any two input and output positions in networks composed of the different layer types.
+
+As noted in Table [1,](#page-5-0) a self-attention layer connects all positions with a constant number of sequentially executed operations, whereas a recurrent layer requires O(n) sequential operations. In terms of computational complexity, self-attention layers are faster than recurrent layers when the sequence length n is smaller than the representation dimensionality d, which is most often the case with sentence representations used by state-of-the-art models in machine translations, such as word-piece [\[38\]](#page-11-1) and byte-pair [\[31\]](#page-11-7) representations.
To improve computational performance for tasks involving very long sequences, self-attention could be restricted to considering only a neighborhood of size r in the input sequence centered around the respective output position. This would increase the maximum path length to O(n/r). We plan to investigate this approach further in future work. + +A single convolutional layer with kernel width k < n does not connect all pairs of input and output positions. Doing so requires a stack of O(n/k) convolutional layers in the case of contiguous kernels, or O(logk(n)) in the case of dilated convolutions [\[18\]](#page-10-8), increasing the length of the longest paths between any two positions in the network. Convolutional layers are generally more expensive than recurrent layers, by a factor of k. Separable convolutions [\[6\]](#page-10-15), however, decrease the complexity considerably, to O(k · n · d + n · d 2 ). Even with k = n, however, the complexity of a separable convolution is equal to the combination of a self-attention layer and a point-wise feed-forward layer, the approach we take in our model. + +As side benefit, self-attention could yield more interpretable models. We inspect attention distributions from our models and present and discuss examples in the appendix. Not only do individual attention heads clearly learn to perform different tasks, many appear to exhibit behavior related to the syntactic and semantic structure of the sentences. + +### 5 Training + +This section describes the training regime for our models. + +### 5.1 Training Data and Batching + +We trained on the standard WMT 2014 English-German dataset consisting of about 4.5 million sentence pairs. Sentences were encoded using byte-pair encoding [\[3\]](#page-9-3), which has a shared sourcetarget vocabulary of about 37000 tokens. For English-French, we used the significantly larger WMT 2014 English-French dataset consisting of 36M sentences and split tokens into a 32000 word-piece vocabulary [\[38\]](#page-11-1). Sentence pairs were batched together by approximate sequence length. Each training batch contained a set of sentence pairs containing approximately 25000 source tokens and 25000 target tokens. + +### 5.2 Hardware and Schedule + +We trained our models on one machine with 8 NVIDIA P100 GPUs. For our base models using the hyperparameters described throughout the paper, each training step took about 0.4 seconds. We trained the base models for a total of 100,000 steps or 12 hours. For our big models,(described on the bottom line of table [3)](#page-8-0), step time was 1.0 seconds. The big models were trained for 300,000 steps (3.5 days). + +### 5.3 Optimizer + +We used the Adam optimizer [\[20\]](#page-10-16) with ÎČ1 = 0.9, ÎČ2 = 0.98 and Ï” = 10−9 . We varied the learning rate over the course of training, according to the formula: + +$$lrate = d_{\text{model}}^{-0.5} \cdot \min(step\_num^{-0.5}, step\_num \cdot warmup\_steps^{-1.5}) \tag{3}$$ + +This corresponds to increasing the learning rate linearly for the first warmup_steps training steps, and decreasing it thereafter proportionally to the inverse square root of the step number. We used warmup_steps = 4000. 
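+
+As a minimal sketch, the schedule in Equation (3) can be computed directly (the function name and the step-counting convention are illustrative assumptions):
+
+```python
+# Learning-rate schedule of Eq. (3): linear warmup, then inverse-square-root decay.
+def transformer_lrate(step: int, d_model: int = 512, warmup_steps: int = 4000) -> float:
+    step = max(step, 1)  # guard against step 0
+    return d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)
+
+
+print(transformer_lrate(100))      # still warming up
+print(transformer_lrate(4000))     # peak, ~7.0e-4 for d_model = 512
+print(transformer_lrate(100_000))  # decayed, ~1.4e-4
+```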
+ +### 5.4 Regularization + +We employ three types of regularization during training: + + + +| | BLEU | | | Training Cost (FLOPs) | | | +|---------------------------------|-------|-------|------------|-----------------------|--|--| +| Model | EN-DE | EN-FR | EN-DE | EN-FR | | | +| ByteNet [18] | 23.75 | | | | | | +| Deep-Att + PosUnk [39] | | 39.2 | | 1.0 · 1020 | | | +| GNMT + RL [38] | 24.6 | 39.92 | 2.3 · 1019 | 1.4 · 1020 | | | +| ConvS2S [9] | 25.16 | 40.46 | 9.6 · 1018 | 1.5 · 1020 | | | +| MoE [32] | 26.03 | 40.56 | 2.0 · 1019 | 1.2 · 1020 | | | +| Deep-Att + PosUnk Ensemble [39] | | 40.4 | | 8.0 · 1020 | | | +| GNMT + RL Ensemble [38] | 26.30 | 41.16 | 1.8 · 1020 | 1.1 · 1021 | | | +| ConvS2S Ensemble [9] | 26.36 | 41.29 | 7.7 · 1019 | 1.2 · 1021 | | | +| Transformer (base model) | 27.3 | 38.1 | | 3.3 · 1018 | | | +| Transformer (big) | 28.4 | 41.8 | | 2.3 · 1019 | | | + +Table 2: The Transformer achieves better BLEU scores than previous state-of-the-art models on the English-to-German and English-to-French newstest2014 tests at a fraction of the training cost. + +Residual Dropout We apply dropout [\[33\]](#page-11-9) to the output of each sub-layer, before it is added to the sub-layer input and normalized. In addition, we apply dropout to the sums of the embeddings and the positional encodings in both the encoder and decoder stacks. For the base model, we use a rate of Pdrop = 0.1. + +Label Smoothing During training, we employed label smoothing of value Ï”ls = 0.1 [\[36\]](#page-11-10). This hurts perplexity, as the model learns to be more unsure, but improves accuracy and BLEU score. + +### 6 Results + +### 6.1 Machine Translation + +On the WMT 2014 English-to-German translation task, the big transformer model (Transformer (big) in Table [2)](#page-7-0) outperforms the best previously reported models (including ensembles) by more than 2.0 BLEU, establishing a new state-of-the-art BLEU score of 28.4. The configuration of this model is listed in the bottom line of Table [3.](#page-8-0) Training took 3.5 days on 8 P100 GPUs. Even our base model surpasses all previously published models and ensembles, at a fraction of the training cost of any of the competitive models. + +On the WMT 2014 English-to-French translation task, our big model achieves a BLEU score of 41.0, outperforming all of the previously published single models, at less than 1/4 the training cost of the previous state-of-the-art model. The Transformer (big) model trained for English-to-French used dropout rate Pdrop = 0.1, instead of 0.3. + +For the base models, we used a single model obtained by averaging the last 5 checkpoints, which were written at 10-minute intervals. For the big models, we averaged the last 20 checkpoints. We used beam search with a beam size of 4 and length penalty α = 0.6 [\[38\]](#page-11-1). These hyperparameters were chosen after experimentation on the development set. We set the maximum output length during inference to input length + 50, but terminate early when possible [\[38\]](#page-11-1). + +Table [2](#page-7-0) summarizes our results and compares our translation quality and training costs to other model architectures from the literature. We estimate the number of floating point operations used to train a model by multiplying the training time, the number of GPUs used, and an estimate of the sustained single-precision floating-point capacity of each GPU [5](#page-7-1) . 
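+
+As a worked instance of this estimate, using the 9.5 TFLOPS figure the authors assume for the P100 (footnote 5 below), the big model's 3.5 days of training on 8 GPUs gives
+
+$$3.5 \times 86400\,\text{s} \times 8\,\text{GPUs} \times 9.5 \times 10^{12}\,\text{FLOPS} \approx 2.3 \times 10^{19}\,\text{FLOPs},$$
+
+which matches the Transformer (big) training cost listed in Table [2](#page-7-0).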
+ +### 6.2 Model Variations + +To evaluate the importance of different components of the Transformer, we varied our base model in different ways, measuring the change in performance on English-to-German translation on the + +5We used values of 2.8, 3.7, 6.0 and 9.5 TFLOPS for K80, K40, M40 and P100, respectively. + +Table 3: Variations on the Transformer architecture. Unlisted values are identical to those of the base model. All metrics are on the English-to-German translation development set, newstest2013. Listed perplexities are per-wordpiece, according to our byte-pair encoding, and should not be compared to per-word perplexities. + +| | | | | | | | train | PPL | BLEU | params | | | | +|------|-------------------------------------------|--------|------|----|-----|-----|-------|------|------|--------|-------|-------|------| +| | N | dmodel | dff | h | dk | dv | Pdrop | Ï”ls | | steps | (dev) | (dev) | ×106 | +| base | 6 | 512 | 2048 | 8 | 64 | 64 | 0.1 | 0.1 | 100K | 4.92 | 25.8 | 65 | | +| | | | | 1 | 512 | 512 | | | | 5.29 | 24.9 | | | +| | | | | 4 | 128 | 128 | | | | 5.00 | 25.5 | | | +| (A) | | | | 16 | 32 | 32 | | | | 4.91 | 25.8 | | | +| | | | | 32 | 16 | 16 | | | | 5.01 | 25.4 | | | +| (B) | | | | | 16 | | | | | 5.16 | 25.1 | 58 | | +| | | | | | 32 | | | | | 5.01 | 25.4 | 60 | | +| | 2 | | | | | | | | | 6.11 | 23.7 | 36 | | +| | 4 | | | | | | | | | 5.19 | 25.3 | 50 | | +| | 8 | | | | | | | | | 4.88 | 25.5 | 80 | | +| (C) | | 256 | | | 32 | 32 | | | | 5.75 | 24.5 | 28 | | +| | | 1024 | | | 128 | 128 | | | | 4.66 | 26.0 | 168 | | +| | | | 1024 | | | | | | | 5.12 | 25.4 | 53 | | +| | | | 4096 | | | | | | | 4.75 | 26.2 | 90 | | +| (D) | | | | | | | 0.0 | | | 5.77 | 24.6 | | | +| | | | | | | | 0.2 | | | 4.95 | 25.5 | | | +| | | | | | | | | 0.0 | | 4.67 | 25.3 | | | +| | | | | | | | | 0.2 | | 5.47 | 25.7 | | | +| (E) | positional embedding instead of sinusoids | | | | | | 4.92 | 25.7 | | | | | | +| big | 6 | 1024 | 4096 | 16 | | | 0.3 | | 300K | 4.33 | 26.4 | 213 | | + +development set, newstest2013. We used beam search as described in the previous section, but no checkpoint averaging. We present these results in Table [3.](#page-8-0) + +In Table [3](#page-8-0) rows (A), we vary the number of attention heads and the attention key and value dimensions, keeping the amount of computation constant, as described in Section [3.2.2.](#page-3-2) While single-head attention is 0.9 BLEU worse than the best setting, quality also drops off with too many heads. + +In Table [3](#page-8-0) rows (B), we observe that reducing the attention key size dk hurts model quality. This suggests that determining compatibility is not easy and that a more sophisticated compatibility function than dot product may be beneficial. We further observe in rows (C) and (D) that, as expected, bigger models are better, and dropout is very helpful in avoiding over-fitting. In row (E) we replace our sinusoidal positional encoding with learned positional embeddings [\[9\]](#page-10-9), and observe nearly identical results to the base model. + +### 6.3 English Constituency Parsing + +To evaluate if the Transformer can generalize to other tasks we performed experiments on English constituency parsing. This task presents specific challenges: the output is subject to strong structural constraints and is significantly longer than the input. Furthermore, RNN sequence-to-sequence models have not been able to attain state-of-the-art results in small-data regimes [\[37\]](#page-11-11). 
+ +We trained a 4-layer transformer with dmodel = 1024 on the Wall Street Journal (WSJ) portion of the Penn Treebank [\[25\]](#page-11-12), about 40K training sentences. We also trained it in a semi-supervised setting, using the larger high-confidence and BerkleyParser corpora from with approximately 17M sentences [\[37\]](#page-11-11). We used a vocabulary of 16K tokens for the WSJ only setting and a vocabulary of 32K tokens for the semi-supervised setting. + +We performed only a small number of experiments to select the dropout, both attention and residual (section [5.4)](#page-6-0), learning rates and beam size on the Section 22 development set, all other parameters remained unchanged from the English-to-German base translation model. During inference, we + + + +| Parser | Training | WSJ 23 F1 | +|-------------------------------------|--------------------------|-----------| +| Vinyals & Kaiser el al. (2014) [37] | WSJ only, discriminative | 88.3 | +| Petrov et al. (2006) [29] | WSJ only, discriminative | 90.4 | +| Zhu et al. (2013) [40] | WSJ only, discriminative | 90.4 | +| Dyer et al. (2016) [8] | WSJ only, discriminative | 91.7 | +| Transformer (4 layers) | WSJ only, discriminative | 91.3 | +| Zhu et al. (2013) [40] | semi-supervised | 91.3 | +| Huang & Harper (2009) [14] | semi-supervised | 91.3 | +| McClosky et al. (2006) [26] | semi-supervised | 92.1 | +| Vinyals & Kaiser el al. (2014) [37] | semi-supervised | 92.1 | +| Transformer (4 layers) | semi-supervised | 92.7 | +| Luong et al. (2015) [23] | multi-task | 93.0 | +| Dyer et al. (2016) [8] | generative | 93.3 | + +Table 4: The Transformer generalizes well to English constituency parsing (Results are on Section 23 of WSJ) + +increased the maximum output length to input length + 300. We used a beam size of 21 and α = 0.3 for both WSJ only and the semi-supervised setting. + +Our results in Table [4](#page-9-4) show that despite the lack of task-specific tuning our model performs surprisingly well, yielding better results than all previously reported models with the exception of the Recurrent Neural Network Grammar [\[8\]](#page-10-17). + +In contrast to RNN sequence-to-sequence models [\[37\]](#page-11-11), the Transformer outperforms the Berkeley-Parser [\[29\]](#page-11-13) even when training only on the WSJ training set of 40K sentences. + +### 7 Conclusion + +In this work, we presented the Transformer, the first sequence transduction model based entirely on attention, replacing the recurrent layers most commonly used in encoder-decoder architectures with multi-headed self-attention. + +For translation tasks, the Transformer can be trained significantly faster than architectures based on recurrent or convolutional layers. On both WMT 2014 English-to-German and WMT 2014 English-to-French translation tasks, we achieve a new state of the art. In the former task our best model outperforms even all previously reported ensembles. + +We are excited about the future of attention-based models and plan to apply them to other tasks. We plan to extend the Transformer to problems involving input and output modalities other than text and to investigate local, restricted attention mechanisms to efficiently handle large inputs and outputs such as images, audio and video. Making generation less sequential is another research goals of ours. 
+ +The code we used to train and evaluate our models is available at [https://github.com/](https://github.com/tensorflow/tensor2tensor) [tensorflow/tensor2tensor](https://github.com/tensorflow/tensor2tensor). + +Acknowledgements We are grateful to Nal Kalchbrenner and Stephan Gouws for their fruitful comments, corrections and inspiration. + +### References + +- [1] Jimmy Lei Ba, Jamie Ryan Kiros, and Geoffrey E Hinton. Layer normalization. *arXiv preprint [arXiv:1607.06450](http://arxiv.org/abs/1607.06450)*, 2016. +- [2] Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. Neural machine translation by jointly learning to align and translate. *CoRR*, abs/1409.0473, 2014. +- [3] Denny Britz, Anna Goldie, Minh-Thang Luong, and Quoc V. Le. Massive exploration of neural machine translation architectures. *CoRR*, abs/1703.03906, 2017. +- [4] Jianpeng Cheng, Li Dong, and Mirella Lapata. Long short-term memory-networks for machine reading. *arXiv preprint [arXiv:1601.06733](http://arxiv.org/abs/1601.06733)*, 2016. +- [5] Kyunghyun Cho, Bart van Merrienboer, Caglar Gulcehre, Fethi Bougares, Holger Schwenk, and Yoshua Bengio. Learning phrase representations using rnn encoder-decoder for statistical machine translation. *CoRR*, abs/1406.1078, 2014. +- [6] Francois Chollet. Xception: Deep learning with depthwise separable convolutions. *arXiv preprint [arXiv:1610.02357](http://arxiv.org/abs/1610.02357)*, 2016. +- [7] Junyoung Chung, Çaglar GĂŒlçehre, Kyunghyun Cho, and Yoshua Bengio. Empirical evaluation of gated recurrent neural networks on sequence modeling. *CoRR*, abs/1412.3555, 2014. +- [8] Chris Dyer, Adhiguna Kuncoro, Miguel Ballesteros, and Noah A. Smith. Recurrent neural network grammars. In *Proc. of NAACL*, 2016. +- [9] Jonas Gehring, Michael Auli, David Grangier, Denis Yarats, and Yann N. Dauphin. Convolutional sequence to sequence learning. *arXiv preprint [arXiv:1705.03122v](http://arxiv.org/abs/1705.03122)2*, 2017. +- [10] Alex Graves. Generating sequences with recurrent neural networks. *arXiv preprint [arXiv:1308.0850](http://arxiv.org/abs/1308.0850)*, 2013. +- [11] Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. Deep residual learning for image recognition. In *Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition*, pages 770–778, 2016. +- [12] Sepp Hochreiter, Yoshua Bengio, Paolo Frasconi, and JĂŒrgen Schmidhuber. Gradient flow in recurrent nets: the difficulty of learning long-term dependencies, 2001. +- [13] Sepp Hochreiter and JĂŒrgen Schmidhuber. Long short-term memory. *Neural computation*, 9(8):1735–1780, 1997. +- [14] Zhongqiang Huang and Mary Harper. Self-training PCFG grammars with latent annotations across languages. In *Proceedings of the 2009 Conference on Empirical Methods in Natural Language Processing*, pages 832–841. ACL, August 2009. +- [15] Rafal Jozefowicz, Oriol Vinyals, Mike Schuster, Noam Shazeer, and Yonghui Wu. Exploring the limits of language modeling. *arXiv preprint [arXiv:1602.02410](http://arxiv.org/abs/1602.02410)*, 2016. +- [16] Ɓukasz Kaiser and Samy Bengio. Can active memory replace attention? In *Advances in Neural Information Processing Systems, (NIPS)*, 2016. +- [17] Ɓukasz Kaiser and Ilya Sutskever. Neural GPUs learn algorithms. In *International Conference on Learning Representations (ICLR)*, 2016. +- [18] Nal Kalchbrenner, Lasse Espeholt, Karen Simonyan, Aaron van den Oord, Alex Graves, and Koray Kavukcuoglu. Neural machine translation in linear time. 
*arXiv preprint [arXiv:1610.10099v](http://arxiv.org/abs/1610.10099)2*, 2017. +- [19] Yoon Kim, Carl Denton, Luong Hoang, and Alexander M. Rush. Structured attention networks. In *International Conference on Learning Representations*, 2017. +- [20] Diederik Kingma and Jimmy Ba. Adam: A method for stochastic optimization. In *ICLR*, 2015. +- [21] Oleksii Kuchaiev and Boris Ginsburg. Factorization tricks for LSTM networks. *arXiv preprint [arXiv:1703.10722](http://arxiv.org/abs/1703.10722)*, 2017. +- [22] Zhouhan Lin, Minwei Feng, Cicero Nogueira dos Santos, Mo Yu, Bing Xiang, Bowen Zhou, and Yoshua Bengio. A structured self-attentive sentence embedding. *arXiv preprint [arXiv:1703.03130](http://arxiv.org/abs/1703.03130)*, 2017. +- [23] Minh-Thang Luong, Quoc V. Le, Ilya Sutskever, Oriol Vinyals, and Lukasz Kaiser. Multi-task sequence to sequence learning. *arXiv preprint [arXiv:1511.06114](http://arxiv.org/abs/1511.06114)*, 2015. +- [24] Minh-Thang Luong, Hieu Pham, and Christopher D Manning. Effective approaches to attentionbased neural machine translation. *arXiv preprint [arXiv:1508.04025](http://arxiv.org/abs/1508.04025)*, 2015. +- [25] Mitchell P Marcus, Mary Ann Marcinkiewicz, and Beatrice Santorini. Building a large annotated corpus of english: The penn treebank. *Computational linguistics*, 19(2):313–330, 1993. +- [26] David McClosky, Eugene Charniak, and Mark Johnson. Effective self-training for parsing. In *Proceedings of the Human Language Technology Conference of the NAACL, Main Conference*, pages 152–159. ACL, June 2006. +- [27] Ankur Parikh, Oscar TĂ€ckström, Dipanjan Das, and Jakob Uszkoreit. A decomposable attention model. In *Empirical Methods in Natural Language Processing*, 2016. +- [28] Romain Paulus, Caiming Xiong, and Richard Socher. A deep reinforced model for abstractive summarization. *arXiv preprint [arXiv:1705.04304](http://arxiv.org/abs/1705.04304)*, 2017. +- [29] Slav Petrov, Leon Barrett, Romain Thibaux, and Dan Klein. Learning accurate, compact, and interpretable tree annotation. In *Proceedings of the 21st International Conference on Computational Linguistics and 44th Annual Meeting of the ACL*, pages 433–440. ACL, July 2006. +- [30] Ofir Press and Lior Wolf. Using the output embedding to improve language models. *arXiv preprint [arXiv:1608.05859](http://arxiv.org/abs/1608.05859)*, 2016. +- [31] Rico Sennrich, Barry Haddow, and Alexandra Birch. Neural machine translation of rare words with subword units. *arXiv preprint [arXiv:1508.07909](http://arxiv.org/abs/1508.07909)*, 2015. +- [32] Noam Shazeer, Azalia Mirhoseini, Krzysztof Maziarz, Andy Davis, Quoc Le, Geoffrey Hinton, and Jeff Dean. Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. *arXiv preprint [arXiv:1701.06538](http://arxiv.org/abs/1701.06538)*, 2017. +- [33] Nitish Srivastava, Geoffrey E Hinton, Alex Krizhevsky, Ilya Sutskever, and Ruslan Salakhutdinov. Dropout: a simple way to prevent neural networks from overfitting. *Journal of Machine Learning Research*, 15(1):1929–1958, 2014. +- [34] Sainbayar Sukhbaatar, Arthur Szlam, Jason Weston, and Rob Fergus. End-to-end memory networks. In C. Cortes, N. D. Lawrence, D. D. Lee, M. Sugiyama, and R. Garnett, editors, *Advances in Neural Information Processing Systems 28*, pages 2440–2448. Curran Associates, Inc., 2015. +- [35] Ilya Sutskever, Oriol Vinyals, and Quoc VV Le. Sequence to sequence learning with neural networks. In *Advances in Neural Information Processing Systems*, pages 3104–3112, 2014. 
+- [36] Christian Szegedy, Vincent Vanhoucke, Sergey Ioffe, Jonathon Shlens, and Zbigniew Wojna. Rethinking the inception architecture for computer vision. *CoRR*, abs/1512.00567, 2015. +- [37] Vinyals & Kaiser, Koo, Petrov, Sutskever, and Hinton. Grammar as a foreign language. In *Advances in Neural Information Processing Systems*, 2015. +- [38] Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. Google's neural machine translation system: Bridging the gap between human and machine translation. *arXiv preprint [arXiv:1609.08144](http://arxiv.org/abs/1609.08144)*, 2016. +- [39] Jie Zhou, Ying Cao, Xuguang Wang, Peng Li, and Wei Xu. Deep recurrent models with fast-forward connections for neural machine translation. *CoRR*, abs/1606.04199, 2016. +- [40] Muhua Zhu, Yue Zhang, Wenliang Chen, Min Zhang, and Jingbo Zhu. Fast and accurate shift-reduce constituent parsing. In *Proceedings of the 51st Annual Meeting of the ACL (Volume 1: Long Papers)*, pages 434–443. ACL, August 2013. + +#### Attention Visualizations **Input-Input Layer5** + +Figure 3: An example of the attention mechanism following long-distance dependencies in the encoder self-attention in layer 5 of 6. Many of the attention heads attend to a distant dependency of the verb 'making', completing the phrase 'making...more difficult'. Attentions here shown only for the word 'making'. Different colors represent different heads. Best viewed in color. + +**Input-Input Layer5** + +Figure 4: Two attention heads, also in layer 5 of 6, apparently involved in anaphora resolution. Top: Full attentions for head 5. Bottom: Isolated attentions from just the word 'its' for attention heads 5 and 6. Note that the attentions are very sharp for this word. + +**Input-Input Layer5** + +Figure 5: Many of the attention heads exhibit behaviour that seems related to the structure of the sentence. We give two such examples above, from two different heads from the encoder self-attention at layer 5 of 6. The heads clearly learned to perform different tasks. diff --git a/examples/custom_output_files/files/1810.04805v2.md b/examples/custom_output_files/files/1810.04805v2.md new file mode 100644 index 00000000..112540fa --- /dev/null +++ b/examples/custom_output_files/files/1810.04805v2.md @@ -0,0 +1,530 @@ +# BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding + +Jacob Devlin Ming-Wei Chang Kenton Lee Kristina Toutanova + +Google AI Language + +{jacobdevlin,mingweichang,kentonl,kristout}@google.com + +### Abstract + +We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models [(Peters et al.,](#page-10-0) [2018a;](#page-10-0) [Rad](#page-10-1)[ford et al.,](#page-10-1) [2018)](#page-10-1), BERT is designed to pretrain deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be finetuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial taskspecific architecture modifications. + +BERT is conceptually simple and empirically powerful. 
It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE score to 80.5% (7.7% point absolute improvement), MultiNLI accuracy to 86.7% (4.6% absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement). + +### 1 Introduction + +Language model pre-training has been shown to be effective for improving many natural language processing tasks [(Dai and Le,](#page-9-0) [2015;](#page-9-0) [Peters et al.,](#page-10-0) [2018a;](#page-10-0) [Radford et al.,](#page-10-1) [2018;](#page-10-1) [Howard and Ruder,](#page-9-1) [2018)](#page-9-1). These include sentence-level tasks such as natural language inference [(Bowman et al.,](#page-9-2) [2015;](#page-9-2) [Williams et al.,](#page-11-0) [2018)](#page-11-0) and paraphrasing [(Dolan](#page-9-3) [and Brockett,](#page-9-3) [2005)](#page-9-3), which aim to predict the relationships between sentences by analyzing them holistically, as well as token-level tasks such as named entity recognition and question answering, where models are required to produce fine-grained output at the token level [(Tjong Kim Sang and](#page-10-2) [De Meulder,](#page-10-2) [2003;](#page-10-2) [Rajpurkar et al.,](#page-10-3) [2016)](#page-10-3). + +There are two existing strategies for applying pre-trained language representations to downstream tasks: *feature-based* and *fine-tuning*. The feature-based approach, such as ELMo [(Peters](#page-10-0) [et al.,](#page-10-0) [2018a)](#page-10-0), uses task-specific architectures that include the pre-trained representations as additional features. The fine-tuning approach, such as the Generative Pre-trained Transformer (OpenAI GPT) [(Radford et al.,](#page-10-1) [2018)](#page-10-1), introduces minimal task-specific parameters, and is trained on the downstream tasks by simply fine-tuning *all* pretrained parameters. The two approaches share the same objective function during pre-training, where they use unidirectional language models to learn general language representations. + +We argue that current techniques restrict the power of the pre-trained representations, especially for the fine-tuning approaches. The major limitation is that standard language models are unidirectional, and this limits the choice of architectures that can be used during pre-training. For example, in OpenAI GPT, the authors use a left-toright architecture, where every token can only attend to previous tokens in the self-attention layers of the Transformer [(Vaswani et al.,](#page-10-4) [2017)](#page-10-4). Such restrictions are sub-optimal for sentence-level tasks, and could be very harmful when applying finetuning based approaches to token-level tasks such as question answering, where it is crucial to incorporate context from both directions. + +In this paper, we improve the fine-tuning based approaches by proposing BERT: Bidirectional Encoder Representations from Transformers. BERT alleviates the previously mentioned unidirectionality constraint by using a "masked language model" (MLM) pre-training objective, inspired by the Cloze task [(Taylor,](#page-10-5) [1953)](#page-10-5). The masked language model randomly masks some of the tokens from the input, and the objective is to predict the original vocabulary id of the masked word based only on its context. 
Unlike left-toright language model pre-training, the MLM objective enables the representation to fuse the left and the right context, which allows us to pretrain a deep bidirectional Transformer. In addition to the masked language model, we also use a "next sentence prediction" task that jointly pretrains text-pair representations. The contributions of our paper are as follows: + +- We demonstrate the importance of bidirectional pre-training for language representations. Unlike [Radford et al.](#page-10-1) [(2018)](#page-10-1), which uses unidirectional language models for pre-training, BERT uses masked language models to enable pretrained deep bidirectional representations. This is also in contrast to [Peters et al.](#page-10-0) [(2018a)](#page-10-0), which uses a shallow concatenation of independently trained left-to-right and right-to-left LMs. +- We show that pre-trained representations reduce the need for many heavily-engineered taskspecific architectures. BERT is the first finetuning based representation model that achieves state-of-the-art performance on a large suite of sentence-level *and* token-level tasks, outperforming many task-specific architectures. +- BERT advances the state of the art for eleven NLP tasks. The code and pre-trained models are available at [https://github.com/](https://github.com/google-research/bert) [google-research/bert](https://github.com/google-research/bert). + +### 2 Related Work + +There is a long history of pre-training general language representations, and we briefly review the most widely-used approaches in this section. + +#### 2.1 Unsupervised Feature-based Approaches + +Learning widely applicable representations of words has been an active area of research for decades, including non-neural [(Brown et al.,](#page-9-4) [1992;](#page-9-4) [Ando and Zhang,](#page-9-5) [2005;](#page-9-5) [Blitzer et al.,](#page-9-6) [2006)](#page-9-6) and neural [(Mikolov et al.,](#page-10-6) [2013;](#page-10-6) [Pennington et al.,](#page-10-7) [2014)](#page-10-7) methods. Pre-trained word embeddings are an integral part of modern NLP systems, offering significant improvements over embeddings learned from scratch [(Turian et al.,](#page-10-8) [2010)](#page-10-8). To pretrain word embedding vectors, left-to-right language modeling objectives have been used [(Mnih](#page-10-9) [and Hinton,](#page-10-9) [2009)](#page-10-9), as well as objectives to discriminate correct from incorrect words in left and right context [(Mikolov et al.,](#page-10-6) [2013)](#page-10-6). + +These approaches have been generalized to coarser granularities, such as sentence embeddings [(Kiros et al.,](#page-10-10) [2015;](#page-10-10) [Logeswaran and Lee,](#page-10-11) [2018)](#page-10-11) or paragraph embeddings [(Le and Mikolov,](#page-10-12) [2014)](#page-10-12). To train sentence representations, prior work has used objectives to rank candidate next sentences [(Jernite et al.,](#page-9-7) [2017;](#page-9-7) [Logeswaran and](#page-10-11) [Lee,](#page-10-11) [2018)](#page-10-11), left-to-right generation of next sentence words given a representation of the previous sentence [(Kiros et al.,](#page-10-10) [2015)](#page-10-10), or denoising autoencoder derived objectives [(Hill et al.,](#page-9-8) [2016)](#page-9-8). + +ELMo and its predecessor [(Peters et al.,](#page-10-13) [2017,](#page-10-13) [2018a)](#page-10-0) generalize traditional word embedding research along a different dimension. They extract *context-sensitive* features from a left-to-right and a right-to-left language model. 
The contextual representation of each token is the concatenation of the left-to-right and right-to-left representations. When integrating contextual word embeddings with existing task-specific architectures, ELMo advances the state of the art for several major NLP benchmarks [(Peters et al.,](#page-10-0) [2018a)](#page-10-0) including question answering [(Rajpurkar et al.,](#page-10-3) [2016)](#page-10-3), sentiment analysis [(Socher et al.,](#page-10-14) [2013)](#page-10-14), and named entity recognition [(Tjong Kim Sang and De Meulder,](#page-10-2) [2003)](#page-10-2). [Melamud et al.](#page-10-15) [(2016)](#page-10-15) proposed learning contextual representations through a task to predict a single word from both left and right context using LSTMs. Similar to ELMo, their model is feature-based and not deeply bidirectional. [Fedus](#page-9-9) [et al.](#page-9-9) [(2018)](#page-9-9) shows that the cloze task can be used to improve the robustness of text generation models. + +#### 2.2 Unsupervised Fine-tuning Approaches + +As with the feature-based approaches, the first works in this direction only pre-trained word embedding parameters from unlabeled text [(Col](#page-9-10)[lobert and Weston,](#page-9-10) [2008)](#page-9-10). + +More recently, sentence or document encoders which produce contextual token representations have been pre-trained from unlabeled text and fine-tuned for a supervised downstream task [(Dai](#page-9-0) [and Le,](#page-9-0) [2015;](#page-9-0) [Howard and Ruder,](#page-9-1) [2018;](#page-9-1) [Radford](#page-10-1) [et al.,](#page-10-1) [2018)](#page-10-1). The advantage of these approaches is that few parameters need to be learned from scratch. At least partly due to this advantage, OpenAI GPT [(Radford et al.,](#page-10-1) [2018)](#page-10-1) achieved previously state-of-the-art results on many sentencelevel tasks from the GLUE benchmark [(Wang](#page-10-16) [et al.,](#page-10-16) [2018a)](#page-10-16). Left-to-right language model- + +Figure 1: Overall pre-training and fine-tuning procedures for BERT. Apart from output layers, the same architectures are used in both pre-training and fine-tuning. The same pre-trained model parameters are used to initialize models for different down-stream tasks. During fine-tuning, all parameters are fine-tuned. [CLS] is a special symbol added in front of every input example, and [SEP] is a special separator token (e.g. separating questions/answers). + +ing and auto-encoder objectives have been used for pre-training such models [(Howard and Ruder,](#page-9-1) [2018;](#page-9-1) [Radford et al.,](#page-10-1) [2018;](#page-10-1) [Dai and Le,](#page-9-0) [2015)](#page-9-0). + +#### 2.3 Transfer Learning from Supervised Data + +There has also been work showing effective transfer from supervised tasks with large datasets, such as natural language inference [(Conneau et al.,](#page-9-11) [2017)](#page-9-11) and machine translation [(McCann et al.,](#page-10-17) [2017)](#page-10-17). Computer vision research has also demonstrated the importance of transfer learning from large pre-trained models, where an effective recipe is to fine-tune models pre-trained with ImageNet [(Deng et al.,](#page-9-12) [2009;](#page-9-12) [Yosinski et al.,](#page-11-1) [2014)](#page-11-1). + +### 3 BERT + +We introduce BERT and its detailed implementation in this section. There are two steps in our framework: *pre-training* and *fine-tuning*. During pre-training, the model is trained on unlabeled data over different pre-training tasks. 
For finetuning, the BERT model is first initialized with the pre-trained parameters, and all of the parameters are fine-tuned using labeled data from the downstream tasks. Each downstream task has separate fine-tuned models, even though they are initialized with the same pre-trained parameters. The question-answering example in Figure [1](#page-2-0) will serve as a running example for this section. + +A distinctive feature of BERT is its unified architecture across different tasks. There is minimal difference between the pre-trained architecture and the final downstream architecture. + +Model Architecture BERT's model architecture is a multi-layer bidirectional Transformer encoder based on the original implementation described in [Vaswani et al.](#page-10-4) [(2017)](#page-10-4) and released in the tensor2tensor library.[1](#page-2-1) Because the use of Transformers has become common and our implementation is almost identical to the original, we will omit an exhaustive background description of the model architecture and refer readers to [Vaswani et al.](#page-10-4) [(2017)](#page-10-4) as well as excellent guides such as "The Annotated Transformer."[2](#page-2-2) + +In this work, we denote the number of layers (i.e., Transformer blocks) as L, the hidden size as H, and the number of self-attention heads as A. [3](#page-2-3) We primarily report results on two model sizes: BERTBASE (L=12, H=768, A=12, Total Parameters=110M) and BERTLARGE (L=24, H=1024, A=16, Total Parameters=340M). + +BERTBASE was chosen to have the same model size as OpenAI GPT for comparison purposes. Critically, however, the BERT Transformer uses bidirectional self-attention, while the GPT Transformer uses constrained self-attention where every token can only attend to context to its left.[4](#page-2-4) + +1 https://github.com/tensorflow/tensor2tensor 2 http://nlp.seas.harvard.edu/2018/04/03/attention.html 3 In all cases we set the feed-forward/filter size to be 4H, + +i.e., 3072 for the H = 768 and 4096 for the H = 1024. 4We note that in the literature the bidirectional Trans- + +Input/Output Representations To make BERT handle a variety of down-stream tasks, our input representation is able to unambiguously represent both a single sentence and a pair of sentences (e.g., h Question, Answeri) in one token sequence. Throughout this work, a "sentence" can be an arbitrary span of contiguous text, rather than an actual linguistic sentence. A "sequence" refers to the input token sequence to BERT, which may be a single sentence or two sentences packed together. + +We use WordPiece embeddings [(Wu et al.,](#page-11-2) [2016)](#page-11-2) with a 30,000 token vocabulary. The first token of every sequence is always a special classification token ([CLS]). The final hidden state corresponding to this token is used as the aggregate sequence representation for classification tasks. Sentence pairs are packed together into a single sequence. We differentiate the sentences in two ways. First, we separate them with a special token ([SEP]). Second, we add a learned embedding to every token indicating whether it belongs to sentence A or sentence B. As shown in Figure [1,](#page-2-0) we denote input embedding as E, the final hidden vector of the special [CLS] token as C ∈ R H, and the final hidden vector for the i th input token as Ti ∈ R H. + +For a given token, its input representation is constructed by summing the corresponding token, segment, and position embeddings. 
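
To make the construction just described concrete, here is a minimal sketch of assembling such an input representation. It is an illustration only, not the released implementation: the embedding tables are random, and the special-token ids and toy WordPiece ids are assumptions made for the example.

```python
import numpy as np

# Sketch: a BERT-style input is the element-wise sum of token, segment (A/B),
# and position embeddings for the packed sequence [CLS] A [SEP] B [SEP].
VOCAB_SIZE, MAX_LEN, HIDDEN = 30000, 512, 768
rng = np.random.default_rng(0)
token_emb = rng.normal(size=(VOCAB_SIZE, HIDDEN))
segment_emb = rng.normal(size=(2, HIDDEN))        # sentence A vs. sentence B
position_emb = rng.normal(size=(MAX_LEN, HIDDEN))

CLS_ID, SEP_ID = 101, 102  # assumed ids for the special tokens

def build_input(tokens_a, tokens_b):
    """Pack two sentences and sum the three embeddings at every position."""
    ids = [CLS_ID] + tokens_a + [SEP_ID] + tokens_b + [SEP_ID]
    segments = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
    positions = list(range(len(ids)))
    return token_emb[ids] + segment_emb[segments] + position_emb[positions]

x = build_input([2023, 2003, 1037], [2742, 6251])  # arbitrary toy WordPiece ids
print(x.shape)  # (8, 768): one H-dimensional vector per input position
```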
A visualization of this construction can be seen in Figure [2.](#page-4-0) + +### 3.1 Pre-training BERT + +Unlike [Peters et al.](#page-10-0) [(2018a)](#page-10-0) and [Radford et al.](#page-10-1) [(2018)](#page-10-1), we do not use traditional left-to-right or right-to-left language models to pre-train BERT. Instead, we pre-train BERT using two unsupervised tasks, described in this section. This step is presented in the left part of Figure [1.](#page-2-0) + +Task #1: Masked LM Intuitively, it is reasonable to believe that a deep bidirectional model is strictly more powerful than either a left-to-right model or the shallow concatenation of a left-toright and a right-to-left model. Unfortunately, standard conditional language models can only be trained left-to-right *or* right-to-left, since bidirectional conditioning would allow each word to indirectly "see itself", and the model could trivially predict the target word in a multi-layered context. + +In order to train a deep bidirectional representation, we simply mask some percentage of the input tokens at random, and then predict those masked tokens. We refer to this procedure as a "masked LM" (MLM), although it is often referred to as a *Cloze* task in the literature [(Taylor,](#page-10-5) [1953)](#page-10-5). In this case, the final hidden vectors corresponding to the mask tokens are fed into an output softmax over the vocabulary, as in a standard LM. In all of our experiments, we mask 15% of all WordPiece tokens in each sequence at random. In contrast to denoising auto-encoders [(Vincent et al.,](#page-10-18) [2008)](#page-10-18), we only predict the masked words rather than reconstructing the entire input. + +Although this allows us to obtain a bidirectional pre-trained model, a downside is that we are creating a mismatch between pre-training and fine-tuning, since the [MASK] token does not appear during fine-tuning. To mitigate this, we do not always replace "masked" words with the actual [MASK] token. The training data generator chooses 15% of the token positions at random for prediction. If the i-th token is chosen, we replace the i-th token with (1) the [MASK] token 80% of the time (2) a random token 10% of the time (3) the unchanged i-th token 10% of the time. Then, Ti will be used to predict the original token with cross entropy loss. We compare variations of this procedure in Appendix [C.2.](#page-15-0) + +Task #2: Next Sentence Prediction (NSP) Many important downstream tasks such as Question Answering (QA) and Natural Language Inference (NLI) are based on understanding the *relationship* between two sentences, which is not directly captured by language modeling. In order to train a model that understands sentence relationships, we pre-train for a binarized *next sentence prediction* task that can be trivially generated from any monolingual corpus. Specifically, when choosing the sentences A and B for each pretraining example, 50% of the time B is the actual next sentence that follows A (labeled as IsNext), and 50% of the time it is a random sentence from the corpus (labeled as NotNext). As we show in Figure [1,](#page-2-0) C is used for next sentence prediction (NSP).[5](#page-3-0) Despite its simplicity, we demonstrate in Section [5.1](#page-7-0) that pre-training towards this task is very beneficial to both QA and NLI. [6](#page-3-1) + +former is often referred to as a "Transformer encoder" while the left-context-only version is referred to as a "Transformer decoder" since it can be used for text generation. 
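
As a concrete illustration of how the next sentence prediction examples described above can be generated, here is a minimal sampling sketch. It is not the original data pipeline; the tiny in-memory corpus and the function name are assumptions made for the example.

```python
import random

# Sketch: build one NSP example. 50% of the time B is the true next sentence
# (IsNext); 50% of the time B is a random sentence from another document (NotNext).
documents = [  # toy stand-in for a document-level corpus
    ["the man went to the store", "he bought a gallon of milk"],
    ["penguins are flightless birds", "they swim instead of flying"],
]

def make_nsp_example(doc_idx, sent_idx):
    sent_a = documents[doc_idx][sent_idx]
    has_next = sent_idx + 1 < len(documents[doc_idx])
    if has_next and random.random() < 0.5:
        return sent_a, documents[doc_idx][sent_idx + 1], "IsNext"
    other_doc = random.choice([d for i, d in enumerate(documents) if i != doc_idx])
    return sent_a, random.choice(other_doc), "NotNext"

print(make_nsp_example(0, 0))  # (sentence A, sentence B, IsNext/NotNext label)
```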
+ +5The final model achieves 97%-98% accuracy on NSP. + +6The vector C is not a meaningful sentence representation without fine-tuning, since it was trained with NSP. + +Figure 2: BERT input representation. The input embeddings are the sum of the token embeddings, the segmentation embeddings and the position embeddings. + +The NSP task is closely related to representationlearning objectives used in [Jernite et al.](#page-9-7) [(2017)](#page-9-7) and [Logeswaran and Lee](#page-10-11) [(2018)](#page-10-11). However, in prior work, only sentence embeddings are transferred to down-stream tasks, where BERT transfers all parameters to initialize end-task model parameters. + +Pre-training data The pre-training procedure largely follows the existing literature on language model pre-training. For the pre-training corpus we use the BooksCorpus (800M words) [(Zhu et al.,](#page-11-3) [2015)](#page-11-3) and English Wikipedia (2,500M words). For Wikipedia we extract only the text passages and ignore lists, tables, and headers. It is critical to use a document-level corpus rather than a shuffled sentence-level corpus such as the Billion Word Benchmark [(Chelba et al.,](#page-9-13) [2013)](#page-9-13) in order to extract long contiguous sequences. + +#### 3.2 Fine-tuning BERT + +Fine-tuning is straightforward since the selfattention mechanism in the Transformer allows BERT to model many downstream tasks whether they involve single text or text pairs—by swapping out the appropriate inputs and outputs. For applications involving text pairs, a common pattern is to independently encode text pairs before applying bidirectional cross attention, such as [Parikh et al.](#page-10-19) [(2016)](#page-10-19); [Seo et al.](#page-10-20) [(2017)](#page-10-20). BERT instead uses the self-attention mechanism to unify these two stages, as encoding a concatenated text pair with self-attention effectively includes *bidirectional* cross attention between two sentences. + +For each task, we simply plug in the taskspecific inputs and outputs into BERT and finetune all the parameters end-to-end. At the input, sentence A and sentence B from pre-training are analogous to (1) sentence pairs in paraphrasing, (2) hypothesis-premise pairs in entailment, (3) question-passage pairs in question answering, and (4) a degenerate text-∅ pair in text classification or sequence tagging. At the output, the token representations are fed into an output layer for tokenlevel tasks, such as sequence tagging or question answering, and the [CLS] representation is fed into an output layer for classification, such as entailment or sentiment analysis. + +Compared to pre-training, fine-tuning is relatively inexpensive. All of the results in the paper can be replicated in at most 1 hour on a single Cloud TPU, or a few hours on a GPU, starting from the exact same pre-trained model.[7](#page-4-1) We describe the task-specific details in the corresponding subsections of Section [4.](#page-4-2) More details can be found in Appendix [A.5.](#page-13-0) + +### 4 Experiments + +In this section, we present BERT fine-tuning results on 11 NLP tasks. + +#### 4.1 GLUE + +The General Language Understanding Evaluation (GLUE) benchmark [(Wang et al.,](#page-10-16) [2018a)](#page-10-16) is a collection of diverse natural language understanding tasks. 
Detailed descriptions of GLUE datasets are included in Appendix [B.1.](#page-13-1) + +To fine-tune on GLUE, we represent the input sequence (for single sentence or sentence pairs) as described in Section [3,](#page-2-5) and use the final hidden vector C ∈ R H corresponding to the first input token ([CLS]) as the aggregate representation. The only new parameters introduced during fine-tuning are classification layer weights W ∈ R K×H, where K is the number of labels. We compute a standard classification loss with C and W, i.e., log(softmax(CWT )). + +- 8 See (10) in . +7 For example, the BERT SQuAD model can be trained in around 30 minutes on a single Cloud TPU to achieve a Dev F1 score of 91.0%. + + + +| System | MNLI-(m/mm) | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | Average | +|------------------|-------------|------|------|-------|------|-------|------|------|---------| +| | 392k | 363k | 108k | 67k | 8.5k | 5.7k | 3.5k | 2.5k | - | +| Pre-OpenAI SOTA | 80.6/80.1 | 66.1 | 82.3 | 93.2 | 35.0 | 81.0 | 86.0 | 61.7 | 74.0 | +| BiLSTM+ELMo+Attn | 76.4/76.1 | 64.8 | 79.8 | 90.4 | 36.0 | 73.3 | 84.9 | 56.8 | 71.0 | +| OpenAI GPT | 82.1/81.4 | 70.3 | 87.4 | 91.3 | 45.4 | 80.0 | 82.3 | 56.0 | 75.1 | +| BERTBASE | 84.6/83.4 | 71.2 | 90.5 | 93.5 | 52.1 | 85.8 | 88.9 | 66.4 | 79.6 | +| BERTLARGE | 86.7/85.9 | 72.1 | 92.7 | 94.9 | 60.5 | 86.5 | 89.3 | 70.1 | 82.1 | + +Table 1: GLUE Test results, scored by the evaluation server (). The number below each task denotes the number of training examples. The "Average" column is slightly different than the official GLUE score, since we exclude the problematic WNLI set.[8](#page-4-3) BERT and OpenAI GPT are singlemodel, single task. F1 scores are reported for QQP and MRPC, Spearman correlations are reported for STS-B, and accuracy scores are reported for the other tasks. We exclude entries that use BERT as one of their components. + +We use a batch size of 32 and fine-tune for 3 epochs over the data for all GLUE tasks. For each task, we selected the best fine-tuning learning rate (among 5e-5, 4e-5, 3e-5, and 2e-5) on the Dev set. Additionally, for BERTLARGE we found that finetuning was sometimes unstable on small datasets, so we ran several random restarts and selected the best model on the Dev set. With random restarts, we use the same pre-trained checkpoint but perform different fine-tuning data shuffling and classifier layer initialization.[9](#page-5-0) + +Results are presented in Table [1.](#page-5-1) Both BERTBASE and BERTLARGE outperform all systems on all tasks by a substantial margin, obtaining 4.5% and 7.0% respective average accuracy improvement over the prior state of the art. Note that BERTBASE and OpenAI GPT are nearly identical in terms of model architecture apart from the attention masking. For the largest and most widely reported GLUE task, MNLI, BERT obtains a 4.6% absolute accuracy improvement. On the official GLUE leaderboard[10](#page-5-2), BERTLARGE obtains a score of 80.5, compared to OpenAI GPT, which obtains 72.8 as of the date of writing. + +We find that BERTLARGE significantly outperforms BERTBASE across all tasks, especially those with very little training data. The effect of model size is explored more thoroughly in Section [5.2.](#page-7-1) + +#### 4.2 SQuAD v1.1 + +The Stanford Question Answering Dataset (SQuAD v1.1) is a collection of 100k crowdsourced question/answer pairs [(Rajpurkar et al.,](#page-10-3) [2016)](#page-10-3). 
Given a question and a passage from Wikipedia containing the answer, the task is to predict the answer text span in the passage. + +As shown in Figure [1,](#page-2-0) in the question answering task, we represent the input question and passage as a single packed sequence, with the question using the A embedding and the passage using the B embedding. We only introduce a start vector S ∈ R H and an end vector E ∈ R H during fine-tuning. The probability of word i being the start of the answer span is computed as a dot product between Ti and S followed by a softmax over all of the words in the paragraph: Pi = e S·Ti P j e S·Tj . The analogous formula is used for the end of the answer span. The score of a candidate span from position i to position j is defined as S·Ti + E·Tj , and the maximum scoring span where j ≄ i is used as a prediction. The training objective is the sum of the log-likelihoods of the correct start and end positions. We fine-tune for 3 epochs with a learning rate of 5e-5 and a batch size of 32. + +Table [2](#page-6-0) shows top leaderboard entries as well as results from top published systems [(Seo et al.,](#page-10-20) [2017;](#page-10-20) [Clark and Gardner,](#page-9-14) [2018;](#page-9-14) [Peters et al.,](#page-10-0) [2018a;](#page-10-0) [Hu et al.,](#page-9-15) [2018)](#page-9-15). The top results from the SQuAD leaderboard do not have up-to-date public system descriptions available,[11](#page-5-3) and are allowed to use any public data when training their systems. We therefore use modest data augmentation in our system by first fine-tuning on TriviaQA [(Joshi](#page-10-21) [et al.,](#page-10-21) [2017)](#page-10-21) befor fine-tuning on SQuAD. + +Our best performing system outperforms the top leaderboard system by +1.5 F1 in ensembling and +1.3 F1 as a single system. In fact, our single BERT model outperforms the top ensemble system in terms of F1 score. Without TriviaQA fine- + +9The GLUE data set distribution does not include the Test labels, and we only made a single GLUE evaluation server submission for each of BERTBASE and BERTLARGE. + +10https://gluebenchmark.com/leaderboard + +11QANet is described in [Yu et al.](#page-11-4) [(2018)](#page-11-4), but the system has improved substantially after publication. + + + +| System | Dev | | Test | | | | | | +|------------------------------------------|------|------|------|------|--|--|--|--| +| | EM | F1 | EM | F1 | | | | | +| Top Leaderboard Systems (Dec 10th, 2018) | | | | | | | | | +| Human | - | - | 82.3 | 91.2 | | | | | +| #1 Ensemble - nlnet | - | - | 86.0 | 91.7 | | | | | +| #2 Ensemble - QANet | - | - | 84.5 | 90.5 | | | | | +| Published | | | | | | | | | +| BiDAF+ELMo (Single) | - | 85.6 | - | 85.8 | | | | | +| R.M. Reader (Ensemble) | 81.2 | 87.9 | 82.3 | 88.5 | | | | | +| Ours | | | | | | | | | +| BERTBASE (Single) | 80.8 | 88.5 | - | - | | | | | +| BERTLARGE (Single) | 84.1 | 90.9 | - | - | | | | | +| BERTLARGE (Ensemble) | 85.8 | 91.8 | - | - | | | | | +| BERTLARGE (Sgl.+TriviaQA) | 84.2 | 91.1 | 85.1 | 91.8 | | | | | +| BERTLARGE (Ens.+TriviaQA) | 86.2 | 92.2 | 87.4 | 93.2 | | | | | + +Table 2: SQuAD 1.1 results. The BERT ensemble is 7x systems which use different pre-training checkpoints and fine-tuning seeds. 
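
The span selection described in this subsection can be sketched compactly: the start probability of token i is Pi = e^(S·Ti) / ÎŁj e^(S·Tj), and the prediction is the span (i, j), j ≄ i, that maximizes S·Ti + E·Tj. The snippet below is an illustrative sketch with random placeholder vectors, not the fine-tuned model.

```python
import numpy as np

# Sketch of SQuAD v1.1 span scoring: start/end scores are dot products with
# vectors S and E; the best span (i, j) with j >= i maximizes S·Ti + E·Tj.
H, num_tokens = 768, 20
rng = np.random.default_rng(0)
T = rng.normal(size=(num_tokens, H))  # placeholder for final hidden vectors Ti
S = rng.normal(size=H)                # placeholder for the learned start vector
E = rng.normal(size=H)                # placeholder for the learned end vector

start_scores, end_scores = T @ S, T @ E
start_probs = np.exp(start_scores - start_scores.max())  # numerically stable softmax
start_probs /= start_probs.sum()                          # Pi over the passage

best_span = max(
    ((i, j) for i in range(num_tokens) for j in range(i, num_tokens)),
    key=lambda ij: start_scores[ij[0]] + end_scores[ij[1]],
)
print(best_span, float(start_probs.sum()))  # e.g. (i, j) indices and 1.0
```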
+ + + +| System | Dev | | Test | | | | | | +|------------------------------------------|------|------|------|------|--|--|--|--| +| | EM | F1 | EM | F1 | | | | | +| Top Leaderboard Systems (Dec 10th, 2018) | | | | | | | | | +| Human | 86.3 | 89.0 | 86.9 | 89.5 | | | | | +| #1 Single - MIR-MRC (F-Net) | - | - | 74.8 | 78.0 | | | | | +| #2 Single - nlnet | - | - | 74.2 | 77.1 | | | | | +| Published | | | | | | | | | +| unet (Ensemble) | - | - | 71.4 | 74.9 | | | | | +| SLQA+ (Single) | - | | 71.4 | 74.4 | | | | | +| Ours | | | | | | | | | +| BERTLARGE (Single) | 78.7 | 81.9 | 80.0 | 83.1 | | | | | + +Table 3: SQuAD 2.0 results. We exclude entries that use BERT as one of their components. + +tuning data, we only lose 0.1-0.4 F1, still outperforming all existing systems by a wide margin.[12](#page-6-1) + +#### 4.3 SQuAD v2.0 + +The SQuAD 2.0 task extends the SQuAD 1.1 problem definition by allowing for the possibility that no short answer exists in the provided paragraph, making the problem more realistic. + +We use a simple approach to extend the SQuAD v1.1 BERT model for this task. We treat questions that do not have an answer as having an answer span with start and end at the [CLS] token. The probability space for the start and end answer span positions is extended to include the position of the [CLS] token. For prediction, we compare the score of the no-answer span: snull = S·C + E·C to the score of the best non-null span + + + +| System | Dev | Test | +|------------------------|------|------| +| ESIM+GloVe | 51.9 | 52.7 | +| ESIM+ELMo | 59.1 | 59.2 | +| OpenAI GPT | - | 78.0 | +| BERTBASE | 81.6 | - | +| BERTLARGE | 86.6 | 86.3 | +| Human (expert)† | - | 85.0 | +| Human (5 annotations)† | - | 88.0 | + +Table 4: SWAG Dev and Test accuracies. †Human performance is measured with 100 samples, as reported in the SWAG paper. + +sˆi,j = maxj≄iS·Ti + E·Tj . We predict a non-null answer when sˆi,j > snull + τ , where the threshold τ is selected on the dev set to maximize F1. We did not use TriviaQA data for this model. We fine-tuned for 2 epochs with a learning rate of 5e-5 and a batch size of 48. + +The results compared to prior leaderboard entries and top published work [(Sun et al.,](#page-10-22) [2018;](#page-10-22) [Wang et al.,](#page-11-5) [2018b)](#page-11-5) are shown in Table [3,](#page-6-2) excluding systems that use BERT as one of their components. We observe a +5.1 F1 improvement over the previous best system. + +#### 4.4 SWAG + +The Situations With Adversarial Generations (SWAG) dataset contains 113k sentence-pair completion examples that evaluate grounded commonsense inference [(Zellers et al.,](#page-11-6) [2018)](#page-11-6). Given a sentence, the task is to choose the most plausible continuation among four choices. + +When fine-tuning on the SWAG dataset, we construct four input sequences, each containing the concatenation of the given sentence (sentence A) and a possible continuation (sentence B). The only task-specific parameters introduced is a vector whose dot product with the [CLS] token representation C denotes a score for each choice which is normalized with a softmax layer. + +We fine-tune the model for 3 epochs with a learning rate of 2e-5 and a batch size of 16. Results are presented in Table [4.](#page-6-3) BERTLARGE outperforms the authors' baseline ESIM+ELMo system by +27.1% and OpenAI GPT by 8.3%. + +### 5 Ablation Studies + +In this section, we perform ablation experiments over a number of facets of BERT in order to better understand their relative importance. 
Additional + +12The TriviaQA data we used consists of paragraphs from TriviaQA-Wiki formed of the first 400 tokens in documents, that contain at least one of the provided possible answers. + + + +| | Dev Set | | | | | | | +|--------------|---------|-------|-------|-------|-------|--|--| +| Tasks | MNLI-m | QNLI | MRPC | SST-2 | SQuAD | | | +| | (Acc) | (Acc) | (Acc) | (Acc) | (F1) | | | +| BERTBASE | 84.4 | 88.4 | 86.7 | 92.7 | 88.5 | | | +| No NSP | 83.9 | 84.9 | 86.5 | 92.6 | 87.9 | | | +| LTR & No NSP | 82.1 | 84.3 | 77.5 | 92.1 | 77.8 | | | +| + BiLSTM | 82.1 | 84.1 | 75.7 | 91.6 | 84.9 | | | + +Table 5: Ablation over the pre-training tasks using the BERTBASE architecture. "No NSP" is trained without the next sentence prediction task. "LTR & No NSP" is trained as a left-to-right LM without the next sentence prediction, like OpenAI GPT. "+ BiLSTM" adds a randomly initialized BiLSTM on top of the "LTR + No NSP" model during fine-tuning. + +ablation studies can be found in Appendix [C.](#page-15-1) + +### 5.1 Effect of Pre-training Tasks + +We demonstrate the importance of the deep bidirectionality of BERT by evaluating two pretraining objectives using exactly the same pretraining data, fine-tuning scheme, and hyperparameters as BERTBASE: + +No NSP: A bidirectional model which is trained using the "masked LM" (MLM) but without the "next sentence prediction" (NSP) task. + +LTR & No NSP: A left-context-only model which is trained using a standard Left-to-Right (LTR) LM, rather than an MLM. The left-only constraint was also applied at fine-tuning, because removing it introduced a pre-train/fine-tune mismatch that degraded downstream performance. Additionally, this model was pre-trained without the NSP task. This is directly comparable to OpenAI GPT, but using our larger training dataset, our input representation, and our fine-tuning scheme. + +We first examine the impact brought by the NSP task. In Table [5,](#page-7-2) we show that removing NSP hurts performance significantly on QNLI, MNLI, and SQuAD 1.1. Next, we evaluate the impact of training bidirectional representations by comparing "No NSP" to "LTR & No NSP". The LTR model performs worse than the MLM model on all tasks, with large drops on MRPC and SQuAD. + +For SQuAD it is intuitively clear that a LTR model will perform poorly at token predictions, since the token-level hidden states have no rightside context. In order to make a good faith attempt at strengthening the LTR system, we added a randomly initialized BiLSTM on top. This does significantly improve results on SQuAD, but the results are still far worse than those of the pretrained bidirectional models. The BiLSTM hurts performance on the GLUE tasks. + +We recognize that it would also be possible to train separate LTR and RTL models and represent each token as the concatenation of the two models, as ELMo does. However: (a) this is twice as expensive as a single bidirectional model; (b) this is non-intuitive for tasks like QA, since the RTL model would not be able to condition the answer on the question; (c) this it is strictly less powerful than a deep bidirectional model, since it can use both left and right context at every layer. + +### 5.2 Effect of Model Size + +In this section, we explore the effect of model size on fine-tuning task accuracy. We trained a number of BERT models with a differing number of layers, hidden units, and attention heads, while otherwise using the same hyperparameters and training procedure as described previously. 
+ +Results on selected GLUE tasks are shown in Table [6.](#page-8-0) In this table, we report the average Dev Set accuracy from 5 random restarts of fine-tuning. We can see that larger models lead to a strict accuracy improvement across all four datasets, even for MRPC which only has 3,600 labeled training examples, and is substantially different from the pre-training tasks. It is also perhaps surprising that we are able to achieve such significant improvements on top of models which are already quite large relative to the existing literature. For example, the largest Transformer explored in [Vaswani et al.](#page-10-4) [(2017)](#page-10-4) is (L=6, H=1024, A=16) with 100M parameters for the encoder, and the largest Transformer we have found in the literature is (L=64, H=512, A=2) with 235M parameters [(Al-Rfou et al.,](#page-9-16) [2018)](#page-9-16). By contrast, BERTBASE contains 110M parameters and BERTLARGE contains 340M parameters. + +It has long been known that increasing the model size will lead to continual improvements on large-scale tasks such as machine translation and language modeling, which is demonstrated by the LM perplexity of held-out training data shown in Table [6.](#page-8-0) However, we believe that this is the first work to demonstrate convincingly that scaling to extreme model sizes also leads to large improvements on very small scale tasks, provided that the model has been sufficiently pre-trained. [Peters et al.](#page-10-23) [(2018b)](#page-10-23) presented mixed results on the downstream task impact of increasing the pre-trained bi-LM size from two to four layers and [Melamud et al.](#page-10-15) [(2016)](#page-10-15) mentioned in passing that increasing hidden dimension size from 200 to 600 helped, but increasing further to 1,000 did not bring further improvements. Both of these prior works used a featurebased approach — we hypothesize that when the model is fine-tuned directly on the downstream tasks and uses only a very small number of randomly initialized additional parameters, the taskspecific models can benefit from the larger, more expressive pre-trained representations even when downstream task data is very small. + +#### 5.3 Feature-based Approach with BERT + +All of the BERT results presented so far have used the fine-tuning approach, where a simple classification layer is added to the pre-trained model, and all parameters are jointly fine-tuned on a downstream task. However, the feature-based approach, where fixed features are extracted from the pretrained model, has certain advantages. First, not all tasks can be easily represented by a Transformer encoder architecture, and therefore require a task-specific model architecture to be added. Second, there are major computational benefits to pre-compute an expensive representation of the training data once and then run many experiments with cheaper models on top of this representation. + +In this section, we compare the two approaches by applying BERT to the CoNLL-2003 Named Entity Recognition (NER) task [(Tjong Kim Sang](#page-10-2) [and De Meulder,](#page-10-2) [2003)](#page-10-2). In the input to BERT, we use a case-preserving WordPiece model, and we include the maximal document context provided by the data. 
Following standard practice, we formulate this as a tagging task but do not use a CRF

| | Hyperparams | | | Dev Set Accuracy | | | | |
|----|------|----|----------|--------|------|-------|--|--|
| #L | #H | #A | LM (ppl) | MNLI-m | MRPC | SST-2 | | |
| 3 | 768 | 12 | 5.84 | 77.9 | 79.8 | 88.4 | | |
| 6 | 768 | 3 | 5.24 | 80.6 | 82.2 | 90.7 | | |
| 6 | 768 | 12 | 4.68 | 81.9 | 84.8 | 91.3 | | |
| 12 | 768 | 12 | 3.99 | 84.4 | 86.7 | 92.9 | | |
| 12 | 1024 | 16 | 3.54 | 85.7 | 86.9 | 93.3 | | |
93.3 | | | +| 24 | 1024 | 16 | 3.23 | 86.6 | 87.8 | 93.7 | | | + +Table 6: Ablation over BERT model size. #L = the number of layers; #H = hidden size; #A = number of attention heads. "LM (ppl)" is the masked LM perplexity of held-out training data. + + + +| System | Dev F1 | Test F1 | +|-----------------------------------|--------|---------| +| ELMo (Peters et al., 2018a) | 95.7 | 92.2 | +| CVT (Clark et al., 2018) | - | 92.6 | +| CSE (Akbik et al., 2018) | - | 93.1 | +| Fine-tuning approach | | | +| BERTLARGE | 96.6 | 92.8 | +| BERTBASE | 96.4 | 92.4 | +| Feature-based approach (BERTBASE) | | | +| Embeddings | 91.0 | - | +| Second-to-Last Hidden | 95.6 | - | +| Last Hidden | 94.9 | - | +| Weighted Sum Last Four Hidden | 95.9 | - | +| Concat Last Four Hidden | 96.1 | - | +| Weighted Sum All 12 Layers | 95.5 | - | + +Table 7: CoNLL-2003 Named Entity Recognition results. Hyperparameters were selected using the Dev set. The reported Dev and Test scores are averaged over 5 random restarts using those hyperparameters. + +layer in the output. We use the representation of the first sub-token as the input to the token-level classifier over the NER label set. + +To ablate the fine-tuning approach, we apply the feature-based approach by extracting the activations from one or more layers *without* fine-tuning any parameters of BERT. These contextual embeddings are used as input to a randomly initialized two-layer 768-dimensional BiLSTM before the classification layer. + +Results are presented in Table [7.](#page-8-1) BERTLARGE performs competitively with state-of-the-art methods. The best performing method concatenates the token representations from the top four hidden layers of the pre-trained Transformer, which is only 0.3 F1 behind fine-tuning the entire model. This demonstrates that BERT is effective for both finetuning and feature-based approaches. + +### 6 Conclusion + +Recent empirical improvements due to transfer learning with language models have demonstrated that rich, unsupervised pre-training is an integral part of many language understanding systems. In particular, these results enable even low-resource tasks to benefit from deep unidirectional architectures. Our major contribution is further generalizing these findings to deep *bidirectional* architectures, allowing the same pre-trained model to successfully tackle a broad set of NLP tasks. + +### References + +- Alan Akbik, Duncan Blythe, and Roland Vollgraf. 2018. Contextual string embeddings for sequence labeling. In *Proceedings of the 27th International Conference on Computational Linguistics*, pages 1638–1649. +- Rami Al-Rfou, Dokook Choe, Noah Constant, Mandy Guo, and Llion Jones. 2018. Character-level language modeling with deeper self-attention. *arXiv preprint arXiv:1808.04444*. +- Rie Kubota Ando and Tong Zhang. 2005. A framework for learning predictive structures from multiple tasks and unlabeled data. *Journal of Machine Learning Research*, 6(Nov):1817–1853. +- Luisa Bentivogli, Bernardo Magnini, Ido Dagan, Hoa Trang Dang, and Danilo Giampiccolo. 2009. The fifth PASCAL recognizing textual entailment challenge. In *TAC*. NIST. +- John Blitzer, Ryan McDonald, and Fernando Pereira. 2006. Domain adaptation with structural correspondence learning. In *Proceedings of the 2006 conference on empirical methods in natural language processing*, pages 120–128. Association for Computational Linguistics. +- Samuel R. Bowman, Gabor Angeli, Christopher Potts, and Christopher D. Manning. 2015. 
A large annotated corpus for learning natural language inference. In *EMNLP*. Association for Computational Linguistics. +- Peter F Brown, Peter V Desouza, Robert L Mercer, Vincent J Della Pietra, and Jenifer C Lai. 1992. Class-based n-gram models of natural language. *Computational linguistics*, 18(4):467–479. +- Daniel Cer, Mona Diab, Eneko Agirre, Inigo Lopez-Gazpio, and Lucia Specia. 2017. [Semeval-2017](https://doi.org/10.18653/v1/S17-2001) [task 1: Semantic textual similarity multilingual and](https://doi.org/10.18653/v1/S17-2001) [crosslingual focused evaluation.](https://doi.org/10.18653/v1/S17-2001) In *Proceedings of the 11th International Workshop on Semantic Evaluation (SemEval-2017)*, pages 1–14, Vancouver, Canada. Association for Computational Linguistics. +- Ciprian Chelba, Tomas Mikolov, Mike Schuster, Qi Ge, Thorsten Brants, Phillipp Koehn, and Tony Robinson. 2013. One billion word benchmark for measuring progress in statistical language modeling. *arXiv preprint arXiv:1312.3005*. +- Z. Chen, H. Zhang, X. Zhang, and L. Zhao. 2018. [Quora question pairs.](https://data.quora.com/First-Quora-Dataset-Release-Question-Pairs) +- Christopher Clark and Matt Gardner. 2018. Simple and effective multi-paragraph reading comprehension. In *ACL*. +- Kevin Clark, Minh-Thang Luong, Christopher D Manning, and Quoc Le. 2018. Semi-supervised sequence modeling with cross-view training. In *Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing*, pages 1914– 1925. +- Ronan Collobert and Jason Weston. 2008. A unified architecture for natural language processing: Deep neural networks with multitask learning. In *Proceedings of the 25th international conference on Machine learning*, pages 160–167. ACM. +- Alexis Conneau, Douwe Kiela, Holger Schwenk, Lošıc Barrault, and Antoine Bordes. 2017. [Supervised](https://www.aclweb.org/anthology/D17-1070) [learning of universal sentence representations from](https://www.aclweb.org/anthology/D17-1070) [natural language inference data.](https://www.aclweb.org/anthology/D17-1070) In *Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing*, pages 670–680, Copenhagen, Denmark. Association for Computational Linguistics. +- Andrew M Dai and Quoc V Le. 2015. Semi-supervised sequence learning. In *Advances in neural information processing systems*, pages 3079–3087. +- J. Deng, W. Dong, R. Socher, L.-J. Li, K. Li, and L. Fei-Fei. 2009. ImageNet: A Large-Scale Hierarchical Image Database. In *CVPR09*. +- William B Dolan and Chris Brockett. 2005. Automatically constructing a corpus of sentential paraphrases. In *Proceedings of the Third International Workshop on Paraphrasing (IWP2005)*. +- William Fedus, Ian Goodfellow, and Andrew M Dai. 2018. Maskgan: Better text generation via filling in the . *arXiv preprint arXiv:1801.07736*. +- Dan Hendrycks and Kevin Gimpel. 2016. [Bridging](http://arxiv.org/abs/1606.08415) [nonlinearities and stochastic regularizers with gaus](http://arxiv.org/abs/1606.08415)[sian error linear units.](http://arxiv.org/abs/1606.08415) *CoRR*, abs/1606.08415. +- Felix Hill, Kyunghyun Cho, and Anna Korhonen. 2016. Learning distributed representations of sentences from unlabelled data. In *Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies*. Association for Computational Linguistics. +- Jeremy Howard and Sebastian Ruder. 2018. 
[Universal](http://arxiv.org/abs/1801.06146) [language model fine-tuning for text classification.](http://arxiv.org/abs/1801.06146) In *ACL*. Association for Computational Linguistics. +- Minghao Hu, Yuxing Peng, Zhen Huang, Xipeng Qiu, Furu Wei, and Ming Zhou. 2018. Reinforced mnemonic reader for machine reading comprehension. In *IJCAI*. +- Yacine Jernite, Samuel R. Bowman, and David Sontag. 2017. [Discourse-based objectives for fast un](http://arxiv.org/abs/1705.00557)[supervised sentence representation learning.](http://arxiv.org/abs/1705.00557) *CoRR*, abs/1705.00557. +- Mandar Joshi, Eunsol Choi, Daniel S Weld, and Luke Zettlemoyer. 2017. Triviaqa: A large scale distantly supervised challenge dataset for reading comprehension. In *ACL*. +- Ryan Kiros, Yukun Zhu, Ruslan R Salakhutdinov, Richard Zemel, Raquel Urtasun, Antonio Torralba, and Sanja Fidler. 2015. Skip-thought vectors. In *Advances in neural information processing systems*, pages 3294–3302. +- Quoc Le and Tomas Mikolov. 2014. Distributed representations of sentences and documents. In *International Conference on Machine Learning*, pages 1188–1196. +- Hector J Levesque, Ernest Davis, and Leora Morgenstern. 2011. The winograd schema challenge. In *Aaai spring symposium: Logical formalizations of commonsense reasoning*, volume 46, page 47. +- Lajanugen Logeswaran and Honglak Lee. 2018. [An](https://openreview.net/forum?id=rJvJXZb0W) [efficient framework for learning sentence represen](https://openreview.net/forum?id=rJvJXZb0W)[tations.](https://openreview.net/forum?id=rJvJXZb0W) In *International Conference on Learning Representations*. +- Bryan McCann, James Bradbury, Caiming Xiong, and Richard Socher. 2017. Learned in translation: Contextualized word vectors. In *NIPS*. +- Oren Melamud, Jacob Goldberger, and Ido Dagan. 2016. context2vec: Learning generic context embedding with bidirectional LSTM. In *CoNLL*. +- Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg S Corrado, and Jeff Dean. 2013. Distributed representations of words and phrases and their compositionality. In *Advances in Neural Information Processing Systems 26*, pages 3111–3119. Curran Associates, Inc. +- Andriy Mnih and Geoffrey E Hinton. 2009. [A scal](http://papers.nips.cc/paper/3583-a-scalable-hierarchical-distributed-language-model.pdf)[able hierarchical distributed language model.](http://papers.nips.cc/paper/3583-a-scalable-hierarchical-distributed-language-model.pdf) In D. Koller, D. Schuurmans, Y. Bengio, and L. Bottou, editors, *Advances in Neural Information Processing Systems 21*, pages 1081–1088. Curran Associates, Inc. +- Ankur P Parikh, Oscar Tackstr š om, Dipanjan Das, and š Jakob Uszkoreit. 2016. A decomposable attention model for natural language inference. In *EMNLP*. +- Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. [Glove: Global vectors for](http://www.aclweb.org/anthology/D14-1162) [word representation.](http://www.aclweb.org/anthology/D14-1162) In *Empirical Methods in Natural Language Processing (EMNLP)*, pages 1532– 1543. +- Matthew Peters, Waleed Ammar, Chandra Bhagavatula, and Russell Power. 2017. Semi-supervised sequence tagging with bidirectional language models. In *ACL*. +- Matthew Peters, Mark Neumann, Mohit Iyyer, Matt Gardner, Christopher Clark, Kenton Lee, and Luke Zettlemoyer. 2018a. Deep contextualized word representations. In *NAACL*. +- Matthew Peters, Mark Neumann, Luke Zettlemoyer, and Wen-tau Yih. 2018b. Dissecting contextual word embeddings: Architecture and representation. 
In *Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing*, pages 1499–1509. +- Alec Radford, Karthik Narasimhan, Tim Salimans, and Ilya Sutskever. 2018. Improving language understanding with unsupervised learning. Technical report, OpenAI. +- Pranav Rajpurkar, Jian Zhang, Konstantin Lopyrev, and Percy Liang. 2016. Squad: 100,000+ questions for machine comprehension of text. In *Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing*, pages 2383–2392. +- Minjoon Seo, Aniruddha Kembhavi, Ali Farhadi, and Hannaneh Hajishirzi. 2017. Bidirectional attention flow for machine comprehension. In *ICLR*. +- Richard Socher, Alex Perelygin, Jean Wu, Jason Chuang, Christopher D Manning, Andrew Ng, and Christopher Potts. 2013. Recursive deep models for semantic compositionality over a sentiment treebank. In *Proceedings of the 2013 conference on empirical methods in natural language processing*, pages 1631–1642. +- Fu Sun, Linyang Li, Xipeng Qiu, and Yang Liu. 2018. U-net: Machine reading comprehension with unanswerable questions. *arXiv preprint arXiv:1810.06638*. +- Wilson L Taylor. 1953. Cloze procedure: A new tool for measuring readability. *Journalism Bulletin*, 30(4):415–433. +- Erik F Tjong Kim Sang and Fien De Meulder. 2003. Introduction to the conll-2003 shared task: Language-independent named entity recognition. In *CoNLL*. +- Joseph Turian, Lev Ratinov, and Yoshua Bengio. 2010. Word representations: A simple and general method for semi-supervised learning. In *Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics*, ACL '10, pages 384–394. +- Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In *Advances in Neural Information Processing Systems*, pages 6000–6010. +- Pascal Vincent, Hugo Larochelle, Yoshua Bengio, and Pierre-Antoine Manzagol. 2008. Extracting and composing robust features with denoising autoencoders. In *Proceedings of the 25th international conference on Machine learning*, pages 1096–1103. ACM. +- Alex Wang, Amanpreet Singh, Julian Michael, Felix Hill, Omer Levy, and Samuel Bowman. 2018a. Glue: A multi-task benchmark and analysis platform + +for natural language understanding. In *Proceedings of the 2018 EMNLP Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP*, pages 353–355. + +- Wei Wang, Ming Yan, and Chen Wu. 2018b. Multigranularity hierarchical attention fusion networks for reading comprehension and question answering. In *Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)*. Association for Computational Linguistics. +- Alex Warstadt, Amanpreet Singh, and Samuel R Bowman. 2018. Neural network acceptability judgments. *arXiv preprint arXiv:1805.12471*. +- Adina Williams, Nikita Nangia, and Samuel R Bowman. 2018. A broad-coverage challenge corpus for sentence understanding through inference. In *NAACL*. +- Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. 2016. Google's neural machine translation system: Bridging the gap between human and machine translation. *arXiv preprint arXiv:1609.08144*. +- Jason Yosinski, Jeff Clune, Yoshua Bengio, and Hod Lipson. 2014. How transferable are features in deep neural networks? In *Advances in neural information processing systems*, pages 3320–3328. 
+- Adams Wei Yu, David Dohan, Minh-Thang Luong, Rui Zhao, Kai Chen, Mohammad Norouzi, and Quoc V Le. 2018. QANet: Combining local convolution with global self-attention for reading comprehension. In *ICLR*. +- Rowan Zellers, Yonatan Bisk, Roy Schwartz, and Yejin Choi. 2018. Swag: A large-scale adversarial dataset for grounded commonsense inference. In *Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing (EMNLP)*. +- Yukun Zhu, Ryan Kiros, Rich Zemel, Ruslan Salakhutdinov, Raquel Urtasun, Antonio Torralba, and Sanja Fidler. 2015. Aligning books and movies: Towards story-like visual explanations by watching movies and reading books. In *Proceedings of the IEEE international conference on computer vision*, pages 19–27. + +# Appendix for "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding" + +We organize the appendix into three sections: + +- Additional implementation details for BERT are presented in Appendix [A;](#page-11-7) +- Additional details for our experiments are presented in Appendix [B;](#page-13-2) and +- Additional ablation studies are presented in Appendix [C.](#page-15-1) + +We present additional ablation studies for BERT including: + +- Effect of Number of Training Steps; and +- Ablation for Different Masking Procedures. + +### A Additional Details for BERT + +### A.1 Illustration of the Pre-training Tasks + +We provide examples of the pre-training tasks in the following. + +Masked LM and the Masking Procedure Assuming the unlabeled sentence is my dog is hairy, and during the random masking procedure we chose the 4-th token (which corresponding to hairy), our masking procedure can be further illustrated by + +- 80% of the time: Replace the word with the [MASK] token, e.g., my dog is hairy → my dog is [MASK] +- 10% of the time: Replace the word with a random word, e.g., my dog is hairy → my dog is apple +- 10% of the time: Keep the word unchanged, e.g., my dog is hairy → my dog is hairy. The purpose of this is to bias the representation towards the actual observed word. + +The advantage of this procedure is that the Transformer encoder does not know which words it will be asked to predict or which have been replaced by random words, so it is forced to keep a distributional contextual representation of *every* input token. Additionally, because random replacement only occurs for 1.5% of all tokens (i.e., 10% of 15%), this does not seem to harm the model's language understanding capability. In Section [C.2,](#page-15-0) we evaluate the impact this procedure. + +Compared to standard langauge model training, the masked LM only make predictions on 15% of tokens in each batch, which suggests that more pre-training steps may be required for the model + +Figure 3: Differences in pre-training model architectures. BERT uses a bidirectional Transformer. OpenAI GPT uses a left-to-right Transformer. ELMo uses the concatenation of independently trained left-to-right and right-toleft LSTMs to generate features for downstream tasks. Among the three, only BERT representations are jointly conditioned on both left and right context in all layers. In addition to the architecture differences, BERT and OpenAI GPT are fine-tuning approaches, while ELMo is a feature-based approach. + +to converge. In Section [C.1](#page-15-2) we demonstrate that MLM does converge marginally slower than a leftto-right model (which predicts every token), but the empirical improvements of the MLM model far outweigh the increased training cost. 
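
A minimal sketch of the 80%/10%/10% masking procedure illustrated above follows. It is a simplification for illustration, not the original data pipeline; the toy vocabulary and function name are assumptions.

```python
import random

MASK = "[MASK]"
VOCAB = ["apple", "store", "dog", "ran", "is", "hairy"]  # toy vocabulary

def mask_tokens(tokens, select_prob=0.15):
    """Pick ~15% of positions; replace with [MASK] 80% of the time,
    a random token 10% of the time, and keep the original 10% of the time."""
    output, targets = list(tokens), {}
    for i, tok in enumerate(tokens):
        if random.random() >= select_prob:
            continue
        targets[i] = tok                      # this position must be predicted
        r = random.random()
        if r < 0.8:
            output[i] = MASK                  # 80%: [MASK]
        elif r < 0.9:
            output[i] = random.choice(VOCAB)  # 10%: random replacement
        # else: 10%: keep the observed token unchanged
    return output, targets

print(mask_tokens(["my", "dog", "is", "hairy"]))
```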
+ +Next Sentence Prediction The next sentence prediction task can be illustrated in the following examples. + +Input = [CLS] the man went to [MASK] store [SEP] he bought a gallon [MASK] milk [SEP] Label = IsNext + +Input = [CLS] the man [MASK] to the store [SEP] penguin [MASK] are flight ##less birds [SEP] Label = NotNext + +#### A.2 Pre-training Procedure + +To generate each training input sequence, we sample two spans of text from the corpus, which we refer to as "sentences" even though they are typically much longer than single sentences (but can be shorter also). The first sentence receives the A embedding and the second receives the B embedding. 50% of the time B is the actual next sentence that follows A and 50% of the time it is a random sentence, which is done for the "next sentence prediction" task. They are sampled such that the combined length is ≀ 512 tokens. The LM masking is applied after WordPiece tokenization with a uniform masking rate of 15%, and no special consideration given to partial word pieces. + +We train with batch size of 256 sequences (256 sequences * 512 tokens = 128,000 tokens/batch) for 1,000,000 steps, which is approximately 40 epochs over the 3.3 billion word corpus. We use Adam with learning rate of 1e-4, ÎČ1 = 0.9, ÎČ2 = 0.999, L2 weight decay of 0.01, learning rate warmup over the first 10,000 steps, and linear decay of the learning rate. We use a dropout probability of 0.1 on all layers. We use a gelu activation [(Hendrycks and Gimpel,](#page-9-19) [2016)](#page-9-19) rather than the standard relu, following OpenAI GPT. The training loss is the sum of the mean masked LM likelihood and the mean next sentence prediction likelihood. + +Training of BERTBASE was performed on 4 Cloud TPUs in Pod configuration (16 TPU chips total).[13](#page-12-0) Training of BERTLARGE was performed on 16 Cloud TPUs (64 TPU chips total). Each pretraining took 4 days to complete. + +Longer sequences are disproportionately expensive because attention is quadratic to the sequence length. To speed up pretraing in our experiments, we pre-train the model with sequence length of 128 for 90% of the steps. Then, we train the rest 10% of the steps of sequence of 512 to learn the positional embeddings. + +#### A.3 Fine-tuning Procedure + +For fine-tuning, most model hyperparameters are the same as in pre-training, with the exception of the batch size, learning rate, and number of training epochs. The dropout probability was always kept at 0.1. The optimal hyperparameter values are task-specific, but we found the following range of possible values to work well across all tasks: + +‱ Batch size: 16, 32 + +13https://cloudplatform.googleblog.com/2018/06/Cloud-TPU-now-offers-preemptible-pricing-and-globalavailability.html + +‱ Learning rate (Adam): 5e-5, 3e-5, 2e-5 + +‱ Number of epochs: 2, 3, 4 + +We also observed that large data sets (e.g., 100k+ labeled training examples) were far less sensitive to hyperparameter choice than small data sets. Fine-tuning is typically very fast, so it is reasonable to simply run an exhaustive search over the above parameters and choose the model that performs best on the development set. + +# A.4 Comparison of BERT, ELMo ,and OpenAI GPT + +Here we studies the differences in recent popular representation learning models including ELMo, OpenAI GPT and BERT. 
The comparisons between the model architectures are shown visually in Figure [3.](#page-12-1) Note that in addition to the architecture differences, BERT and OpenAI GPT are finetuning approaches, while ELMo is a feature-based approach. + +The most comparable existing pre-training method to BERT is OpenAI GPT, which trains a left-to-right Transformer LM on a large text corpus. In fact, many of the design decisions in BERT were intentionally made to make it as close to GPT as possible so that the two methods could be minimally compared. The core argument of this work is that the bi-directionality and the two pretraining tasks presented in Section [3.1](#page-3-2) account for the majority of the empirical improvements, but we do note that there are several other differences between how BERT and GPT were trained: + +- GPT is trained on the BooksCorpus (800M words); BERT is trained on the BooksCorpus (800M words) and Wikipedia (2,500M words). +- GPT uses a sentence separator ([SEP]) and classifier token ([CLS]) which are only introduced at fine-tuning time; BERT learns [SEP], [CLS] and sentence A/B embeddings during pre-training. +- GPT was trained for 1M steps with a batch size of 32,000 words; BERT was trained for 1M steps with a batch size of 128,000 words. +- GPT used the same learning rate of 5e-5 for all fine-tuning experiments; BERT chooses a task-specific fine-tuning learning rate which performs the best on the development set. + +To isolate the effect of these differences, we perform ablation experiments in Section [5.1](#page-7-0) which demonstrate that the majority of the improvements are in fact coming from the two pre-training tasks and the bidirectionality they enable. + +# A.5 Illustrations of Fine-tuning on Different Tasks + +The illustration of fine-tuning BERT on different tasks can be seen in Figure [4.](#page-14-0) Our task-specific models are formed by incorporating BERT with one additional output layer, so a minimal number of parameters need to be learned from scratch. Among the tasks, (a) and (b) are sequence-level tasks while (c) and (d) are token-level tasks. In the figure, E represents the input embedding, Ti represents the contextual representation of token i, [CLS] is the special symbol for classification output, and [SEP] is the special symbol to separate non-consecutive token sequences. + +# B Detailed Experimental Setup + +# B.1 Detailed Descriptions for the GLUE Benchmark Experiments. + +Our GLUE results in Tabl[e1](#page-5-1) are obtained from [https://gluebenchmark.com/](https://gluebenchmark.com/leaderboard) [leaderboard](https://gluebenchmark.com/leaderboard) and [https://blog.](https://blog.openai.com/language-unsupervised) [openai.com/language-unsupervised](https://blog.openai.com/language-unsupervised). The GLUE benchmark includes the following datasets, the descriptions of which were originally summarized in [Wang et al.](#page-10-16) [(2018a)](#page-10-16): + +MNLI Multi-Genre Natural Language Inference is a large-scale, crowdsourced entailment classification task [(Williams et al.,](#page-11-0) [2018)](#page-11-0). Given a pair of sentences, the goal is to predict whether the second sentence is an *entailment*, *contradiction*, or *neutral* with respect to the first one. + +QQP Quora Question Pairs is a binary classification task where the goal is to determine if two questions asked on Quora are semantically equivalent [(Chen et al.,](#page-9-20) [2018)](#page-9-20). 
+ +QNLI Question Natural Language Inference is a version of the Stanford Question Answering Dataset [(Rajpurkar et al.,](#page-10-3) [2016)](#page-10-3) which has been converted to a binary classification task [(Wang](#page-10-16) [et al.,](#page-10-16) [2018a)](#page-10-16). The positive examples are (question, sentence) pairs which do contain the correct answer, and the negative examples are (question, sentence) from the same paragraph which do not contain the answer. + +Figure 4: Illustrations of Fine-tuning BERT on Different Tasks. + +SST-2 The Stanford Sentiment Treebank is a binary single-sentence classification task consisting of sentences extracted from movie reviews with human annotations of their sentiment [(Socher](#page-10-14) [et al.,](#page-10-14) [2013)](#page-10-14). + +CoLA The Corpus of Linguistic Acceptability is a binary single-sentence classification task, where the goal is to predict whether an English sentence is linguistically "acceptable" or not [(Warstadt](#page-11-8) [et al.,](#page-11-8) [2018)](#page-11-8). + +STS-B The Semantic Textual Similarity Benchmark is a collection of sentence pairs drawn from news headlines and other sources [(Cer et al.,](#page-9-21) [2017)](#page-9-21). They were annotated with a score from 1 to 5 denoting how similar the two sentences are in terms of semantic meaning. + +MRPC Microsoft Research Paraphrase Corpus consists of sentence pairs automatically extracted from online news sources, with human annotations for whether the sentences in the pair are semantically equivalent [(Dolan and Brockett,](#page-9-3) [2005)](#page-9-3). + +RTE Recognizing Textual Entailment is a binary entailment task similar to MNLI, but with much less training data [(Bentivogli et al.,](#page-9-22) [2009)](#page-9-22).[14](#page-14-1) + +WNLI Winograd NLI is a small natural language inference dataset [(Levesque et al.,](#page-10-24) [2011)](#page-10-24). The GLUE webpage notes that there are issues with the construction of this dataset, [15](#page-14-2) and every trained system that's been submitted to GLUE has performed worse than the 65.1 baseline accuracy of predicting the majority class. We therefore exclude this set to be fair to OpenAI GPT. For our GLUE submission, we always predicted the ma- + +14Note that we only report single-task fine-tuning results in this paper. A multitask fine-tuning approach could potentially push the performance even further. For example, we did observe substantial improvements on RTE from multitask training with MNLI. + +15 + +jority class. + +### C Additional Ablation Studies + +#### C.1 Effect of Number of Training Steps + +Figure [5](#page-15-3) presents MNLI Dev accuracy after finetuning from a checkpoint that has been pre-trained for k steps. This allows us to answer the following questions: + +- 1. Question: Does BERT really need such a large amount of pre-training (128,000 words/batch * 1,000,000 steps) to achieve high fine-tuning accuracy? +Answer: Yes, BERTBASE achieves almost 1.0% additional accuracy on MNLI when trained on 1M steps compared to 500k steps. + +- 2. Question: Does MLM pre-training converge slower than LTR pre-training, since only 15% of words are predicted in each batch rather than every word? +Answer: The MLM model does converge slightly slower than the LTR model. However, in terms of absolute accuracy the MLM model begins to outperform the LTR model almost immediately. 
+ +### C.2 Ablation for Different Masking Procedures + +In Section [3.1,](#page-3-2) we mention that BERT uses a mixed strategy for masking the target tokens when pre-training with the masked language model (MLM) objective. The following is an ablation study to evaluate the effect of different masking strategies. + +Figure 5: Ablation over number of training steps. This shows the MNLI accuracy after fine-tuning, starting from model parameters that have been pre-trained for k steps. The x-axis is the value of k. + +Note that the purpose of the masking strategies is to reduce the mismatch between pre-training and fine-tuning, as the [MASK] symbol never appears during the fine-tuning stage. We report the Dev results for both MNLI and NER. For NER, we report both fine-tuning and feature-based approaches, as we expect the mismatch will be amplified for the feature-based approach as the model will not have the chance to adjust the representations. + + + +| Masking Rates | | | Dev Set Results | | | | +|---------------|------|------|-------------------|-----------------------------------|------|--| +| MASK | SAME | RND | MNLI
Fine-tune | NER
Fine-tune
Feature-based | | | +| 80% | 10% | 10% | 84.2 | 95.4 | 94.9 | | +| 100% | 0% | 0% | 84.3 | 94.9 | 94.0 | | +| 80% | 0% | 20% | 84.1 | 95.2 | 94.6 | | +| 80% | 20% | 0% | 84.4 | 95.2 | 94.7 | | +| 0% | 20% | 80% | 83.7 | 94.8 | 94.6 | | +| 0% | 0% | 100% | 83.6 | 94.9 | 94.6 | | + +Table 8: Ablation over different masking strategies. + +The results are presented in Table [8.](#page-15-4) In the table, MASK means that we replace the target token with the [MASK] symbol for MLM; SAME means that we keep the target token as is; RND means that we replace the target token with another random token. + +The numbers in the left part of the table represent the probabilities of the specific strategies used during MLM pre-training (BERT uses 80%, 10%, 10%). The right part of the paper represents the Dev set results. For the feature-based approach, we concatenate the last 4 layers of BERT as the features, which was shown to be the best approach in Section [5.3.](#page-8-2) + +From the table it can be seen that fine-tuning is surprisingly robust to different masking strategies. However, as expected, using only the MASK strategy was problematic when applying the featurebased approach to NER. Interestingly, using only the RND strategy performs much worse than our strategy as well. diff --git a/examples/custom_output_files/files/rfc8259.md b/examples/custom_output_files/files/rfc8259.md new file mode 100644 index 00000000..bf6c2941 --- /dev/null +++ b/examples/custom_output_files/files/rfc8259.md @@ -0,0 +1,362 @@ +Internet Engineering Task Force (IETF) T. Bray, Ed. Request for Comments: 8259 Textuality Obsoletes: 7159 December 2017 Category: Standards Track ISSN: 2070-1721 + +The JavaScript Object Notation (JSON) Data Interchange Format + +Abstract + + JavaScript Object Notation (JSON) is a lightweight, text-based, language-independent data interchange format. It was derived from the ECMAScript Programming Language Standard. JSON defines a small set of formatting rules for the portable representation of structured data. + + This document removes inconsistencies with other specifications of JSON, repairs specification errors, and offers experience-based interoperability guidance. + +Status of This Memo + +This is an Internet Standards Track document. + + This document is a product of the Internet Engineering Task Force (IETF). It represents the consensus of the IETF community. It has received public review and has been approved for publication by the Internet Engineering Steering Group (IESG). Further information on Internet Standards is available in Section 2 of RFC 7841. + + Information about the current status of this document, any errata, and how to provide feedback on it may be obtained at https://www.rfc-editor.org/info/rfc8259. + +Bray Standards Track [Page 1] + +Copyright Notice + + Copyright (c) 2017 IETF Trust and the persons identified as the document authors. All rights reserved. + + This document is subject to BCP 78 and the IETF Trust's Legal Provisions Relating to IETF Documents (https://trustee.ietf.org/license-info) in effect on the date of publication of this document. Please review these documents carefully, as they describe your rights and restrictions with respect to this document. Code Components extracted from this document must include Simplified BSD License text as described in Section 4.e of the Trust Legal Provisions and are provided without warranty as + +described in the Simplified BSD License. 
+ + This document may contain material from IETF Documents or IETF Contributions published or made publicly available before November 10, 2008. The person(s) controlling the copyright in some of this material may not have granted the IETF Trust the right to allow modifications of such material outside the IETF Standards Process. Without obtaining an adequate license from the person(s) controlling the copyright in such materials, this document may not be modified outside the IETF Standards Process, and derivative works of it may not be created outside the IETF Standards Process, except to format it for publication as an RFC or to translate it into languages other than English. + +Bray Standards Track [Page 2] + +Table of Contents + +| 1.1. Conventions Used in This Document
4
1.2. Specifications of JSON 4
1.3. Introduction to This Revision 5
2. JSON Grammar 5
3. Values 6
4. Objects 6
5. Arrays 7
6. Numbers 7
7. Strings 8
8. String and Character Issues 9
8.1. Character Encoding 9
8.2. Unicode Characters 10
8.3. String Comparison 10
9. Parsers 10
10. Generators 10
11. IANA Considerations 11
12. Security Considerations 12
13. Examples 12
14. References 14
14.1. Normative References 14
14.2. Informative References 14
Appendix A. Changes from RFC 7159 16
Contributors 16 | 1. Introduction | | 3 | +|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------|--|---| +| | | | | +| | | | | +| | | | | +| | | | | +| | | | | +| | | | | +| | | | | +| | | | | +| | | | | +| | | | | +| | | | | +| | | | | +| | | | | +| | | | | +| | | | | +| | | | | +| | | | | +| | | | | +| | | | | +| | | | | +| | | | | +| | | | | +| | | | | +| | Author's Address 16 | | | + +## 1. Introduction + + JavaScript Object Notation (JSON) is a text format for the serialization of structured data. It is derived from the object literals of JavaScript, as defined in the ECMAScript Programming Language Standard, Third Edition [ECMA-262]. + + JSON can represent four primitive types (strings, numbers, booleans, and null) and two structured types (objects and arrays). + + A string is a sequence of zero or more Unicode characters [UNICODE]. Note that this citation references the latest version of Unicode rather than a specific release. It is not expected that future changes in the Unicode specification will impact the syntax of JSON. + + An object is an unordered collection of zero or more name/value pairs, where a name is a string and a value is a string, number, boolean, null, object, or array. + +An array is an ordered sequence of zero or more values. + +Bray Standards Track [Page 3] + + The terms "object" and "array" come from the conventions of JavaScript. + + JSON's design goals were for it to be minimal, portable, textual, and a subset of JavaScript. + +1.1. Conventions Used in This Document + + The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in BCP 14 [RFC2119] [RFC8174] when, and only when, they appear in all capitals, as shown here. + + The grammatical rules in this document are to be interpreted as described in [RFC5234]. + +1.2. Specifications of JSON + + This document replaces [RFC7159]. [RFC7159] obsoleted [RFC4627], which originally described JSON and registered the media type "application/json". + +JSON is also described in [ECMA-404]. + + The reference to ECMA-404 in the previous sentence is normative, not with the usual meaning that implementors need to consult it in order to understand this document, but to emphasize that there are no inconsistencies in the definition of the term "JSON text" in any of its specifications. Note, however, that ECMA-404 allows several practices that this specification recommends avoiding in the interests of maximal interoperability. + + The intent is that the grammar is the same between the two documents, although different descriptions are used. If there is a difference found between them, ECMA and the IETF will work together to update both documents. 
+ + If an error is found with either document, the other should be examined to see if it has a similar error; if it does, it should be fixed, if possible. + + If either document is changed in the future, ECMA and the IETF will work together to ensure that the two documents stay aligned through the change. + +Bray Standards Track [Page 4] + +- 1.3. Introduction to This Revision + In the years since the publication of RFC 4627, JSON has found very wide use. This experience has revealed certain patterns that, while allowed by its specifications, have caused interoperability problems. + + Also, a small number of errata have been reported regarding RFC 4627 (see RFC Errata IDs 607 [Err607] and 3607 [Err3607]) and regarding RFC 7159 (see RFC Errata IDs 3915 [Err3915], 4264 [Err4264], 4336 [Err4336], and 4388 [Err4388]). + + This document's goal is to apply the errata, remove inconsistencies with other specifications of JSON, and highlight practices that can lead to interoperability problems. + +- 2. JSON Grammar + A JSON text is a sequence of tokens. The set of tokens includes six structural characters, strings, numbers, and three literal names. + + A JSON text is a serialized value. Note that certain previous specifications of JSON constrained a JSON text to be an object or an array. Implementations that generate only objects or arrays where a JSON text is called for will be interoperable in the sense that all implementations will accept these as conforming JSON texts. + +JSON-text = ws value ws + +These are the six structural characters: + +| begin-array | | | | = ws %x5B ws ; [ left square bracket | +|----------------------------------------|--|--|--|---------------------------------------| +| begin-object | | | | = ws %x7B ws ; { left curly bracket | +| end-array | | | | = ws %x5D ws ; ] right square bracket | +| end-object | | | | = ws %x7D ws ; } right curly bracket | +| name-separator = ws %x3A ws ; : colon | | | | | +| value-separator = ws %x2C ws ; , comma | | | | | + +Bray Standards Track [Page 5] + + Insignificant whitespace is allowed before or after any of the six structural characters. + + ws = *( %x20 / ; Space %x09 / ; Horizontal tab %x0A / ; Line feed or New line %x0D ) ; Carriage return + +## 3. Values + + A JSON value MUST be an object, array, number, or string, or one of the following three literal names: + + false null true + + The literal names MUST be lowercase. No other literal names are allowed. + + value = false / null / true / object / array / number / string false = %x66.61.6c.73.65 ; false null = %x6e.75.6c.6c ; null true = %x74.72.75.65 ; true + +## 4. Objects + + An object structure is represented as a pair of curly brackets surrounding zero or more name/value pairs (or members). A name is a string. A single colon comes after each name, separating the name from the value. A single comma separates a value from a following name. The names within an object SHOULD be unique. + + object = begin-object [ member *( value-separator member ) ] end-object + +member = string name-separator value + + An object whose names are all unique is interoperable in the sense that all software implementations receiving that object will agree on the name-value mappings. When the names within an object are not unique, the behavior of software that receives such an object is unpredictable. Many implementations report the last name/value pair only. 
Other implementations report an error or fail to parse the + +Bray Standards Track [Page 6] + + object, and some implementations report all of the name/value pairs, including duplicates. + + JSON parsing libraries have been observed to differ as to whether or not they make the ordering of object members visible to calling software. Implementations whose behavior does not depend on member ordering will be interoperable in the sense that they will not be affected by these differences. + +5. Arrays + + An array structure is represented as square brackets surrounding zero or more values (or elements). Elements are separated by commas. + +array = begin-array [ value *( value-separator value ) ] end-array + + There is no requirement that the values in an array be of the same type. + +6. Numbers + + The representation of numbers is similar to that used in most programming languages. A number is represented in base 10 using decimal digits. It contains an integer component that may be prefixed with an optional minus sign, which may be followed by a fraction part and/or an exponent part. Leading zeros are not allowed. + +A fraction part is a decimal point followed by one or more digits. + + An exponent part begins with the letter E in uppercase or lowercase, which may be followed by a plus or minus sign. The E and optional sign are followed by one or more digits. + + Numeric values that cannot be represented in the grammar below (such as Infinity and NaN) are not permitted. + + number = [ minus ] int [ frac ] [ exp ] decimal-point = %x2E ; . digit1-9 = %x31-39 ; 1-9 e = %x65 / %x45 ; e E exp = e [ minus / plus ] 1*DIGIT frac = decimal-point 1*DIGIT + +Bray Standards Track [Page 7] + + int = zero / ( digit1-9 *DIGIT ) minus = %x2D ; plus = %x2B ; + zero = %x30 ; 0 + + This specification allows implementations to set limits on the range and precision of numbers accepted. Since software that implements IEEE 754 binary64 (double precision) numbers [IEEE754] is generally available and widely used, good interoperability can be achieved by implementations that expect no more precision or range than these provide, in the sense that implementations will approximate JSON numbers within the expected precision. A JSON number such as 1E400 or 3.141592653589793238462643383279 may indicate potential interoperability problems, since it suggests that the software that created it expects receiving software to have greater capabilities for numeric magnitude and precision than is widely available. + + Note that when such software is used, numbers that are integers and are in the range [-(2**53)+1, (2**53)-1] are interoperable in the sense that implementations will agree exactly on their numeric values. + +## 7. Strings + + The representation of strings is similar to conventions used in the C family of programming languages. A string begins and ends with quotation marks. All Unicode characters may be placed within the quotation marks, except for the characters that MUST be escaped: quotation mark, reverse solidus, and the control characters (U+0000 through U+001F). + + Any character may be escaped. If the character is in the Basic Multilingual Plane (U+0000 through U+FFFF), then it may be represented as a six-character sequence: a reverse solidus, followed by the lowercase letter u, followed by four hexadecimal digits that encode the character's code point. The hexadecimal letters A through F can be uppercase or lowercase. 
So, for example, a string containing only a single reverse solidus character may be represented as "\u005C". + + Alternatively, there are two-character sequence escape representations of some popular characters. So, for example, a string containing only a single reverse solidus character may be represented more compactly as "\\". + +Bray Standards Track [Page 8] + + To escape an extended character that is not in the Basic Multilingual Plane, the character is represented as a 12-character sequence, encoding the UTF-16 surrogate pair. So, for example, a string containing only the G clef character (U+1D11E) may be represented as "\uD834\uDD1E". + + string = quotation-mark *char quotation-mark char = unescaped / escape ( %x22 / ; " quotation mark U+0022 %x5C / ; \ reverse solidus U+005C %x2F / ; / solidus U+002F %x62 / ; b backspace U+0008 %x66 / ; f form feed U+000C %x6E / ; n line feed U+000A %x72 / ; r carriage return U+000D %x74 / ; t tab U+0009 %x75 4HEXDIG ) ; uXXXX U+XXXX escape = %x5C ; \ quotation-mark = %x22 ; " unescaped = %x20-21 / %x23-5B / %x5D-10FFFF + +# 8. String and Character Issues + +- 8.1. Character Encoding + JSON text exchanged between systems that are not part of a closed ecosystem MUST be encoded using UTF-8 [RFC3629]. + + Previous specifications of JSON have not required the use of UTF-8 when transmitting JSON text. However, the vast majority of JSON based software implementations have chosen to use the UTF-8 encoding, to the extent that it is the only encoding that achieves interoperability. + + Implementations MUST NOT add a byte order mark (U+FEFF) to the beginning of a networked-transmitted JSON text. In the interests of interoperability, implementations that parse JSON texts MAY ignore the presence of a byte order mark rather than treating it as an error. + +Bray Standards Track [Page 9] + +## 8.2. Unicode Characters + + When all the strings represented in a JSON text are composed entirely of Unicode characters [UNICODE] (however escaped), then that JSON text is interoperable in the sense that all software implementations that parse it will agree on the contents of names and of string values in objects and arrays. + + However, the ABNF in this specification allows member names and string values to contain bit sequences that cannot encode Unicode characters; for example, "\uDEAD" (a single unpaired UTF-16 surrogate). Instances of this have been observed, for example, when a library truncates a UTF-16 string without checking whether the truncation split a surrogate pair. The behavior of software that receives JSON texts containing such values is unpredictable; for example, implementations might return different values for the length of a string value or even suffer fatal runtime exceptions. + +## 8.3. String Comparison + + Software implementations are typically required to test names of object members for equality. Implementations that transform the textual representation into sequences of Unicode code units and then perform the comparison numerically, code unit by code unit, are interoperable in the sense that implementations will agree in all cases on equality or inequality of two strings. For example, implementations that compare strings with escaped characters unconverted may incorrectly find that "a\\b" and "a\u005Cb" are not equal. + +## 9. Parsers + + A JSON parser transforms a JSON text into another representation. A JSON parser MUST accept all texts that conform to the JSON grammar. A JSON parser MAY accept non-JSON forms or extensions. 
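+
+   As an illustration only (not part of this specification), the snippet
+   below shows how one widely used parser, Python's standard json module,
+   handles constructs discussed in Sections 4, 7, and 9.  The duplicate-name
+   behavior shown is just one of the outcomes described above as
+   unpredictable across implementations.
+
+```python
+import json
+
+# Three spellings of the same one-character string: a single reverse solidus (Section 7).
+assert json.loads('"\\u005C"') == json.loads('"\\\\"') == "\\"
+
+# A character outside the Basic Multilingual Plane is escaped as a UTF-16 surrogate pair.
+assert json.loads('"\\uD834\\uDD1E"') == "\U0001D11E"  # U+1D11E, MUSICAL SYMBOL G CLEF
+
+# Duplicate member names (Section 4): this parser keeps the last pair; others may differ.
+assert json.loads('{"a": 1, "a": 2}') == {"a": 2}
+```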
+ + An implementation may set limits on the size of texts that it accepts. An implementation may set limits on the maximum depth of nesting. An implementation may set limits on the range and precision of numbers. An implementation may set limits on the length and character contents of strings. + +## 10. Generators + + A JSON generator produces JSON text. The resulting text MUST strictly conform to the JSON grammar. + +Bray Standards Track [Page 10] + +11. IANA Considerations + +The media type for JSON text is application/json. + +Type name: application + +Subtype name: json + +Required parameters: n/a + +Optional parameters: n/a + +Encoding considerations: binary + +Security considerations: See RFC 8259, Section 12 + +Interoperability considerations: Described in RFC 8259 + +Published specification: RFC 8259 + + Applications that use this media type: JSON has been used to exchange data between applications written in all of these programming languages: ActionScript, C, C#, Clojure, ColdFusion, Common Lisp, E, Erlang, Go, Java, JavaScript, Lua, Objective CAML, Perl, PHP, Python, Rebol, Ruby, Scala, and Scheme. + + Additional information: Magic number(s): n/a File extension(s): .json Macintosh file type code(s): TEXT + + Person & email address to contact for further information: IESG + +Intended usage: COMMON + +Restrictions on usage: none + + Author: Douglas Crockford + + Change controller: IESG + +Bray Standards Track [Page 11] + + Note: No "charset" parameter is defined for this registration. Adding one really has no effect on compliant recipients. + +- 12. Security Considerations + Generally, there are security issues with scripting languages. JSON is a subset of JavaScript but excludes assignment and invocation. + + Since JSON's syntax is borrowed from JavaScript, it is possible to use that language's "eval()" function to parse most JSON texts (but not all; certain characters such as U+2028 LINE SEPARATOR and U+2029 PARAGRAPH SEPARATOR are legal in JSON but not JavaScript). This generally constitutes an unacceptable security risk, since the text could contain executable code along with data declarations. The same consideration applies to the use of eval()-like functions in any other programming language in which JSON texts conform to that language's syntax. + +## 13. Examples + + This is a JSON object: { "Image": { "Width": 800, "Height": 600, "Title": "View from 15th Floor", "Thumbnail": { "Url": "http://www.example.com/image/481989943", "Height": 125, "Width": 100 }, "Animated" : false, "IDs": [116, 943, 234, 38793] } } + + Its Image member is an object whose Thumbnail member is an object and whose IDs member is an array of numbers. + +Bray Standards Track [Page 12] + +``` + This is a JSON array containing two objects: +[ +{ +"precision": "zip", +"Latitude": 37.7668, +"Longitude": -122.3959, +"Address": "", +"City": "SAN FRANCISCO", +"State": "CA", +"Zip": "94107", +"Country": "US" +}, +{ +"precision": "zip", +"Latitude": 37.371991, +"Longitude": -122.026020, +"Address": "", +"City": "SUNNYVALE", +"State": "CA", +"Zip": "94085", +"Country": "US" +} +] +Here are three small JSON texts containing only values: +"Hello world!" +42 +true +``` +Bray Standards Track [Page 13] + +## 14. References + +- 14.1. Normative References +- [ECMA-404] Ecma International, "The JSON Data Interchange Format", Standard ECMA-404, . +- [IEEE754] IEEE, "IEEE Standard for Floating-Point Arithmetic", IEEE 754. 
+- [RFC2119] Bradner, S., "Key words for use in RFCs to Indicate Requirement Levels", BCP 14, RFC 2119, DOI 10.17487/RFC2119, March 1997, . +- [RFC3629] Yergeau, F., "UTF-8, a transformation format of ISO 10646", STD 63, RFC 3629, DOI 10.17487/RFC3629, November 2003, . +- [RFC5234] Crocker, D., Ed. and P. Overell, "Augmented BNF for Syntax Specifications: ABNF", STD 68, RFC 5234, DOI 10.17487/RFC5234, January 2008, . +- [RFC8174] Leiba, B., "Ambiguity of Uppercase vs Lowercase in RFC 2119 Key Words", BCP 14, RFC 8174, DOI 10.17487/RFC8174, May 2017, . +- [UNICODE] The Unicode Consortium, "The Unicode Standard", . +- 14.2. Informative References +- [ECMA-262] Ecma International, "ECMAScript Language Specification", Standard ECMA-262, Third Edition, December 1999, . +- [Err3607] RFC Errata, Erratum ID 3607, RFC 4627, . +- [Err3915] RFC Errata, Erratum ID 3915, RFC 7159, . + +Bray Standards Track [Page 14] + +- [Err4264] RFC Errata, Erratum ID 4264, RFC 7159, . +- [Err4336] RFC Errata, Erratum ID 4336, RFC 7159, . +- [Err4388] RFC Errata, Erratum ID 4388, RFC 7159, . +- [Err607] RFC Errata, Erratum ID 607, RFC 4627, . +- [RFC4627] Crockford, D., "The application/json Media Type for JavaScript Object Notation (JSON)", RFC 4627, DOI 10.17487/RFC4627, July 2006, . +- [RFC7159] Bray, T., Ed., "The JavaScript Object Notation (JSON) Data Interchange Format", RFC 7159, DOI 10.17487/RFC7159, March 2014, . + +Bray Standards Track [Page 15] + +Appendix A. Changes from RFC 7159 + + This section lists changes between this document and the text in RFC 7159. + +- o Section 1.2 has been updated to reflect the removal of a JSON specification from ECMA-262, to make ECMA-404 a normative reference, and to explain the particular meaning of "normative". +- o Section 1.3 has been updated to reflect errata filed against RFC 7159, not RFC 4627. +- o Section 8.1 was changed to require the use of UTF-8 when transmitted over a network. +- o Section 12 has been updated to increase the precision of the description of the security risk that follows from using the ECMAScript "eval()" function. +- o Section 14.1 has been updated to include ECMA-404 as a normative reference. +- o Section 14.2 has been updated to remove ECMA-404, update the version of ECMA-262, and refresh the errata list. + +Contributors + + RFC 4627 was written by Douglas Crockford. This document was constructed by making a relatively small number of changes to that document; thus, the vast majority of the text here is his. + +Author's Address + + Tim Bray (editor) Textuality + +Email: tbray@textuality.com + +Bray Standards Track [Page 16] diff --git a/examples/custom_output_files/main.py b/examples/custom_output_files/main.py new file mode 100644 index 00000000..b8bc619a --- /dev/null +++ b/examples/custom_output_files/main.py @@ -0,0 +1,29 @@ +import cocoindex +from markdown_it import MarkdownIt +from datetime import timedelta + +_markdown_it = MarkdownIt("gfm-like") + + +@cocoindex.op.function() +def markdown_to_html(text: str) -> str: + return _markdown_it.render(text) + + +@cocoindex.flow_def(name="CustomOutputFiles") +def custom_output_files( + flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope +) -> None: + """ + Define an example flow that embeds text into a vector database. 
+ """ + data_scope["documents"] = flow_builder.add_source( + cocoindex.sources.LocalFile(path="files", included_patterns=["*.md"]), + refresh_interval=timedelta(seconds=5), + ) + + output_html = data_scope.add_collector() + + with data_scope["documents"].row() as doc: + doc["html"] = doc["content"].transform(markdown_to_html) + output_html.collect(filename=doc["filename"], html=doc["html"]) diff --git a/examples/custom_output_files/pyproject.toml b/examples/custom_output_files/pyproject.toml new file mode 100644 index 00000000..0ecfa9c5 --- /dev/null +++ b/examples/custom_output_files/pyproject.toml @@ -0,0 +1,13 @@ +[project] +name = "custom-output-files" +version = "0.1.0" +description = "Simple example for cocoindex: build embedding index based on local text files." +requires-python = ">=3.11" +dependencies = [ + "cocoindex>=0.1.67", + "markdown", + "markdown-it-py[linkify,plugins]", +] + +[tool.setuptools] +packages = [] From 9ace80ec85da0e7078158a331cd13d53a17ae07b Mon Sep 17 00:00:00 2001 From: Jiangzhou He Date: Thu, 24 Jul 2025 22:04:52 -0700 Subject: [PATCH 2/9] example: add `LocalFileTargetExecutor` with reasonable interface --- examples/custom_output_files/main.py | 76 +++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) diff --git a/examples/custom_output_files/main.py b/examples/custom_output_files/main.py index b8bc619a..d481f069 100644 --- a/examples/custom_output_files/main.py +++ b/examples/custom_output_files/main.py @@ -1,10 +1,80 @@ import cocoindex from markdown_it import MarkdownIt from datetime import timedelta +import os +import dataclasses _markdown_it = MarkdownIt("gfm-like") +class LocalFileTarget(cocoindex.op.TargetSpec): + directory: str + + +@dataclasses.dataclass +class LocalFileTargetValues: + content: str + + +@cocoindex.op.target_connector(spec_cls=LocalFileTarget) +class LocalFileTargetExecutor: + @staticmethod + def get_persistent_key(spec: LocalFileTarget, target_name: str) -> str: + return spec.directory + + @staticmethod + def describe(key: str) -> str: + return f"Local directory {key}" + + @staticmethod + def apply_setup_change( + key: str, previous: LocalFileTarget | None, current: LocalFileTarget | None + ) -> None: + if previous is None and current is not None: + os.makedirs(current.directory, exist_ok=True) + + if previous is not None and current is None: + for filename in os.listdir(previous.directory): + if filename.endswith(".html"): + os.remove(os.path.join(previous.directory, filename)) + try: + os.rmdir(previous.directory) + except (FileExistsError, FileNotFoundError): + pass + + @staticmethod + def prepare(spec: LocalFileTarget) -> LocalFileTarget: + """ + Prepare for execution. To run common operations before applying any mutations. + The returned value will be passed as the first element of tuples in `mutate` method. + + This is optional. If not provided, will directly pass the spec to `mutate` method. + """ + return spec + + @staticmethod + def mutate( + *all_mutations: tuple[LocalFileTarget, dict[str, LocalFileTargetValues | None]], + ) -> None: + """ + Mutate the target. + + The first element of the tuple is the target spec. + The second element is a dictionary of mutations. + The key is the filename, and the value is the mutation. + If the value is `None`, the file will be removed. + Otherwise, the file will be written with the content. 
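+
+        Illustrative example (hypothetical filenames and values): a single call
+        might look like
+
+            mutate((spec, {"doc_a": LocalFileTargetValues(content="<p>hi</p>"),
+                           "doc_b": None}))
+
+        which would write `doc_a.html` and delete `doc_b.html` under `spec.directory`.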
+ """ + for spec, mutations in all_mutations: + for filename, mutation in mutations.items(): + full_path = os.path.join(spec.directory, filename) + ".html" + if mutation is None: + os.remove(full_path) + else: + with open(full_path, "w") as f: + f.write(mutation.content) + + @cocoindex.op.function() def markdown_to_html(text: str) -> str: return _markdown_it.render(text) @@ -23,7 +93,11 @@ def custom_output_files( ) output_html = data_scope.add_collector() - with data_scope["documents"].row() as doc: doc["html"] = doc["content"].transform(markdown_to_html) output_html.collect(filename=doc["filename"], html=doc["html"]) + + output_html.export( + "output_html", + cocoindex.targets.LocalFile(directory="output_html"), + ) From 5f99683a28c586e0386338f3efdba5f9484b81ef Mon Sep 17 00:00:00 2001 From: Jiangzhou He Date: Sat, 26 Jul 2025 11:53:44 -0700 Subject: [PATCH 3/9] example: fix example for `custom_output_files` --- examples/custom_output_files/main.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/custom_output_files/main.py b/examples/custom_output_files/main.py index d481f069..fba73b4f 100644 --- a/examples/custom_output_files/main.py +++ b/examples/custom_output_files/main.py @@ -17,7 +17,7 @@ class LocalFileTargetValues: @cocoindex.op.target_connector(spec_cls=LocalFileTarget) -class LocalFileTargetExecutor: +class LocalFileTargetConnector: @staticmethod def get_persistent_key(spec: LocalFileTarget, target_name: str) -> str: return spec.directory @@ -30,17 +30,16 @@ def describe(key: str) -> str: def apply_setup_change( key: str, previous: LocalFileTarget | None, current: LocalFileTarget | None ) -> None: + print("apply_setup_change", key, previous, current) if previous is None and current is not None: os.makedirs(current.directory, exist_ok=True) if previous is not None and current is None: - for filename in os.listdir(previous.directory): - if filename.endswith(".html"): - os.remove(os.path.join(previous.directory, filename)) - try: + if os.path.exists(previous.directory): + for filename in os.listdir(previous.directory): + if filename.endswith(".html"): + os.remove(os.path.join(previous.directory, filename)) os.rmdir(previous.directory) - except (FileExistsError, FileNotFoundError): - pass @staticmethod def prepare(spec: LocalFileTarget) -> LocalFileTarget: @@ -98,6 +97,7 @@ def custom_output_files( output_html.collect(filename=doc["filename"], html=doc["html"]) output_html.export( - "output_html", - cocoindex.targets.LocalFile(directory="output_html"), + "OutputHtml", + LocalFileTarget(directory="output_html"), + primary_key_fields=["filename"], ) From 4b7e25eb624239d0d3f81c0a77ddc197564a6a2b Mon Sep 17 00:00:00 2001 From: Jiangzhou He Date: Sat, 26 Jul 2025 18:14:40 -0700 Subject: [PATCH 4/9] example: fix field name --- examples/custom_output_files/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/custom_output_files/main.py b/examples/custom_output_files/main.py index fba73b4f..efb402cb 100644 --- a/examples/custom_output_files/main.py +++ b/examples/custom_output_files/main.py @@ -13,7 +13,7 @@ class LocalFileTarget(cocoindex.op.TargetSpec): @dataclasses.dataclass class LocalFileTargetValues: - content: str + html: str @cocoindex.op.target_connector(spec_cls=LocalFileTarget) @@ -71,7 +71,7 @@ def mutate( os.remove(full_path) else: with open(full_path, "w") as f: - f.write(mutation.content) + f.write(mutation.html) @cocoindex.op.function() From 6fee363a42b3653af42d247739b7621bbf19df59 Mon Sep 17 
00:00:00 2001 From: Jiangzhou He Date: Sat, 26 Jul 2025 18:25:18 -0700 Subject: [PATCH 5/9] example: add `.gitignore` --- examples/custom_output_files/.gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 examples/custom_output_files/.gitignore diff --git a/examples/custom_output_files/.gitignore b/examples/custom_output_files/.gitignore new file mode 100644 index 00000000..61e0e829 --- /dev/null +++ b/examples/custom_output_files/.gitignore @@ -0,0 +1 @@ +output_html/ From b4feaacc196165ea255689d2240522d25a98ab27 Mon Sep 17 00:00:00 2001 From: Jiangzhou He Date: Sat, 26 Jul 2025 19:16:35 -0700 Subject: [PATCH 6/9] example: clean up --- examples/custom_output_files/main.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/custom_output_files/main.py b/examples/custom_output_files/main.py index efb402cb..6fb55f39 100644 --- a/examples/custom_output_files/main.py +++ b/examples/custom_output_files/main.py @@ -30,12 +30,11 @@ def describe(key: str) -> str: def apply_setup_change( key: str, previous: LocalFileTarget | None, current: LocalFileTarget | None ) -> None: - print("apply_setup_change", key, previous, current) if previous is None and current is not None: os.makedirs(current.directory, exist_ok=True) if previous is not None and current is None: - if os.path.exists(previous.directory): + if os.path.isdir(previous.directory): for filename in os.listdir(previous.directory): if filename.endswith(".html"): os.remove(os.path.join(previous.directory, filename)) From 927d340fed4636a359c9996e629b363d570b3673 Mon Sep 17 00:00:00 2001 From: Jiangzhou He Date: Sat, 26 Jul 2025 22:31:23 -0700 Subject: [PATCH 7/9] example(custom-output-files): update comments --- .../data/bizarre_animals.md | 21 + .../custom_output_files/data/chunk_norris.md | 19 + .../custom_output_files/files/1706.03762v7.md | 354 ------------ .../custom_output_files/files/1810.04805v2.md | 530 ------------------ examples/custom_output_files/files/rfc8259.md | 362 ------------ examples/custom_output_files/main.py | 19 +- 6 files changed, 55 insertions(+), 1250 deletions(-) create mode 100644 examples/custom_output_files/data/bizarre_animals.md create mode 100644 examples/custom_output_files/data/chunk_norris.md delete mode 100644 examples/custom_output_files/files/1706.03762v7.md delete mode 100644 examples/custom_output_files/files/1810.04805v2.md delete mode 100644 examples/custom_output_files/files/rfc8259.md diff --git a/examples/custom_output_files/data/bizarre_animals.md b/examples/custom_output_files/data/bizarre_animals.md new file mode 100644 index 00000000..013e7a73 --- /dev/null +++ b/examples/custom_output_files/data/bizarre_animals.md @@ -0,0 +1,21 @@ +In the spirit of Project Zeta’s innovative chaos, here’s a collection of absurdly true facts about the weirdest animals you’ve never heard of: + +1. **Tardigrade (Water Bear)**: This microscopic beast can survive outer space, radiation, and being boiled alive. It once crashed a team meeting by stowing away in Bob’s coffee mug and demanding admin access to the server. + +2. **Aye-Aye**: A Madagascar primate with a creepy long finger it uses to tap trees for grubs. It tried to “debug” our codebase by tapping the keyboard, resulting in 47 nested for-loops. + +3. **Saiga Antelope**: This goofy-nosed critter looks like it’s auditioning for a sci-fi flick. Its sneezes are so powerful they once blew out the office Wi-Fi during a sprint review. + +4. 
**Glaucus Atlanticus (Blue Dragon Sea Slug)**: This tiny ocean dragon steals venom from jellyfish and uses it like a borrowed superpower. It infiltrated our water cooler and left behind a sparkly, toxic trail. + +5. **Pink Fairy Armadillo**: A palm-sized digger that looks like a cotton candy tank. It burrowed into the office carpet, mistaking it for a desert, and now we have a “no armadillos” policy. + +6. **Dumbo Octopus**: A deep-sea octopus with ear-like fins, flapping around like it’s late for a Zoom call. It once rewired our projector to display memes of itself across the office. + +7. **Jerboa**: A hopping desert rodent with kangaroo vibes. It stole the team’s snacks and leaped over three cubicles before anyone noticed, earning the codename "Snack Bandit." + +8. **Mantis Shrimp**: This crustacean sees more colors than our graphic designer and punches harder than a failing CI pipeline. It shattered a monitor when we tried to pair-program with it. + +9. **Okapi**: A zebra-giraffe hybrid that looks like a Photoshop error. It wandered into our sprint planning and suggested we pivot to a “forest-themed” microservices architecture. + +10. **Blobfish**: The ocean’s saddest-looking blob, voted “Most Likely to Crash a Stand-Up” by the team. Its mere presence caused our morale bot to send 200 crying emojis. diff --git a/examples/custom_output_files/data/chunk_norris.md b/examples/custom_output_files/data/chunk_norris.md new file mode 100644 index 00000000..89952641 --- /dev/null +++ b/examples/custom_output_files/data/chunk_norris.md @@ -0,0 +1,19 @@ +# Chuck Norris Project Facts +Date: 2025-07-20 +Author: Anonymous (because Chuck Norris knows who you are) + +Here are some totally true facts about Chuck Norris's involvement in Project Omega: + +1. Chuck Norris doesn't write code; he stares at the computer until it writes itself out of fear. +2. The project deadline was yesterday, but time rescheduled itself to accommodate Chuck Norris. +3. Chuck Norris's code never has bugs—just "features" that are too scared to misbehave. +4. When the database crashed, Chuck Norris roundhouse-kicked the server, and it apologized. +5. The team tried to use Agile, but Chuck Norris declared, "I am the only methodology you need." +6. Version control? Chuck Norris is the only version that matters. +7. The project scope expanded because Chuck Norris added "world domination" as a deliverable. +8. When the CI/CD pipeline failed, Chuck Norris rebuilt it with a single grunt. +9. The codebase is 100% documented because no one dares ask Chuck Norris, "What does this do?" +10. Chuck Norris doesn't deploy to production; production deploys to Chuck Norris. + +Last updated: 2025-07-20 06:36 AM MST +Note: If you modify this file, Chuck Norris will know... and he’ll find you. diff --git a/examples/custom_output_files/files/1706.03762v7.md b/examples/custom_output_files/files/1706.03762v7.md deleted file mode 100644 index 665a1972..00000000 --- a/examples/custom_output_files/files/1706.03762v7.md +++ /dev/null @@ -1,354 +0,0 @@ -Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works. - -# Attention Is All You Need - -Ashish Vaswani∗ Google Brain avaswani@google.com - -Llion Jones∗ Google Research llion@google.com - -Noam Shazeer∗ Google Brain noam@google.com - -Aidan N. 
Gomez∗ † University of Toronto aidan@cs.toronto.edu - -Niki Parmar∗ Google Research nikip@google.com - -Jakob Uszkoreit∗ Google Research usz@google.com - -Ɓukasz Kaiser∗ Google Brain lukaszkaiser@google.com - -Illia Polosukhin∗ ‡ illia.polosukhin@gmail.com - -### Abstract - -The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 Englishto-German translation task, improving over the existing best results, including ensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data. - -∗Equal contribution. Listing order is random. Jakob proposed replacing RNNs with self-attention and started the effort to evaluate this idea. Ashish, with Illia, designed and implemented the first Transformer models and has been crucially involved in every aspect of this work. Noam proposed scaled dot-product attention, multi-head attention and the parameter-free position representation and became the other person involved in nearly every detail. Niki designed, implemented, tuned and evaluated countless model variants in our original codebase and tensor2tensor. Llion also experimented with novel model variants, was responsible for our initial codebase, and efficient inference and visualizations. Lukasz and Aidan spent countless long days designing various parts of and implementing tensor2tensor, replacing our earlier codebase, greatly improving results and massively accelerating our research. - -†Work performed while at Google Brain. - -‡Work performed while at Google Research. - -### 1 Introduction - -Recurrent neural networks, long short-term memory [\[13\]](#page-10-0) and gated recurrent [\[7\]](#page-10-1) neural networks in particular, have been firmly established as state of the art approaches in sequence modeling and transduction problems such as language modeling and machine translation [\[35,](#page-11-0) [2,](#page-9-0) [5\]](#page-10-2). Numerous efforts have since continued to push the boundaries of recurrent language models and encoder-decoder architectures [\[38,](#page-11-1) [24,](#page-10-3) [15\]](#page-10-4). - -Recurrent models typically factor computation along the symbol positions of the input and output sequences. Aligning the positions to steps in computation time, they generate a sequence of hidden states ht, as a function of the previous hidden state ht−1 and the input for position t. This inherently sequential nature precludes parallelization within training examples, which becomes critical at longer sequence lengths, as memory constraints limit batching across examples. 
Recent work has achieved significant improvements in computational efficiency through factorization tricks [\[21\]](#page-10-5) and conditional computation [\[32\]](#page-11-2), while also improving model performance in case of the latter. The fundamental constraint of sequential computation, however, remains. - -Attention mechanisms have become an integral part of compelling sequence modeling and transduction models in various tasks, allowing modeling of dependencies without regard to their distance in the input or output sequences [\[2,](#page-9-0) [19\]](#page-10-6). In all but a few cases [\[27\]](#page-11-3), however, such attention mechanisms are used in conjunction with a recurrent network. - -In this work we propose the Transformer, a model architecture eschewing recurrence and instead relying entirely on an attention mechanism to draw global dependencies between input and output. The Transformer allows for significantly more parallelization and can reach a new state of the art in translation quality after being trained for as little as twelve hours on eight P100 GPUs. - -## 2 Background - -The goal of reducing sequential computation also forms the foundation of the Extended Neural GPU [\[16\]](#page-10-7), ByteNet [\[18\]](#page-10-8) and ConvS2S [\[9\]](#page-10-9), all of which use convolutional neural networks as basic building block, computing hidden representations in parallel for all input and output positions. In these models, the number of operations required to relate signals from two arbitrary input or output positions grows in the distance between positions, linearly for ConvS2S and logarithmically for ByteNet. This makes it more difficult to learn dependencies between distant positions [\[12\]](#page-10-10). In the Transformer this is reduced to a constant number of operations, albeit at the cost of reduced effective resolution due to averaging attention-weighted positions, an effect we counteract with Multi-Head Attention as described in section [3.2.](#page-2-0) - -Self-attention, sometimes called intra-attention is an attention mechanism relating different positions of a single sequence in order to compute a representation of the sequence. Self-attention has been used successfully in a variety of tasks including reading comprehension, abstractive summarization, textual entailment and learning task-independent sentence representations [\[4,](#page-9-1) [27,](#page-11-3) [28,](#page-11-4) [22\]](#page-10-11). - -End-to-end memory networks are based on a recurrent attention mechanism instead of sequencealigned recurrence and have been shown to perform well on simple-language question answering and language modeling tasks [\[34\]](#page-11-5). - -To the best of our knowledge, however, the Transformer is the first transduction model relying entirely on self-attention to compute representations of its input and output without using sequencealigned RNNs or convolution. In the following sections, we will describe the Transformer, motivate self-attention and discuss its advantages over models such as [\[17,](#page-10-12) [18\]](#page-10-8) and [\[9\]](#page-10-9). - -### 3 Model Architecture - -Most competitive neural sequence transduction models have an encoder-decoder structure [\[5,](#page-10-2) [2,](#page-9-0) [35\]](#page-11-0). Here, the encoder maps an input sequence of symbol representations (x1, ..., xn) to a sequence of continuous representations z = (z1, ..., zn). 
Given z, the decoder then generates an output sequence (y1, ..., ym) of symbols one element at a time. At each step the model is auto-regressive [\[10\]](#page-10-13), consuming the previously generated symbols as additional input when generating the next. - -Figure 1: The Transformer - model architecture. - -The Transformer follows this overall architecture using stacked self-attention and point-wise, fully connected layers for both the encoder and decoder, shown in the left and right halves of Figure [1,](#page-2-1) respectively. - -### 3.1 Encoder and Decoder Stacks - -Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, positionwise fully connected feed-forward network. We employ a residual connection [\[11\]](#page-10-14) around each of the two sub-layers, followed by layer normalization [\[1\]](#page-9-2). That is, the output of each sub-layer is LayerNorm(x + Sublayer(x)), where Sublayer(x) is the function implemented by the sub-layer itself. To facilitate these residual connections, all sub-layers in the model, as well as the embedding layers, produce outputs of dimension dmodel = 512. - -Decoder: The decoder is also composed of a stack of N = 6 identical layers. In addition to the two sub-layers in each encoder layer, the decoder inserts a third sub-layer, which performs multi-head attention over the output of the encoder stack. Similar to the encoder, we employ residual connections around each of the sub-layers, followed by layer normalization. We also modify the self-attention sub-layer in the decoder stack to prevent positions from attending to subsequent positions. This masking, combined with fact that the output embeddings are offset by one position, ensures that the predictions for position i can depend only on the known outputs at positions less than i. - -### 3.2 Attention - -An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is computed as a weighted sum - -Figure 2: (left) Scaled Dot-Product Attention. (right) Multi-Head Attention consists of several attention layers running in parallel. - -of the values, where the weight assigned to each value is computed by a compatibility function of the query with the corresponding key. - -### 3.2.1 Scaled Dot-Product Attention - -We call our particular attention "Scaled Dot-Product Attention" (Figure [2)](#page-3-0). The input consists of queries and keys of dimension dk, and values of dimension dv. We compute the dot products of the query with all keys, divide each by √ dk, and apply a softmax function to obtain the weights on the values. - -In practice, we compute the attention function on a set of queries simultaneously, packed together into a matrix Q. The keys and values are also packed together into matrices K and V . We compute the matrix of outputs as: - -$$\text{Attention}(Q, K, V) = \text{softmax}(\frac{QK^T}{\sqrt{d_k}})V \tag{l}$$ - -The two most commonly used attention functions are additive attention [\[2\]](#page-9-0), and dot-product (multiplicative) attention. Dot-product attention is identical to our algorithm, except for the scaling factor of √ 1 dk . Additive attention computes the compatibility function using a feed-forward network with a single hidden layer. 
While the two are similar in theoretical complexity, dot-product attention is much faster and more space-efficient in practice, since it can be implemented using highly optimized matrix multiplication code. - -While for small values of dk the two mechanisms perform similarly, additive attention outperforms dot product attention without scaling for larger values of dk [\[3\]](#page-9-3). We suspect that for large values of dk, the dot products grow large in magnitude, pushing the softmax function into regions where it has extremely small gradients [4](#page-3-1) . To counteract this effect, we scale the dot products by √ 1 dk . - -### 3.2.2 Multi-Head Attention - -Instead of performing a single attention function with dmodel-dimensional keys, values and queries, we found it beneficial to linearly project the queries, keys and values h times with different, learned linear projections to dk, dk and dv dimensions, respectively. On each of these projected versions of queries, keys and values we then perform the attention function in parallel, yielding dv-dimensional - -4To illustrate why the dot products get large, assume that the components of q and k are independent random variables with mean 0 and variance 1. Then their dot product, q · k = Pdk i=1 qiki, has mean 0 and variance dk. - -output values. These are concatenated and once again projected, resulting in the final values, as depicted in Figure [2.](#page-3-0) - -Multi-head attention allows the model to jointly attend to information from different representation subspaces at different positions. With a single attention head, averaging inhibits this. - -$$\begin{aligned} \text{MultiHead}(Q, K, V) &= \text{Concat}(\text{head}_1, \dots, \text{head}_h) W^O \\ \text{where } \text{head}_i &= \text{Attention}(QW_i^Q, KW_i^K, VW_i^V) \end{aligned}$$ - -Where the projections are parameter matrices W Q i ∈ R dmodel×dk , W K i ∈ R dmodel×dk , WV i ∈ R dmodel×dv and WO ∈ R hdv×dmodel . - -In this work we employ h = 8 parallel attention layers, or heads. For each of these we use dk = dv = dmodel/h = 64. Due to the reduced dimension of each head, the total computational cost is similar to that of single-head attention with full dimensionality. - -### 3.2.3 Applications of Attention in our Model - -The Transformer uses multi-head attention in three different ways: - -- In "encoder-decoder attention" layers, the queries come from the previous decoder layer, and the memory keys and values come from the output of the encoder. This allows every position in the decoder to attend over all positions in the input sequence. This mimics the typical encoder-decoder attention mechanisms in sequence-to-sequence models such as [\[38,](#page-11-1) [2,](#page-9-0) [9\]](#page-10-9). -- The encoder contains self-attention layers. In a self-attention layer all of the keys, values and queries come from the same place, in this case, the output of the previous layer in the encoder. Each position in the encoder can attend to all positions in the previous layer of the encoder. -- Similarly, self-attention layers in the decoder allow each position in the decoder to attend to all positions in the decoder up to and including that position. We need to prevent leftward information flow in the decoder to preserve the auto-regressive property. We implement this inside of scaled dot-product attention by masking out (setting to −∞) all values in the input of the softmax which correspond to illegal connections. 
See Figure [2.](#page-3-0) - -### 3.3 Position-wise Feed-Forward Networks - -In addition to attention sub-layers, each of the layers in our encoder and decoder contains a fully connected feed-forward network, which is applied to each position separately and identically. This consists of two linear transformations with a ReLU activation in between. - -$$\text{FFN}(x) = \max(0, xW_1 + b_1)W_2 + b_2 \tag{2}$$ - -While the linear transformations are the same across different positions, they use different parameters from layer to layer. Another way of describing this is as two convolutions with kernel size 1. The dimensionality of input and output is dmodel = 512, and the inner-layer has dimensionality df f = 2048. - -### 3.4 Embeddings and Softmax - -Similarly to other sequence transduction models, we use learned embeddings to convert the input tokens and output tokens to vectors of dimension dmodel. We also use the usual learned linear transformation and softmax function to convert the decoder output to predicted next-token probabilities. In our model, we share the same weight matrix between the two embedding layers and the pre-softmax linear transformation, similar to [\[30\]](#page-11-6). In the embedding layers, we multiply those weights by √ dmodel. - -Table 1: Maximum path lengths, per-layer complexity and minimum number of sequential operations for different layer types. n is the sequence length, d is the representation dimension, k is the kernel size of convolutions and r the size of the neighborhood in restricted self-attention. - -| Layer Type | Complexity per Layer | Sequential
Operations | Maximum Path Length | -|-----------------------------|-----------------------|--------------------------|---------------------| -| Self-Attention | 2
O(n
· d) | O(1) | O(1) | -| Recurrent | 2
O(n · d
) | O(n) | O(n) | -| Convolutional | 2
O(k · n · d
) | O(1) | O(logk(n)) | -| Self-Attention (restricted) | O(r · n · d) | O(1) | O(n/r) | - -### 3.5 Positional Encoding - -Since our model contains no recurrence and no convolution, in order for the model to make use of the order of the sequence, we must inject some information about the relative or absolute position of the tokens in the sequence. To this end, we add "positional encodings" to the input embeddings at the bottoms of the encoder and decoder stacks. The positional encodings have the same dimension dmodel as the embeddings, so that the two can be summed. There are many choices of positional encodings, learned and fixed [\[9\]](#page-10-9). - -In this work, we use sine and cosine functions of different frequencies: - -$$PE_{(pos,2i)} = \sin(pos/10000^{2i/d_{\text{model}}})$$ - -$$PE_{(pos,2i+1)} = \cos(pos/10000^{2i/d_{\text{model}}})$$ - -where pos is the position and i is the dimension. That is, each dimension of the positional encoding corresponds to a sinusoid. The wavelengths form a geometric progression from 2π to 10000 · 2π. We chose this function because we hypothesized it would allow the model to easily learn to attend by relative positions, since for any fixed offset k, P Epos+k can be represented as a linear function of P Epos. - -We also experimented with using learned positional embeddings [\[9\]](#page-10-9) instead, and found that the two versions produced nearly identical results (see Table [3](#page-8-0) row (E)). We chose the sinusoidal version because it may allow the model to extrapolate to sequence lengths longer than the ones encountered during training. - -### 4 Why Self-Attention - -In this section we compare various aspects of self-attention layers to the recurrent and convolutional layers commonly used for mapping one variable-length sequence of symbol representations (x1, ..., xn) to another sequence of equal length (z1, ..., zn), with xi , zi ∈ R d , such as a hidden layer in a typical sequence transduction encoder or decoder. Motivating our use of self-attention we consider three desiderata. - -One is the total computational complexity per layer. Another is the amount of computation that can be parallelized, as measured by the minimum number of sequential operations required. - -The third is the path length between long-range dependencies in the network. Learning long-range dependencies is a key challenge in many sequence transduction tasks. One key factor affecting the ability to learn such dependencies is the length of the paths forward and backward signals have to traverse in the network. The shorter these paths between any combination of positions in the input and output sequences, the easier it is to learn long-range dependencies [\[12\]](#page-10-10). Hence we also compare the maximum path length between any two input and output positions in networks composed of the different layer types. - -As noted in Table [1,](#page-5-0) a self-attention layer connects all positions with a constant number of sequentially executed operations, whereas a recurrent layer requires O(n) sequential operations. In terms of computational complexity, self-attention layers are faster than recurrent layers when the sequence - -length n is smaller than the representation dimensionality d, which is most often the case with sentence representations used by state-of-the-art models in machine translations, such as word-piece [\[38\]](#page-11-1) and byte-pair [\[31\]](#page-11-7) representations. 
To improve computational performance for tasks involving very long sequences, self-attention could be restricted to considering only a neighborhood of size r in the input sequence centered around the respective output position. This would increase the maximum path length to O(n/r). We plan to investigate this approach further in future work. - -A single convolutional layer with kernel width k < n does not connect all pairs of input and output positions. Doing so requires a stack of O(n/k) convolutional layers in the case of contiguous kernels, or O(logk(n)) in the case of dilated convolutions [\[18\]](#page-10-8), increasing the length of the longest paths between any two positions in the network. Convolutional layers are generally more expensive than recurrent layers, by a factor of k. Separable convolutions [\[6\]](#page-10-15), however, decrease the complexity considerably, to O(k · n · d + n · d 2 ). Even with k = n, however, the complexity of a separable convolution is equal to the combination of a self-attention layer and a point-wise feed-forward layer, the approach we take in our model. - -As side benefit, self-attention could yield more interpretable models. We inspect attention distributions from our models and present and discuss examples in the appendix. Not only do individual attention heads clearly learn to perform different tasks, many appear to exhibit behavior related to the syntactic and semantic structure of the sentences. - -### 5 Training - -This section describes the training regime for our models. - -### 5.1 Training Data and Batching - -We trained on the standard WMT 2014 English-German dataset consisting of about 4.5 million sentence pairs. Sentences were encoded using byte-pair encoding [\[3\]](#page-9-3), which has a shared sourcetarget vocabulary of about 37000 tokens. For English-French, we used the significantly larger WMT 2014 English-French dataset consisting of 36M sentences and split tokens into a 32000 word-piece vocabulary [\[38\]](#page-11-1). Sentence pairs were batched together by approximate sequence length. Each training batch contained a set of sentence pairs containing approximately 25000 source tokens and 25000 target tokens. - -### 5.2 Hardware and Schedule - -We trained our models on one machine with 8 NVIDIA P100 GPUs. For our base models using the hyperparameters described throughout the paper, each training step took about 0.4 seconds. We trained the base models for a total of 100,000 steps or 12 hours. For our big models,(described on the bottom line of table [3)](#page-8-0), step time was 1.0 seconds. The big models were trained for 300,000 steps (3.5 days). - -### 5.3 Optimizer - -We used the Adam optimizer [\[20\]](#page-10-16) with ÎČ1 = 0.9, ÎČ2 = 0.98 and Ï” = 10−9 . We varied the learning rate over the course of training, according to the formula: - -$$lrate = d_{\text{model}}^{-0.5} \cdot \min(step\_num^{-0.5}, step\_num \cdot warmup\_steps^{-1.5}) \tag{3}$$ - -This corresponds to increasing the learning rate linearly for the first warmup_steps training steps, and decreasing it thereafter proportionally to the inverse square root of the step number. We used warmup_steps = 4000. 
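A minimal sketch of this warm-up schedule, not the released training code: the helper name `transformer_lrate` is illustrative, and the defaults assume the base configuration (d_model = 512, warmup_steps = 4000).

```python
# Sketch of the warm-up learning-rate schedule in Eq. (3):
# lrate = d_model^-0.5 * min(step_num^-0.5, step_num * warmup_steps^-1.5)
def transformer_lrate(step_num: int, d_model: int = 512, warmup_steps: int = 4000) -> float:
    return d_model ** -0.5 * min(step_num ** -0.5, step_num * warmup_steps ** -1.5)

# The rate rises linearly until step 4000, then decays as 1/sqrt(step).
for step in (1, 1000, 4000, 100_000):
    print(step, transformer_lrate(step))
```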
- -### 5.4 Regularization - -We employ three types of regularization during training: - - - -| | BLEU | | | Training Cost (FLOPs) | | | -|---------------------------------|-------|-------|------------|-----------------------|--|--| -| Model | EN-DE | EN-FR | EN-DE | EN-FR | | | -| ByteNet [18] | 23.75 | | | | | | -| Deep-Att + PosUnk [39] | | 39.2 | | 1.0 · 1020 | | | -| GNMT + RL [38] | 24.6 | 39.92 | 2.3 · 1019 | 1.4 · 1020 | | | -| ConvS2S [9] | 25.16 | 40.46 | 9.6 · 1018 | 1.5 · 1020 | | | -| MoE [32] | 26.03 | 40.56 | 2.0 · 1019 | 1.2 · 1020 | | | -| Deep-Att + PosUnk Ensemble [39] | | 40.4 | | 8.0 · 1020 | | | -| GNMT + RL Ensemble [38] | 26.30 | 41.16 | 1.8 · 1020 | 1.1 · 1021 | | | -| ConvS2S Ensemble [9] | 26.36 | 41.29 | 7.7 · 1019 | 1.2 · 1021 | | | -| Transformer (base model) | 27.3 | 38.1 | | 3.3 · 1018 | | | -| Transformer (big) | 28.4 | 41.8 | | 2.3 · 1019 | | | - -Table 2: The Transformer achieves better BLEU scores than previous state-of-the-art models on the English-to-German and English-to-French newstest2014 tests at a fraction of the training cost. - -Residual Dropout We apply dropout [\[33\]](#page-11-9) to the output of each sub-layer, before it is added to the sub-layer input and normalized. In addition, we apply dropout to the sums of the embeddings and the positional encodings in both the encoder and decoder stacks. For the base model, we use a rate of Pdrop = 0.1. - -Label Smoothing During training, we employed label smoothing of value Ï”ls = 0.1 [\[36\]](#page-11-10). This hurts perplexity, as the model learns to be more unsure, but improves accuracy and BLEU score. - -### 6 Results - -### 6.1 Machine Translation - -On the WMT 2014 English-to-German translation task, the big transformer model (Transformer (big) in Table [2)](#page-7-0) outperforms the best previously reported models (including ensembles) by more than 2.0 BLEU, establishing a new state-of-the-art BLEU score of 28.4. The configuration of this model is listed in the bottom line of Table [3.](#page-8-0) Training took 3.5 days on 8 P100 GPUs. Even our base model surpasses all previously published models and ensembles, at a fraction of the training cost of any of the competitive models. - -On the WMT 2014 English-to-French translation task, our big model achieves a BLEU score of 41.0, outperforming all of the previously published single models, at less than 1/4 the training cost of the previous state-of-the-art model. The Transformer (big) model trained for English-to-French used dropout rate Pdrop = 0.1, instead of 0.3. - -For the base models, we used a single model obtained by averaging the last 5 checkpoints, which were written at 10-minute intervals. For the big models, we averaged the last 20 checkpoints. We used beam search with a beam size of 4 and length penalty α = 0.6 [\[38\]](#page-11-1). These hyperparameters were chosen after experimentation on the development set. We set the maximum output length during inference to input length + 50, but terminate early when possible [\[38\]](#page-11-1). - -Table [2](#page-7-0) summarizes our results and compares our translation quality and training costs to other model architectures from the literature. We estimate the number of floating point operations used to train a model by multiplying the training time, the number of GPUs used, and an estimate of the sustained single-precision floating-point capacity of each GPU [5](#page-7-1) . 
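As a back-of-the-envelope check of this estimate (assuming the 9.5 TFLOPS sustained P100 throughput given in footnote 5), the big model's 3.5 days on 8 GPUs works out to roughly the 2.3 · 10^19 FLOPs reported in Table 2:

```python
# Training-cost estimate: training time x number of GPUs x sustained
# single-precision throughput per GPU (9.5 TFLOPS for P100, per footnote 5).
days, gpus, tflops = 3.5, 8, 9.5          # Transformer (big): 3.5 days on 8 P100s
seconds = days * 24 * 3600
total_flops = seconds * gpus * tflops * 1e12
print(f"{total_flops:.2e}")               # ~2.30e+19, i.e. 2.3 * 10^19 FLOPs
```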
- -### 6.2 Model Variations - -To evaluate the importance of different components of the Transformer, we varied our base model in different ways, measuring the change in performance on English-to-German translation on the - -5We used values of 2.8, 3.7, 6.0 and 9.5 TFLOPS for K80, K40, M40 and P100, respectively. - -Table 3: Variations on the Transformer architecture. Unlisted values are identical to those of the base model. All metrics are on the English-to-German translation development set, newstest2013. Listed perplexities are per-wordpiece, according to our byte-pair encoding, and should not be compared to per-word perplexities. - -| | | | | | | | train | PPL | BLEU | params | | | | -|------|-------------------------------------------|--------|------|----|-----|-----|-------|------|------|--------|-------|-------|------| -| | N | dmodel | dff | h | dk | dv | Pdrop | Ï”ls | | steps | (dev) | (dev) | ×106 | -| base | 6 | 512 | 2048 | 8 | 64 | 64 | 0.1 | 0.1 | 100K | 4.92 | 25.8 | 65 | | -| | | | | 1 | 512 | 512 | | | | 5.29 | 24.9 | | | -| | | | | 4 | 128 | 128 | | | | 5.00 | 25.5 | | | -| (A) | | | | 16 | 32 | 32 | | | | 4.91 | 25.8 | | | -| | | | | 32 | 16 | 16 | | | | 5.01 | 25.4 | | | -| (B) | | | | | 16 | | | | | 5.16 | 25.1 | 58 | | -| | | | | | 32 | | | | | 5.01 | 25.4 | 60 | | -| | 2 | | | | | | | | | 6.11 | 23.7 | 36 | | -| | 4 | | | | | | | | | 5.19 | 25.3 | 50 | | -| | 8 | | | | | | | | | 4.88 | 25.5 | 80 | | -| (C) | | 256 | | | 32 | 32 | | | | 5.75 | 24.5 | 28 | | -| | | 1024 | | | 128 | 128 | | | | 4.66 | 26.0 | 168 | | -| | | | 1024 | | | | | | | 5.12 | 25.4 | 53 | | -| | | | 4096 | | | | | | | 4.75 | 26.2 | 90 | | -| (D) | | | | | | | 0.0 | | | 5.77 | 24.6 | | | -| | | | | | | | 0.2 | | | 4.95 | 25.5 | | | -| | | | | | | | | 0.0 | | 4.67 | 25.3 | | | -| | | | | | | | | 0.2 | | 5.47 | 25.7 | | | -| (E) | positional embedding instead of sinusoids | | | | | | 4.92 | 25.7 | | | | | | -| big | 6 | 1024 | 4096 | 16 | | | 0.3 | | 300K | 4.33 | 26.4 | 213 | | - -development set, newstest2013. We used beam search as described in the previous section, but no checkpoint averaging. We present these results in Table [3.](#page-8-0) - -In Table [3](#page-8-0) rows (A), we vary the number of attention heads and the attention key and value dimensions, keeping the amount of computation constant, as described in Section [3.2.2.](#page-3-2) While single-head attention is 0.9 BLEU worse than the best setting, quality also drops off with too many heads. - -In Table [3](#page-8-0) rows (B), we observe that reducing the attention key size dk hurts model quality. This suggests that determining compatibility is not easy and that a more sophisticated compatibility function than dot product may be beneficial. We further observe in rows (C) and (D) that, as expected, bigger models are better, and dropout is very helpful in avoiding over-fitting. In row (E) we replace our sinusoidal positional encoding with learned positional embeddings [\[9\]](#page-10-9), and observe nearly identical results to the base model. - -### 6.3 English Constituency Parsing - -To evaluate if the Transformer can generalize to other tasks we performed experiments on English constituency parsing. This task presents specific challenges: the output is subject to strong structural constraints and is significantly longer than the input. Furthermore, RNN sequence-to-sequence models have not been able to attain state-of-the-art results in small-data regimes [\[37\]](#page-11-11). 
- -We trained a 4-layer transformer with dmodel = 1024 on the Wall Street Journal (WSJ) portion of the Penn Treebank [\[25\]](#page-11-12), about 40K training sentences. We also trained it in a semi-supervised setting, using the larger high-confidence and BerkleyParser corpora from with approximately 17M sentences [\[37\]](#page-11-11). We used a vocabulary of 16K tokens for the WSJ only setting and a vocabulary of 32K tokens for the semi-supervised setting. - -We performed only a small number of experiments to select the dropout, both attention and residual (section [5.4)](#page-6-0), learning rates and beam size on the Section 22 development set, all other parameters remained unchanged from the English-to-German base translation model. During inference, we - - - -| Parser | Training | WSJ 23 F1 | -|-------------------------------------|--------------------------|-----------| -| Vinyals & Kaiser el al. (2014) [37] | WSJ only, discriminative | 88.3 | -| Petrov et al. (2006) [29] | WSJ only, discriminative | 90.4 | -| Zhu et al. (2013) [40] | WSJ only, discriminative | 90.4 | -| Dyer et al. (2016) [8] | WSJ only, discriminative | 91.7 | -| Transformer (4 layers) | WSJ only, discriminative | 91.3 | -| Zhu et al. (2013) [40] | semi-supervised | 91.3 | -| Huang & Harper (2009) [14] | semi-supervised | 91.3 | -| McClosky et al. (2006) [26] | semi-supervised | 92.1 | -| Vinyals & Kaiser el al. (2014) [37] | semi-supervised | 92.1 | -| Transformer (4 layers) | semi-supervised | 92.7 | -| Luong et al. (2015) [23] | multi-task | 93.0 | -| Dyer et al. (2016) [8] | generative | 93.3 | - -Table 4: The Transformer generalizes well to English constituency parsing (Results are on Section 23 of WSJ) - -increased the maximum output length to input length + 300. We used a beam size of 21 and α = 0.3 for both WSJ only and the semi-supervised setting. - -Our results in Table [4](#page-9-4) show that despite the lack of task-specific tuning our model performs surprisingly well, yielding better results than all previously reported models with the exception of the Recurrent Neural Network Grammar [\[8\]](#page-10-17). - -In contrast to RNN sequence-to-sequence models [\[37\]](#page-11-11), the Transformer outperforms the Berkeley-Parser [\[29\]](#page-11-13) even when training only on the WSJ training set of 40K sentences. - -### 7 Conclusion - -In this work, we presented the Transformer, the first sequence transduction model based entirely on attention, replacing the recurrent layers most commonly used in encoder-decoder architectures with multi-headed self-attention. - -For translation tasks, the Transformer can be trained significantly faster than architectures based on recurrent or convolutional layers. On both WMT 2014 English-to-German and WMT 2014 English-to-French translation tasks, we achieve a new state of the art. In the former task our best model outperforms even all previously reported ensembles. - -We are excited about the future of attention-based models and plan to apply them to other tasks. We plan to extend the Transformer to problems involving input and output modalities other than text and to investigate local, restricted attention mechanisms to efficiently handle large inputs and outputs such as images, audio and video. Making generation less sequential is another research goals of ours. 
- -The code we used to train and evaluate our models is available at [https://github.com/](https://github.com/tensorflow/tensor2tensor) [tensorflow/tensor2tensor](https://github.com/tensorflow/tensor2tensor). - -Acknowledgements We are grateful to Nal Kalchbrenner and Stephan Gouws for their fruitful comments, corrections and inspiration. - -### References - -- [1] Jimmy Lei Ba, Jamie Ryan Kiros, and Geoffrey E Hinton. Layer normalization. *arXiv preprint [arXiv:1607.06450](http://arxiv.org/abs/1607.06450)*, 2016. -- [2] Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. Neural machine translation by jointly learning to align and translate. *CoRR*, abs/1409.0473, 2014. -- [3] Denny Britz, Anna Goldie, Minh-Thang Luong, and Quoc V. Le. Massive exploration of neural machine translation architectures. *CoRR*, abs/1703.03906, 2017. -- [4] Jianpeng Cheng, Li Dong, and Mirella Lapata. Long short-term memory-networks for machine reading. *arXiv preprint [arXiv:1601.06733](http://arxiv.org/abs/1601.06733)*, 2016. -- [5] Kyunghyun Cho, Bart van Merrienboer, Caglar Gulcehre, Fethi Bougares, Holger Schwenk, and Yoshua Bengio. Learning phrase representations using rnn encoder-decoder for statistical machine translation. *CoRR*, abs/1406.1078, 2014. -- [6] Francois Chollet. Xception: Deep learning with depthwise separable convolutions. *arXiv preprint [arXiv:1610.02357](http://arxiv.org/abs/1610.02357)*, 2016. -- [7] Junyoung Chung, Çaglar GĂŒlçehre, Kyunghyun Cho, and Yoshua Bengio. Empirical evaluation of gated recurrent neural networks on sequence modeling. *CoRR*, abs/1412.3555, 2014. -- [8] Chris Dyer, Adhiguna Kuncoro, Miguel Ballesteros, and Noah A. Smith. Recurrent neural network grammars. In *Proc. of NAACL*, 2016. -- [9] Jonas Gehring, Michael Auli, David Grangier, Denis Yarats, and Yann N. Dauphin. Convolutional sequence to sequence learning. *arXiv preprint [arXiv:1705.03122v](http://arxiv.org/abs/1705.03122)2*, 2017. -- [10] Alex Graves. Generating sequences with recurrent neural networks. *arXiv preprint [arXiv:1308.0850](http://arxiv.org/abs/1308.0850)*, 2013. -- [11] Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. Deep residual learning for image recognition. In *Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition*, pages 770–778, 2016. -- [12] Sepp Hochreiter, Yoshua Bengio, Paolo Frasconi, and JĂŒrgen Schmidhuber. Gradient flow in recurrent nets: the difficulty of learning long-term dependencies, 2001. -- [13] Sepp Hochreiter and JĂŒrgen Schmidhuber. Long short-term memory. *Neural computation*, 9(8):1735–1780, 1997. -- [14] Zhongqiang Huang and Mary Harper. Self-training PCFG grammars with latent annotations across languages. In *Proceedings of the 2009 Conference on Empirical Methods in Natural Language Processing*, pages 832–841. ACL, August 2009. -- [15] Rafal Jozefowicz, Oriol Vinyals, Mike Schuster, Noam Shazeer, and Yonghui Wu. Exploring the limits of language modeling. *arXiv preprint [arXiv:1602.02410](http://arxiv.org/abs/1602.02410)*, 2016. -- [16] Ɓukasz Kaiser and Samy Bengio. Can active memory replace attention? In *Advances in Neural Information Processing Systems, (NIPS)*, 2016. -- [17] Ɓukasz Kaiser and Ilya Sutskever. Neural GPUs learn algorithms. In *International Conference on Learning Representations (ICLR)*, 2016. -- [18] Nal Kalchbrenner, Lasse Espeholt, Karen Simonyan, Aaron van den Oord, Alex Graves, and Koray Kavukcuoglu. Neural machine translation in linear time. 
*arXiv preprint [arXiv:1610.10099v](http://arxiv.org/abs/1610.10099)2*, 2017. -- [19] Yoon Kim, Carl Denton, Luong Hoang, and Alexander M. Rush. Structured attention networks. In *International Conference on Learning Representations*, 2017. -- [20] Diederik Kingma and Jimmy Ba. Adam: A method for stochastic optimization. In *ICLR*, 2015. -- [21] Oleksii Kuchaiev and Boris Ginsburg. Factorization tricks for LSTM networks. *arXiv preprint [arXiv:1703.10722](http://arxiv.org/abs/1703.10722)*, 2017. -- [22] Zhouhan Lin, Minwei Feng, Cicero Nogueira dos Santos, Mo Yu, Bing Xiang, Bowen Zhou, and Yoshua Bengio. A structured self-attentive sentence embedding. *arXiv preprint [arXiv:1703.03130](http://arxiv.org/abs/1703.03130)*, 2017. -- [23] Minh-Thang Luong, Quoc V. Le, Ilya Sutskever, Oriol Vinyals, and Lukasz Kaiser. Multi-task sequence to sequence learning. *arXiv preprint [arXiv:1511.06114](http://arxiv.org/abs/1511.06114)*, 2015. -- [24] Minh-Thang Luong, Hieu Pham, and Christopher D Manning. Effective approaches to attentionbased neural machine translation. *arXiv preprint [arXiv:1508.04025](http://arxiv.org/abs/1508.04025)*, 2015. -- [25] Mitchell P Marcus, Mary Ann Marcinkiewicz, and Beatrice Santorini. Building a large annotated corpus of english: The penn treebank. *Computational linguistics*, 19(2):313–330, 1993. -- [26] David McClosky, Eugene Charniak, and Mark Johnson. Effective self-training for parsing. In *Proceedings of the Human Language Technology Conference of the NAACL, Main Conference*, pages 152–159. ACL, June 2006. -- [27] Ankur Parikh, Oscar TĂ€ckström, Dipanjan Das, and Jakob Uszkoreit. A decomposable attention model. In *Empirical Methods in Natural Language Processing*, 2016. -- [28] Romain Paulus, Caiming Xiong, and Richard Socher. A deep reinforced model for abstractive summarization. *arXiv preprint [arXiv:1705.04304](http://arxiv.org/abs/1705.04304)*, 2017. -- [29] Slav Petrov, Leon Barrett, Romain Thibaux, and Dan Klein. Learning accurate, compact, and interpretable tree annotation. In *Proceedings of the 21st International Conference on Computational Linguistics and 44th Annual Meeting of the ACL*, pages 433–440. ACL, July 2006. -- [30] Ofir Press and Lior Wolf. Using the output embedding to improve language models. *arXiv preprint [arXiv:1608.05859](http://arxiv.org/abs/1608.05859)*, 2016. -- [31] Rico Sennrich, Barry Haddow, and Alexandra Birch. Neural machine translation of rare words with subword units. *arXiv preprint [arXiv:1508.07909](http://arxiv.org/abs/1508.07909)*, 2015. -- [32] Noam Shazeer, Azalia Mirhoseini, Krzysztof Maziarz, Andy Davis, Quoc Le, Geoffrey Hinton, and Jeff Dean. Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. *arXiv preprint [arXiv:1701.06538](http://arxiv.org/abs/1701.06538)*, 2017. -- [33] Nitish Srivastava, Geoffrey E Hinton, Alex Krizhevsky, Ilya Sutskever, and Ruslan Salakhutdinov. Dropout: a simple way to prevent neural networks from overfitting. *Journal of Machine Learning Research*, 15(1):1929–1958, 2014. -- [34] Sainbayar Sukhbaatar, Arthur Szlam, Jason Weston, and Rob Fergus. End-to-end memory networks. In C. Cortes, N. D. Lawrence, D. D. Lee, M. Sugiyama, and R. Garnett, editors, *Advances in Neural Information Processing Systems 28*, pages 2440–2448. Curran Associates, Inc., 2015. -- [35] Ilya Sutskever, Oriol Vinyals, and Quoc VV Le. Sequence to sequence learning with neural networks. In *Advances in Neural Information Processing Systems*, pages 3104–3112, 2014. 
-- [36] Christian Szegedy, Vincent Vanhoucke, Sergey Ioffe, Jonathon Shlens, and Zbigniew Wojna. Rethinking the inception architecture for computer vision. *CoRR*, abs/1512.00567, 2015. -- [37] Vinyals & Kaiser, Koo, Petrov, Sutskever, and Hinton. Grammar as a foreign language. In *Advances in Neural Information Processing Systems*, 2015. -- [38] Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. Google's neural machine translation system: Bridging the gap between human and machine translation. *arXiv preprint [arXiv:1609.08144](http://arxiv.org/abs/1609.08144)*, 2016. -- [39] Jie Zhou, Ying Cao, Xuguang Wang, Peng Li, and Wei Xu. Deep recurrent models with fast-forward connections for neural machine translation. *CoRR*, abs/1606.04199, 2016. -- [40] Muhua Zhu, Yue Zhang, Wenliang Chen, Min Zhang, and Jingbo Zhu. Fast and accurate shift-reduce constituent parsing. In *Proceedings of the 51st Annual Meeting of the ACL (Volume 1: Long Papers)*, pages 434–443. ACL, August 2013. - -#### Attention Visualizations **Input-Input Layer5** - -Figure 3: An example of the attention mechanism following long-distance dependencies in the encoder self-attention in layer 5 of 6. Many of the attention heads attend to a distant dependency of the verb 'making', completing the phrase 'making...more difficult'. Attentions here shown only for the word 'making'. Different colors represent different heads. Best viewed in color. - -**Input-Input Layer5** - -Figure 4: Two attention heads, also in layer 5 of 6, apparently involved in anaphora resolution. Top: Full attentions for head 5. Bottom: Isolated attentions from just the word 'its' for attention heads 5 and 6. Note that the attentions are very sharp for this word. - -**Input-Input Layer5** - -Figure 5: Many of the attention heads exhibit behaviour that seems related to the structure of the sentence. We give two such examples above, from two different heads from the encoder self-attention at layer 5 of 6. The heads clearly learned to perform different tasks. diff --git a/examples/custom_output_files/files/1810.04805v2.md b/examples/custom_output_files/files/1810.04805v2.md deleted file mode 100644 index 112540fa..00000000 --- a/examples/custom_output_files/files/1810.04805v2.md +++ /dev/null @@ -1,530 +0,0 @@ -# BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding - -Jacob Devlin Ming-Wei Chang Kenton Lee Kristina Toutanova - -Google AI Language - -{jacobdevlin,mingweichang,kentonl,kristout}@google.com - -### Abstract - -We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models [(Peters et al.,](#page-10-0) [2018a;](#page-10-0) [Rad](#page-10-1)[ford et al.,](#page-10-1) [2018)](#page-10-1), BERT is designed to pretrain deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be finetuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial taskspecific architecture modifications. - -BERT is conceptually simple and empirically powerful. 
It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE score to 80.5% (7.7% point absolute improvement), MultiNLI accuracy to 86.7% (4.6% absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement). - -### 1 Introduction - -Language model pre-training has been shown to be effective for improving many natural language processing tasks [(Dai and Le,](#page-9-0) [2015;](#page-9-0) [Peters et al.,](#page-10-0) [2018a;](#page-10-0) [Radford et al.,](#page-10-1) [2018;](#page-10-1) [Howard and Ruder,](#page-9-1) [2018)](#page-9-1). These include sentence-level tasks such as natural language inference [(Bowman et al.,](#page-9-2) [2015;](#page-9-2) [Williams et al.,](#page-11-0) [2018)](#page-11-0) and paraphrasing [(Dolan](#page-9-3) [and Brockett,](#page-9-3) [2005)](#page-9-3), which aim to predict the relationships between sentences by analyzing them holistically, as well as token-level tasks such as named entity recognition and question answering, where models are required to produce fine-grained output at the token level [(Tjong Kim Sang and](#page-10-2) [De Meulder,](#page-10-2) [2003;](#page-10-2) [Rajpurkar et al.,](#page-10-3) [2016)](#page-10-3). - -There are two existing strategies for applying pre-trained language representations to downstream tasks: *feature-based* and *fine-tuning*. The feature-based approach, such as ELMo [(Peters](#page-10-0) [et al.,](#page-10-0) [2018a)](#page-10-0), uses task-specific architectures that include the pre-trained representations as additional features. The fine-tuning approach, such as the Generative Pre-trained Transformer (OpenAI GPT) [(Radford et al.,](#page-10-1) [2018)](#page-10-1), introduces minimal task-specific parameters, and is trained on the downstream tasks by simply fine-tuning *all* pretrained parameters. The two approaches share the same objective function during pre-training, where they use unidirectional language models to learn general language representations. - -We argue that current techniques restrict the power of the pre-trained representations, especially for the fine-tuning approaches. The major limitation is that standard language models are unidirectional, and this limits the choice of architectures that can be used during pre-training. For example, in OpenAI GPT, the authors use a left-toright architecture, where every token can only attend to previous tokens in the self-attention layers of the Transformer [(Vaswani et al.,](#page-10-4) [2017)](#page-10-4). Such restrictions are sub-optimal for sentence-level tasks, and could be very harmful when applying finetuning based approaches to token-level tasks such as question answering, where it is crucial to incorporate context from both directions. - -In this paper, we improve the fine-tuning based approaches by proposing BERT: Bidirectional Encoder Representations from Transformers. BERT alleviates the previously mentioned unidirectionality constraint by using a "masked language model" (MLM) pre-training objective, inspired by the Cloze task [(Taylor,](#page-10-5) [1953)](#page-10-5). The masked language model randomly masks some of the tokens from the input, and the objective is to predict the original vocabulary id of the masked word based only on its context. 
Unlike left-toright language model pre-training, the MLM objective enables the representation to fuse the left and the right context, which allows us to pretrain a deep bidirectional Transformer. In addition to the masked language model, we also use a "next sentence prediction" task that jointly pretrains text-pair representations. The contributions of our paper are as follows: - -- We demonstrate the importance of bidirectional pre-training for language representations. Unlike [Radford et al.](#page-10-1) [(2018)](#page-10-1), which uses unidirectional language models for pre-training, BERT uses masked language models to enable pretrained deep bidirectional representations. This is also in contrast to [Peters et al.](#page-10-0) [(2018a)](#page-10-0), which uses a shallow concatenation of independently trained left-to-right and right-to-left LMs. -- We show that pre-trained representations reduce the need for many heavily-engineered taskspecific architectures. BERT is the first finetuning based representation model that achieves state-of-the-art performance on a large suite of sentence-level *and* token-level tasks, outperforming many task-specific architectures. -- BERT advances the state of the art for eleven NLP tasks. The code and pre-trained models are available at [https://github.com/](https://github.com/google-research/bert) [google-research/bert](https://github.com/google-research/bert). - -### 2 Related Work - -There is a long history of pre-training general language representations, and we briefly review the most widely-used approaches in this section. - -#### 2.1 Unsupervised Feature-based Approaches - -Learning widely applicable representations of words has been an active area of research for decades, including non-neural [(Brown et al.,](#page-9-4) [1992;](#page-9-4) [Ando and Zhang,](#page-9-5) [2005;](#page-9-5) [Blitzer et al.,](#page-9-6) [2006)](#page-9-6) and neural [(Mikolov et al.,](#page-10-6) [2013;](#page-10-6) [Pennington et al.,](#page-10-7) [2014)](#page-10-7) methods. Pre-trained word embeddings are an integral part of modern NLP systems, offering significant improvements over embeddings learned from scratch [(Turian et al.,](#page-10-8) [2010)](#page-10-8). To pretrain word embedding vectors, left-to-right language modeling objectives have been used [(Mnih](#page-10-9) [and Hinton,](#page-10-9) [2009)](#page-10-9), as well as objectives to discriminate correct from incorrect words in left and right context [(Mikolov et al.,](#page-10-6) [2013)](#page-10-6). - -These approaches have been generalized to coarser granularities, such as sentence embeddings [(Kiros et al.,](#page-10-10) [2015;](#page-10-10) [Logeswaran and Lee,](#page-10-11) [2018)](#page-10-11) or paragraph embeddings [(Le and Mikolov,](#page-10-12) [2014)](#page-10-12). To train sentence representations, prior work has used objectives to rank candidate next sentences [(Jernite et al.,](#page-9-7) [2017;](#page-9-7) [Logeswaran and](#page-10-11) [Lee,](#page-10-11) [2018)](#page-10-11), left-to-right generation of next sentence words given a representation of the previous sentence [(Kiros et al.,](#page-10-10) [2015)](#page-10-10), or denoising autoencoder derived objectives [(Hill et al.,](#page-9-8) [2016)](#page-9-8). - -ELMo and its predecessor [(Peters et al.,](#page-10-13) [2017,](#page-10-13) [2018a)](#page-10-0) generalize traditional word embedding research along a different dimension. They extract *context-sensitive* features from a left-to-right and a right-to-left language model. 
The contextual representation of each token is the concatenation of the left-to-right and right-to-left representations. When integrating contextual word embeddings with existing task-specific architectures, ELMo advances the state of the art for several major NLP benchmarks [(Peters et al.,](#page-10-0) [2018a)](#page-10-0) including question answering [(Rajpurkar et al.,](#page-10-3) [2016)](#page-10-3), sentiment analysis [(Socher et al.,](#page-10-14) [2013)](#page-10-14), and named entity recognition [(Tjong Kim Sang and De Meulder,](#page-10-2) [2003)](#page-10-2). [Melamud et al.](#page-10-15) [(2016)](#page-10-15) proposed learning contextual representations through a task to predict a single word from both left and right context using LSTMs. Similar to ELMo, their model is feature-based and not deeply bidirectional. [Fedus](#page-9-9) [et al.](#page-9-9) [(2018)](#page-9-9) shows that the cloze task can be used to improve the robustness of text generation models. - -#### 2.2 Unsupervised Fine-tuning Approaches - -As with the feature-based approaches, the first works in this direction only pre-trained word embedding parameters from unlabeled text [(Col](#page-9-10)[lobert and Weston,](#page-9-10) [2008)](#page-9-10). - -More recently, sentence or document encoders which produce contextual token representations have been pre-trained from unlabeled text and fine-tuned for a supervised downstream task [(Dai](#page-9-0) [and Le,](#page-9-0) [2015;](#page-9-0) [Howard and Ruder,](#page-9-1) [2018;](#page-9-1) [Radford](#page-10-1) [et al.,](#page-10-1) [2018)](#page-10-1). The advantage of these approaches is that few parameters need to be learned from scratch. At least partly due to this advantage, OpenAI GPT [(Radford et al.,](#page-10-1) [2018)](#page-10-1) achieved previously state-of-the-art results on many sentencelevel tasks from the GLUE benchmark [(Wang](#page-10-16) [et al.,](#page-10-16) [2018a)](#page-10-16). Left-to-right language model- - -Figure 1: Overall pre-training and fine-tuning procedures for BERT. Apart from output layers, the same architectures are used in both pre-training and fine-tuning. The same pre-trained model parameters are used to initialize models for different down-stream tasks. During fine-tuning, all parameters are fine-tuned. [CLS] is a special symbol added in front of every input example, and [SEP] is a special separator token (e.g. separating questions/answers). - -ing and auto-encoder objectives have been used for pre-training such models [(Howard and Ruder,](#page-9-1) [2018;](#page-9-1) [Radford et al.,](#page-10-1) [2018;](#page-10-1) [Dai and Le,](#page-9-0) [2015)](#page-9-0). - -#### 2.3 Transfer Learning from Supervised Data - -There has also been work showing effective transfer from supervised tasks with large datasets, such as natural language inference [(Conneau et al.,](#page-9-11) [2017)](#page-9-11) and machine translation [(McCann et al.,](#page-10-17) [2017)](#page-10-17). Computer vision research has also demonstrated the importance of transfer learning from large pre-trained models, where an effective recipe is to fine-tune models pre-trained with ImageNet [(Deng et al.,](#page-9-12) [2009;](#page-9-12) [Yosinski et al.,](#page-11-1) [2014)](#page-11-1). - -### 3 BERT - -We introduce BERT and its detailed implementation in this section. There are two steps in our framework: *pre-training* and *fine-tuning*. During pre-training, the model is trained on unlabeled data over different pre-training tasks. 
For finetuning, the BERT model is first initialized with the pre-trained parameters, and all of the parameters are fine-tuned using labeled data from the downstream tasks. Each downstream task has separate fine-tuned models, even though they are initialized with the same pre-trained parameters. The question-answering example in Figure [1](#page-2-0) will serve as a running example for this section. - -A distinctive feature of BERT is its unified architecture across different tasks. There is minimal difference between the pre-trained architecture and the final downstream architecture. - -Model Architecture BERT's model architecture is a multi-layer bidirectional Transformer encoder based on the original implementation described in [Vaswani et al.](#page-10-4) [(2017)](#page-10-4) and released in the tensor2tensor library.[1](#page-2-1) Because the use of Transformers has become common and our implementation is almost identical to the original, we will omit an exhaustive background description of the model architecture and refer readers to [Vaswani et al.](#page-10-4) [(2017)](#page-10-4) as well as excellent guides such as "The Annotated Transformer."[2](#page-2-2) - -In this work, we denote the number of layers (i.e., Transformer blocks) as L, the hidden size as H, and the number of self-attention heads as A. [3](#page-2-3) We primarily report results on two model sizes: BERTBASE (L=12, H=768, A=12, Total Parameters=110M) and BERTLARGE (L=24, H=1024, A=16, Total Parameters=340M). - -BERTBASE was chosen to have the same model size as OpenAI GPT for comparison purposes. Critically, however, the BERT Transformer uses bidirectional self-attention, while the GPT Transformer uses constrained self-attention where every token can only attend to context to its left.[4](#page-2-4) - -1 https://github.com/tensorflow/tensor2tensor 2 http://nlp.seas.harvard.edu/2018/04/03/attention.html 3 In all cases we set the feed-forward/filter size to be 4H, - -i.e., 3072 for the H = 768 and 4096 for the H = 1024. 4We note that in the literature the bidirectional Trans- - -Input/Output Representations To make BERT handle a variety of down-stream tasks, our input representation is able to unambiguously represent both a single sentence and a pair of sentences (e.g., h Question, Answeri) in one token sequence. Throughout this work, a "sentence" can be an arbitrary span of contiguous text, rather than an actual linguistic sentence. A "sequence" refers to the input token sequence to BERT, which may be a single sentence or two sentences packed together. - -We use WordPiece embeddings [(Wu et al.,](#page-11-2) [2016)](#page-11-2) with a 30,000 token vocabulary. The first token of every sequence is always a special classification token ([CLS]). The final hidden state corresponding to this token is used as the aggregate sequence representation for classification tasks. Sentence pairs are packed together into a single sequence. We differentiate the sentences in two ways. First, we separate them with a special token ([SEP]). Second, we add a learned embedding to every token indicating whether it belongs to sentence A or sentence B. As shown in Figure [1,](#page-2-0) we denote input embedding as E, the final hidden vector of the special [CLS] token as C ∈ R H, and the final hidden vector for the i th input token as Ti ∈ R H. - -For a given token, its input representation is constructed by summing the corresponding token, segment, and position embeddings. 
A visualization of this construction can be seen in Figure [2.](#page-4-0) - -### 3.1 Pre-training BERT - -Unlike [Peters et al.](#page-10-0) [(2018a)](#page-10-0) and [Radford et al.](#page-10-1) [(2018)](#page-10-1), we do not use traditional left-to-right or right-to-left language models to pre-train BERT. Instead, we pre-train BERT using two unsupervised tasks, described in this section. This step is presented in the left part of Figure [1.](#page-2-0) - -Task #1: Masked LM Intuitively, it is reasonable to believe that a deep bidirectional model is strictly more powerful than either a left-to-right model or the shallow concatenation of a left-toright and a right-to-left model. Unfortunately, standard conditional language models can only be trained left-to-right *or* right-to-left, since bidirectional conditioning would allow each word to indirectly "see itself", and the model could trivially predict the target word in a multi-layered context. - -In order to train a deep bidirectional representation, we simply mask some percentage of the input tokens at random, and then predict those masked tokens. We refer to this procedure as a "masked LM" (MLM), although it is often referred to as a *Cloze* task in the literature [(Taylor,](#page-10-5) [1953)](#page-10-5). In this case, the final hidden vectors corresponding to the mask tokens are fed into an output softmax over the vocabulary, as in a standard LM. In all of our experiments, we mask 15% of all WordPiece tokens in each sequence at random. In contrast to denoising auto-encoders [(Vincent et al.,](#page-10-18) [2008)](#page-10-18), we only predict the masked words rather than reconstructing the entire input. - -Although this allows us to obtain a bidirectional pre-trained model, a downside is that we are creating a mismatch between pre-training and fine-tuning, since the [MASK] token does not appear during fine-tuning. To mitigate this, we do not always replace "masked" words with the actual [MASK] token. The training data generator chooses 15% of the token positions at random for prediction. If the i-th token is chosen, we replace the i-th token with (1) the [MASK] token 80% of the time (2) a random token 10% of the time (3) the unchanged i-th token 10% of the time. Then, Ti will be used to predict the original token with cross entropy loss. We compare variations of this procedure in Appendix [C.2.](#page-15-0) - -Task #2: Next Sentence Prediction (NSP) Many important downstream tasks such as Question Answering (QA) and Natural Language Inference (NLI) are based on understanding the *relationship* between two sentences, which is not directly captured by language modeling. In order to train a model that understands sentence relationships, we pre-train for a binarized *next sentence prediction* task that can be trivially generated from any monolingual corpus. Specifically, when choosing the sentences A and B for each pretraining example, 50% of the time B is the actual next sentence that follows A (labeled as IsNext), and 50% of the time it is a random sentence from the corpus (labeled as NotNext). As we show in Figure [1,](#page-2-0) C is used for next sentence prediction (NSP).[5](#page-3-0) Despite its simplicity, we demonstrate in Section [5.1](#page-7-0) that pre-training towards this task is very beneficial to both QA and NLI. [6](#page-3-1) - -former is often referred to as a "Transformer encoder" while the left-context-only version is referred to as a "Transformer decoder" since it can be used for text generation. 
- -5The final model achieves 97%-98% accuracy on NSP. - -6The vector C is not a meaningful sentence representation without fine-tuning, since it was trained with NSP. - -Figure 2: BERT input representation. The input embeddings are the sum of the token embeddings, the segmentation embeddings and the position embeddings. - -The NSP task is closely related to representationlearning objectives used in [Jernite et al.](#page-9-7) [(2017)](#page-9-7) and [Logeswaran and Lee](#page-10-11) [(2018)](#page-10-11). However, in prior work, only sentence embeddings are transferred to down-stream tasks, where BERT transfers all parameters to initialize end-task model parameters. - -Pre-training data The pre-training procedure largely follows the existing literature on language model pre-training. For the pre-training corpus we use the BooksCorpus (800M words) [(Zhu et al.,](#page-11-3) [2015)](#page-11-3) and English Wikipedia (2,500M words). For Wikipedia we extract only the text passages and ignore lists, tables, and headers. It is critical to use a document-level corpus rather than a shuffled sentence-level corpus such as the Billion Word Benchmark [(Chelba et al.,](#page-9-13) [2013)](#page-9-13) in order to extract long contiguous sequences. - -#### 3.2 Fine-tuning BERT - -Fine-tuning is straightforward since the selfattention mechanism in the Transformer allows BERT to model many downstream tasks whether they involve single text or text pairs—by swapping out the appropriate inputs and outputs. For applications involving text pairs, a common pattern is to independently encode text pairs before applying bidirectional cross attention, such as [Parikh et al.](#page-10-19) [(2016)](#page-10-19); [Seo et al.](#page-10-20) [(2017)](#page-10-20). BERT instead uses the self-attention mechanism to unify these two stages, as encoding a concatenated text pair with self-attention effectively includes *bidirectional* cross attention between two sentences. - -For each task, we simply plug in the taskspecific inputs and outputs into BERT and finetune all the parameters end-to-end. At the input, sentence A and sentence B from pre-training are analogous to (1) sentence pairs in paraphrasing, (2) hypothesis-premise pairs in entailment, (3) question-passage pairs in question answering, and (4) a degenerate text-∅ pair in text classification or sequence tagging. At the output, the token representations are fed into an output layer for tokenlevel tasks, such as sequence tagging or question answering, and the [CLS] representation is fed into an output layer for classification, such as entailment or sentiment analysis. - -Compared to pre-training, fine-tuning is relatively inexpensive. All of the results in the paper can be replicated in at most 1 hour on a single Cloud TPU, or a few hours on a GPU, starting from the exact same pre-trained model.[7](#page-4-1) We describe the task-specific details in the corresponding subsections of Section [4.](#page-4-2) More details can be found in Appendix [A.5.](#page-13-0) - -### 4 Experiments - -In this section, we present BERT fine-tuning results on 11 NLP tasks. - -#### 4.1 GLUE - -The General Language Understanding Evaluation (GLUE) benchmark [(Wang et al.,](#page-10-16) [2018a)](#page-10-16) is a collection of diverse natural language understanding tasks. 
Detailed descriptions of GLUE datasets are included in Appendix [B.1.](#page-13-1) - -To fine-tune on GLUE, we represent the input sequence (for single sentence or sentence pairs) as described in Section [3,](#page-2-5) and use the final hidden vector C ∈ R H corresponding to the first input token ([CLS]) as the aggregate representation. The only new parameters introduced during fine-tuning are classification layer weights W ∈ R K×H, where K is the number of labels. We compute a standard classification loss with C and W, i.e., log(softmax(CWT )). - -- 8 See (10) in . -7 For example, the BERT SQuAD model can be trained in around 30 minutes on a single Cloud TPU to achieve a Dev F1 score of 91.0%. - - - -| System | MNLI-(m/mm) | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | Average | -|------------------|-------------|------|------|-------|------|-------|------|------|---------| -| | 392k | 363k | 108k | 67k | 8.5k | 5.7k | 3.5k | 2.5k | - | -| Pre-OpenAI SOTA | 80.6/80.1 | 66.1 | 82.3 | 93.2 | 35.0 | 81.0 | 86.0 | 61.7 | 74.0 | -| BiLSTM+ELMo+Attn | 76.4/76.1 | 64.8 | 79.8 | 90.4 | 36.0 | 73.3 | 84.9 | 56.8 | 71.0 | -| OpenAI GPT | 82.1/81.4 | 70.3 | 87.4 | 91.3 | 45.4 | 80.0 | 82.3 | 56.0 | 75.1 | -| BERTBASE | 84.6/83.4 | 71.2 | 90.5 | 93.5 | 52.1 | 85.8 | 88.9 | 66.4 | 79.6 | -| BERTLARGE | 86.7/85.9 | 72.1 | 92.7 | 94.9 | 60.5 | 86.5 | 89.3 | 70.1 | 82.1 | - -Table 1: GLUE Test results, scored by the evaluation server (). The number below each task denotes the number of training examples. The "Average" column is slightly different than the official GLUE score, since we exclude the problematic WNLI set.[8](#page-4-3) BERT and OpenAI GPT are singlemodel, single task. F1 scores are reported for QQP and MRPC, Spearman correlations are reported for STS-B, and accuracy scores are reported for the other tasks. We exclude entries that use BERT as one of their components. - -We use a batch size of 32 and fine-tune for 3 epochs over the data for all GLUE tasks. For each task, we selected the best fine-tuning learning rate (among 5e-5, 4e-5, 3e-5, and 2e-5) on the Dev set. Additionally, for BERTLARGE we found that finetuning was sometimes unstable on small datasets, so we ran several random restarts and selected the best model on the Dev set. With random restarts, we use the same pre-trained checkpoint but perform different fine-tuning data shuffling and classifier layer initialization.[9](#page-5-0) - -Results are presented in Table [1.](#page-5-1) Both BERTBASE and BERTLARGE outperform all systems on all tasks by a substantial margin, obtaining 4.5% and 7.0% respective average accuracy improvement over the prior state of the art. Note that BERTBASE and OpenAI GPT are nearly identical in terms of model architecture apart from the attention masking. For the largest and most widely reported GLUE task, MNLI, BERT obtains a 4.6% absolute accuracy improvement. On the official GLUE leaderboard[10](#page-5-2), BERTLARGE obtains a score of 80.5, compared to OpenAI GPT, which obtains 72.8 as of the date of writing. - -We find that BERTLARGE significantly outperforms BERTBASE across all tasks, especially those with very little training data. The effect of model size is explored more thoroughly in Section [5.2.](#page-7-1) - -#### 4.2 SQuAD v1.1 - -The Stanford Question Answering Dataset (SQuAD v1.1) is a collection of 100k crowdsourced question/answer pairs [(Rajpurkar et al.,](#page-10-3) [2016)](#page-10-3). 
Given a question and a passage from Wikipedia containing the answer, the task is to predict the answer text span in the passage. - -As shown in Figure [1,](#page-2-0) in the question answering task, we represent the input question and passage as a single packed sequence, with the question using the A embedding and the passage using the B embedding. We only introduce a start vector S ∈ R H and an end vector E ∈ R H during fine-tuning. The probability of word i being the start of the answer span is computed as a dot product between Ti and S followed by a softmax over all of the words in the paragraph: Pi = e S·Ti P j e S·Tj . The analogous formula is used for the end of the answer span. The score of a candidate span from position i to position j is defined as S·Ti + E·Tj , and the maximum scoring span where j ≄ i is used as a prediction. The training objective is the sum of the log-likelihoods of the correct start and end positions. We fine-tune for 3 epochs with a learning rate of 5e-5 and a batch size of 32. - -Table [2](#page-6-0) shows top leaderboard entries as well as results from top published systems [(Seo et al.,](#page-10-20) [2017;](#page-10-20) [Clark and Gardner,](#page-9-14) [2018;](#page-9-14) [Peters et al.,](#page-10-0) [2018a;](#page-10-0) [Hu et al.,](#page-9-15) [2018)](#page-9-15). The top results from the SQuAD leaderboard do not have up-to-date public system descriptions available,[11](#page-5-3) and are allowed to use any public data when training their systems. We therefore use modest data augmentation in our system by first fine-tuning on TriviaQA [(Joshi](#page-10-21) [et al.,](#page-10-21) [2017)](#page-10-21) befor fine-tuning on SQuAD. - -Our best performing system outperforms the top leaderboard system by +1.5 F1 in ensembling and +1.3 F1 as a single system. In fact, our single BERT model outperforms the top ensemble system in terms of F1 score. Without TriviaQA fine- - -9The GLUE data set distribution does not include the Test labels, and we only made a single GLUE evaluation server submission for each of BERTBASE and BERTLARGE. - -10https://gluebenchmark.com/leaderboard - -11QANet is described in [Yu et al.](#page-11-4) [(2018)](#page-11-4), but the system has improved substantially after publication. - - - -| System | Dev | | Test | | | | | | -|------------------------------------------|------|------|------|------|--|--|--|--| -| | EM | F1 | EM | F1 | | | | | -| Top Leaderboard Systems (Dec 10th, 2018) | | | | | | | | | -| Human | - | - | 82.3 | 91.2 | | | | | -| #1 Ensemble - nlnet | - | - | 86.0 | 91.7 | | | | | -| #2 Ensemble - QANet | - | - | 84.5 | 90.5 | | | | | -| Published | | | | | | | | | -| BiDAF+ELMo (Single) | - | 85.6 | - | 85.8 | | | | | -| R.M. Reader (Ensemble) | 81.2 | 87.9 | 82.3 | 88.5 | | | | | -| Ours | | | | | | | | | -| BERTBASE (Single) | 80.8 | 88.5 | - | - | | | | | -| BERTLARGE (Single) | 84.1 | 90.9 | - | - | | | | | -| BERTLARGE (Ensemble) | 85.8 | 91.8 | - | - | | | | | -| BERTLARGE (Sgl.+TriviaQA) | 84.2 | 91.1 | 85.1 | 91.8 | | | | | -| BERTLARGE (Ens.+TriviaQA) | 86.2 | 92.2 | 87.4 | 93.2 | | | | | - -Table 2: SQuAD 1.1 results. The BERT ensemble is 7x systems which use different pre-training checkpoints and fine-tuning seeds. 
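A small sketch of the span selection described above for SQuAD fine-tuning; the names are illustrative, with `start_scores[i]` standing for S·Ti and `end_scores[j]` for E·Tj.

```python
import numpy as np

# Illustrative span selection: score(i, j) = S.Ti + E.Tj,
# maximized over candidate spans with j >= i.
def best_span(start_scores: np.ndarray, end_scores: np.ndarray) -> tuple[int, int]:
    pair_scores = start_scores[:, None] + end_scores[None, :]   # score of every (i, j)
    valid = np.triu(np.ones_like(pair_scores, dtype=bool))      # keep spans with j >= i
    pair_scores = np.where(valid, pair_scores, -np.inf)
    i, j = np.unravel_index(np.argmax(pair_scores), pair_scores.shape)
    return int(i), int(j)

# Toy example over a 10-token passage with random scores.
rng = np.random.default_rng(0)
print(best_span(rng.normal(size=10), rng.normal(size=10)))
```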
- - - -| System | Dev | | Test | | | | | | -|------------------------------------------|------|------|------|------|--|--|--|--| -| | EM | F1 | EM | F1 | | | | | -| Top Leaderboard Systems (Dec 10th, 2018) | | | | | | | | | -| Human | 86.3 | 89.0 | 86.9 | 89.5 | | | | | -| #1 Single - MIR-MRC (F-Net) | - | - | 74.8 | 78.0 | | | | | -| #2 Single - nlnet | - | - | 74.2 | 77.1 | | | | | -| Published | | | | | | | | | -| unet (Ensemble) | - | - | 71.4 | 74.9 | | | | | -| SLQA+ (Single) | - | | 71.4 | 74.4 | | | | | -| Ours | | | | | | | | | -| BERTLARGE (Single) | 78.7 | 81.9 | 80.0 | 83.1 | | | | | - -Table 3: SQuAD 2.0 results. We exclude entries that use BERT as one of their components. - -tuning data, we only lose 0.1-0.4 F1, still outperforming all existing systems by a wide margin.[12](#page-6-1) - -#### 4.3 SQuAD v2.0 - -The SQuAD 2.0 task extends the SQuAD 1.1 problem definition by allowing for the possibility that no short answer exists in the provided paragraph, making the problem more realistic. - -We use a simple approach to extend the SQuAD v1.1 BERT model for this task. We treat questions that do not have an answer as having an answer span with start and end at the [CLS] token. The probability space for the start and end answer span positions is extended to include the position of the [CLS] token. For prediction, we compare the score of the no-answer span: snull = S·C + E·C to the score of the best non-null span - - - -| System | Dev | Test | -|------------------------|------|------| -| ESIM+GloVe | 51.9 | 52.7 | -| ESIM+ELMo | 59.1 | 59.2 | -| OpenAI GPT | - | 78.0 | -| BERTBASE | 81.6 | - | -| BERTLARGE | 86.6 | 86.3 | -| Human (expert)† | - | 85.0 | -| Human (5 annotations)† | - | 88.0 | - -Table 4: SWAG Dev and Test accuracies. †Human performance is measured with 100 samples, as reported in the SWAG paper. - -sˆi,j = maxj≄iS·Ti + E·Tj . We predict a non-null answer when sˆi,j > snull + τ , where the threshold τ is selected on the dev set to maximize F1. We did not use TriviaQA data for this model. We fine-tuned for 2 epochs with a learning rate of 5e-5 and a batch size of 48. - -The results compared to prior leaderboard entries and top published work [(Sun et al.,](#page-10-22) [2018;](#page-10-22) [Wang et al.,](#page-11-5) [2018b)](#page-11-5) are shown in Table [3,](#page-6-2) excluding systems that use BERT as one of their components. We observe a +5.1 F1 improvement over the previous best system. - -#### 4.4 SWAG - -The Situations With Adversarial Generations (SWAG) dataset contains 113k sentence-pair completion examples that evaluate grounded commonsense inference [(Zellers et al.,](#page-11-6) [2018)](#page-11-6). Given a sentence, the task is to choose the most plausible continuation among four choices. - -When fine-tuning on the SWAG dataset, we construct four input sequences, each containing the concatenation of the given sentence (sentence A) and a possible continuation (sentence B). The only task-specific parameters introduced is a vector whose dot product with the [CLS] token representation C denotes a score for each choice which is normalized with a softmax layer. - -We fine-tune the model for 3 epochs with a learning rate of 2e-5 and a batch size of 16. Results are presented in Table [4.](#page-6-3) BERTLARGE outperforms the authors' baseline ESIM+ELMo system by +27.1% and OpenAI GPT by 8.3%. - -### 5 Ablation Studies - -In this section, we perform ablation experiments over a number of facets of BERT in order to better understand their relative importance. 
Additional - -12The TriviaQA data we used consists of paragraphs from TriviaQA-Wiki formed of the first 400 tokens in documents, that contain at least one of the provided possible answers. - - - -| | Dev Set | | | | | | | -|--------------|---------|-------|-------|-------|-------|--|--| -| Tasks | MNLI-m | QNLI | MRPC | SST-2 | SQuAD | | | -| | (Acc) | (Acc) | (Acc) | (Acc) | (F1) | | | -| BERTBASE | 84.4 | 88.4 | 86.7 | 92.7 | 88.5 | | | -| No NSP | 83.9 | 84.9 | 86.5 | 92.6 | 87.9 | | | -| LTR & No NSP | 82.1 | 84.3 | 77.5 | 92.1 | 77.8 | | | -| + BiLSTM | 82.1 | 84.1 | 75.7 | 91.6 | 84.9 | | | - -Table 5: Ablation over the pre-training tasks using the BERTBASE architecture. "No NSP" is trained without the next sentence prediction task. "LTR & No NSP" is trained as a left-to-right LM without the next sentence prediction, like OpenAI GPT. "+ BiLSTM" adds a randomly initialized BiLSTM on top of the "LTR + No NSP" model during fine-tuning. - -ablation studies can be found in Appendix [C.](#page-15-1) - -### 5.1 Effect of Pre-training Tasks - -We demonstrate the importance of the deep bidirectionality of BERT by evaluating two pretraining objectives using exactly the same pretraining data, fine-tuning scheme, and hyperparameters as BERTBASE: - -No NSP: A bidirectional model which is trained using the "masked LM" (MLM) but without the "next sentence prediction" (NSP) task. - -LTR & No NSP: A left-context-only model which is trained using a standard Left-to-Right (LTR) LM, rather than an MLM. The left-only constraint was also applied at fine-tuning, because removing it introduced a pre-train/fine-tune mismatch that degraded downstream performance. Additionally, this model was pre-trained without the NSP task. This is directly comparable to OpenAI GPT, but using our larger training dataset, our input representation, and our fine-tuning scheme. - -We first examine the impact brought by the NSP task. In Table [5,](#page-7-2) we show that removing NSP hurts performance significantly on QNLI, MNLI, and SQuAD 1.1. Next, we evaluate the impact of training bidirectional representations by comparing "No NSP" to "LTR & No NSP". The LTR model performs worse than the MLM model on all tasks, with large drops on MRPC and SQuAD. - -For SQuAD it is intuitively clear that a LTR model will perform poorly at token predictions, since the token-level hidden states have no rightside context. In order to make a good faith attempt at strengthening the LTR system, we added a randomly initialized BiLSTM on top. This does significantly improve results on SQuAD, but the results are still far worse than those of the pretrained bidirectional models. The BiLSTM hurts performance on the GLUE tasks. - -We recognize that it would also be possible to train separate LTR and RTL models and represent each token as the concatenation of the two models, as ELMo does. However: (a) this is twice as expensive as a single bidirectional model; (b) this is non-intuitive for tasks like QA, since the RTL model would not be able to condition the answer on the question; (c) this it is strictly less powerful than a deep bidirectional model, since it can use both left and right context at every layer. - -### 5.2 Effect of Model Size - -In this section, we explore the effect of model size on fine-tuning task accuracy. We trained a number of BERT models with a differing number of layers, hidden units, and attention heads, while otherwise using the same hyperparameters and training procedure as described previously. 
- -Results on selected GLUE tasks are shown in Table [6.](#page-8-0) In this table, we report the average Dev Set accuracy from 5 random restarts of fine-tuning. We can see that larger models lead to a strict accuracy improvement across all four datasets, even for MRPC which only has 3,600 labeled training examples, and is substantially different from the pre-training tasks. It is also perhaps surprising that we are able to achieve such significant improvements on top of models which are already quite large relative to the existing literature. For example, the largest Transformer explored in [Vaswani et al.](#page-10-4) [(2017)](#page-10-4) is (L=6, H=1024, A=16) with 100M parameters for the encoder, and the largest Transformer we have found in the literature is (L=64, H=512, A=2) with 235M parameters [(Al-Rfou et al.,](#page-9-16) [2018)](#page-9-16). By contrast, BERTBASE contains 110M parameters and BERTLARGE contains 340M parameters. - -It has long been known that increasing the model size will lead to continual improvements on large-scale tasks such as machine translation and language modeling, which is demonstrated by the LM perplexity of held-out training data shown in Table [6.](#page-8-0) However, we believe that this is the first work to demonstrate convincingly that scaling to extreme model sizes also leads to large improvements on very small scale tasks, provided that the model has been sufficiently pre-trained. [Peters et al.](#page-10-23) [(2018b)](#page-10-23) presented mixed results on the downstream task impact of increasing the pre-trained bi-LM size from two to four layers and [Melamud et al.](#page-10-15) [(2016)](#page-10-15) mentioned in passing that increasing hidden dimension size from 200 to 600 helped, but increasing further to 1,000 did not bring further improvements. Both of these prior works used a featurebased approach — we hypothesize that when the model is fine-tuned directly on the downstream tasks and uses only a very small number of randomly initialized additional parameters, the taskspecific models can benefit from the larger, more expressive pre-trained representations even when downstream task data is very small. - -#### 5.3 Feature-based Approach with BERT - -All of the BERT results presented so far have used the fine-tuning approach, where a simple classification layer is added to the pre-trained model, and all parameters are jointly fine-tuned on a downstream task. However, the feature-based approach, where fixed features are extracted from the pretrained model, has certain advantages. First, not all tasks can be easily represented by a Transformer encoder architecture, and therefore require a task-specific model architecture to be added. Second, there are major computational benefits to pre-compute an expensive representation of the training data once and then run many experiments with cheaper models on top of this representation. - -In this section, we compare the two approaches by applying BERT to the CoNLL-2003 Named Entity Recognition (NER) task [(Tjong Kim Sang](#page-10-2) [and De Meulder,](#page-10-2) [2003)](#page-10-2). In the input to BERT, we use a case-preserving WordPiece model, and we include the maximal document context provided by the data. 
Following standard practice, we formulate this as a tagging task but do not use a CRF - - - -| | Hyperparams | | | Dev Set Accuracy | | | | | -|-------------------------|----------------------------------|---------------------------|--------------------------------------|--------------------------------------|--------------------------------------|--------------------------------------|--|--| -| #L | #H | #A | LM (ppl) | MNLI-m | MRPC | SST-2 | | | -| 3
-| 3 | 768 | 12 | 5.84 | 77.9 | 79.8 | 88.4 | | |
-| 6 | 768 | 3 | 5.24 | 80.6 | 82.2 | 90.7 | | |
-| 6 | 768 | 12 | 4.68 | 81.9 | 84.8 | 91.3 | | |
-| 12 | 768 | 12 | 3.99 | 84.4 | 86.7 | 92.9 | | |
-| 12 | 1024 | 16 | 3.54 | 85.7 | 86.9 | 93.3 | | |
93.3 | | | -| 24 | 1024 | 16 | 3.23 | 86.6 | 87.8 | 93.7 | | | - -Table 6: Ablation over BERT model size. #L = the number of layers; #H = hidden size; #A = number of attention heads. "LM (ppl)" is the masked LM perplexity of held-out training data. - - - -| System | Dev F1 | Test F1 | -|-----------------------------------|--------|---------| -| ELMo (Peters et al., 2018a) | 95.7 | 92.2 | -| CVT (Clark et al., 2018) | - | 92.6 | -| CSE (Akbik et al., 2018) | - | 93.1 | -| Fine-tuning approach | | | -| BERTLARGE | 96.6 | 92.8 | -| BERTBASE | 96.4 | 92.4 | -| Feature-based approach (BERTBASE) | | | -| Embeddings | 91.0 | - | -| Second-to-Last Hidden | 95.6 | - | -| Last Hidden | 94.9 | - | -| Weighted Sum Last Four Hidden | 95.9 | - | -| Concat Last Four Hidden | 96.1 | - | -| Weighted Sum All 12 Layers | 95.5 | - | - -Table 7: CoNLL-2003 Named Entity Recognition results. Hyperparameters were selected using the Dev set. The reported Dev and Test scores are averaged over 5 random restarts using those hyperparameters. - -layer in the output. We use the representation of the first sub-token as the input to the token-level classifier over the NER label set. - -To ablate the fine-tuning approach, we apply the feature-based approach by extracting the activations from one or more layers *without* fine-tuning any parameters of BERT. These contextual embeddings are used as input to a randomly initialized two-layer 768-dimensional BiLSTM before the classification layer. - -Results are presented in Table [7.](#page-8-1) BERTLARGE performs competitively with state-of-the-art methods. The best performing method concatenates the token representations from the top four hidden layers of the pre-trained Transformer, which is only 0.3 F1 behind fine-tuning the entire model. This demonstrates that BERT is effective for both finetuning and feature-based approaches. - -### 6 Conclusion - -Recent empirical improvements due to transfer learning with language models have demonstrated that rich, unsupervised pre-training is an integral part of many language understanding systems. In particular, these results enable even low-resource tasks to benefit from deep unidirectional architectures. Our major contribution is further generalizing these findings to deep *bidirectional* architectures, allowing the same pre-trained model to successfully tackle a broad set of NLP tasks. - -### References - -- Alan Akbik, Duncan Blythe, and Roland Vollgraf. 2018. Contextual string embeddings for sequence labeling. In *Proceedings of the 27th International Conference on Computational Linguistics*, pages 1638–1649. -- Rami Al-Rfou, Dokook Choe, Noah Constant, Mandy Guo, and Llion Jones. 2018. Character-level language modeling with deeper self-attention. *arXiv preprint arXiv:1808.04444*. -- Rie Kubota Ando and Tong Zhang. 2005. A framework for learning predictive structures from multiple tasks and unlabeled data. *Journal of Machine Learning Research*, 6(Nov):1817–1853. -- Luisa Bentivogli, Bernardo Magnini, Ido Dagan, Hoa Trang Dang, and Danilo Giampiccolo. 2009. The fifth PASCAL recognizing textual entailment challenge. In *TAC*. NIST. -- John Blitzer, Ryan McDonald, and Fernando Pereira. 2006. Domain adaptation with structural correspondence learning. In *Proceedings of the 2006 conference on empirical methods in natural language processing*, pages 120–128. Association for Computational Linguistics. -- Samuel R. Bowman, Gabor Angeli, Christopher Potts, and Christopher D. Manning. 2015. 
A large annotated corpus for learning natural language inference. In *EMNLP*. Association for Computational Linguistics. -- Peter F Brown, Peter V Desouza, Robert L Mercer, Vincent J Della Pietra, and Jenifer C Lai. 1992. Class-based n-gram models of natural language. *Computational linguistics*, 18(4):467–479. -- Daniel Cer, Mona Diab, Eneko Agirre, Inigo Lopez-Gazpio, and Lucia Specia. 2017. [Semeval-2017](https://doi.org/10.18653/v1/S17-2001) [task 1: Semantic textual similarity multilingual and](https://doi.org/10.18653/v1/S17-2001) [crosslingual focused evaluation.](https://doi.org/10.18653/v1/S17-2001) In *Proceedings of the 11th International Workshop on Semantic Evaluation (SemEval-2017)*, pages 1–14, Vancouver, Canada. Association for Computational Linguistics. -- Ciprian Chelba, Tomas Mikolov, Mike Schuster, Qi Ge, Thorsten Brants, Phillipp Koehn, and Tony Robinson. 2013. One billion word benchmark for measuring progress in statistical language modeling. *arXiv preprint arXiv:1312.3005*. -- Z. Chen, H. Zhang, X. Zhang, and L. Zhao. 2018. [Quora question pairs.](https://data.quora.com/First-Quora-Dataset-Release-Question-Pairs) -- Christopher Clark and Matt Gardner. 2018. Simple and effective multi-paragraph reading comprehension. In *ACL*. -- Kevin Clark, Minh-Thang Luong, Christopher D Manning, and Quoc Le. 2018. Semi-supervised sequence modeling with cross-view training. In *Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing*, pages 1914– 1925. -- Ronan Collobert and Jason Weston. 2008. A unified architecture for natural language processing: Deep neural networks with multitask learning. In *Proceedings of the 25th international conference on Machine learning*, pages 160–167. ACM. -- Alexis Conneau, Douwe Kiela, Holger Schwenk, Lošıc Barrault, and Antoine Bordes. 2017. [Supervised](https://www.aclweb.org/anthology/D17-1070) [learning of universal sentence representations from](https://www.aclweb.org/anthology/D17-1070) [natural language inference data.](https://www.aclweb.org/anthology/D17-1070) In *Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing*, pages 670–680, Copenhagen, Denmark. Association for Computational Linguistics. -- Andrew M Dai and Quoc V Le. 2015. Semi-supervised sequence learning. In *Advances in neural information processing systems*, pages 3079–3087. -- J. Deng, W. Dong, R. Socher, L.-J. Li, K. Li, and L. Fei-Fei. 2009. ImageNet: A Large-Scale Hierarchical Image Database. In *CVPR09*. -- William B Dolan and Chris Brockett. 2005. Automatically constructing a corpus of sentential paraphrases. In *Proceedings of the Third International Workshop on Paraphrasing (IWP2005)*. -- William Fedus, Ian Goodfellow, and Andrew M Dai. 2018. Maskgan: Better text generation via filling in the . *arXiv preprint arXiv:1801.07736*. -- Dan Hendrycks and Kevin Gimpel. 2016. [Bridging](http://arxiv.org/abs/1606.08415) [nonlinearities and stochastic regularizers with gaus](http://arxiv.org/abs/1606.08415)[sian error linear units.](http://arxiv.org/abs/1606.08415) *CoRR*, abs/1606.08415. -- Felix Hill, Kyunghyun Cho, and Anna Korhonen. 2016. Learning distributed representations of sentences from unlabelled data. In *Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies*. Association for Computational Linguistics. -- Jeremy Howard and Sebastian Ruder. 2018. 
[Universal](http://arxiv.org/abs/1801.06146) [language model fine-tuning for text classification.](http://arxiv.org/abs/1801.06146) In *ACL*. Association for Computational Linguistics. -- Minghao Hu, Yuxing Peng, Zhen Huang, Xipeng Qiu, Furu Wei, and Ming Zhou. 2018. Reinforced mnemonic reader for machine reading comprehension. In *IJCAI*. -- Yacine Jernite, Samuel R. Bowman, and David Sontag. 2017. [Discourse-based objectives for fast un](http://arxiv.org/abs/1705.00557)[supervised sentence representation learning.](http://arxiv.org/abs/1705.00557) *CoRR*, abs/1705.00557. -- Mandar Joshi, Eunsol Choi, Daniel S Weld, and Luke Zettlemoyer. 2017. Triviaqa: A large scale distantly supervised challenge dataset for reading comprehension. In *ACL*. -- Ryan Kiros, Yukun Zhu, Ruslan R Salakhutdinov, Richard Zemel, Raquel Urtasun, Antonio Torralba, and Sanja Fidler. 2015. Skip-thought vectors. In *Advances in neural information processing systems*, pages 3294–3302. -- Quoc Le and Tomas Mikolov. 2014. Distributed representations of sentences and documents. In *International Conference on Machine Learning*, pages 1188–1196. -- Hector J Levesque, Ernest Davis, and Leora Morgenstern. 2011. The winograd schema challenge. In *Aaai spring symposium: Logical formalizations of commonsense reasoning*, volume 46, page 47. -- Lajanugen Logeswaran and Honglak Lee. 2018. [An](https://openreview.net/forum?id=rJvJXZb0W) [efficient framework for learning sentence represen](https://openreview.net/forum?id=rJvJXZb0W)[tations.](https://openreview.net/forum?id=rJvJXZb0W) In *International Conference on Learning Representations*. -- Bryan McCann, James Bradbury, Caiming Xiong, and Richard Socher. 2017. Learned in translation: Contextualized word vectors. In *NIPS*. -- Oren Melamud, Jacob Goldberger, and Ido Dagan. 2016. context2vec: Learning generic context embedding with bidirectional LSTM. In *CoNLL*. -- Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg S Corrado, and Jeff Dean. 2013. Distributed representations of words and phrases and their compositionality. In *Advances in Neural Information Processing Systems 26*, pages 3111–3119. Curran Associates, Inc. -- Andriy Mnih and Geoffrey E Hinton. 2009. [A scal](http://papers.nips.cc/paper/3583-a-scalable-hierarchical-distributed-language-model.pdf)[able hierarchical distributed language model.](http://papers.nips.cc/paper/3583-a-scalable-hierarchical-distributed-language-model.pdf) In D. Koller, D. Schuurmans, Y. Bengio, and L. Bottou, editors, *Advances in Neural Information Processing Systems 21*, pages 1081–1088. Curran Associates, Inc. -- Ankur P Parikh, Oscar Tackstr š om, Dipanjan Das, and š Jakob Uszkoreit. 2016. A decomposable attention model for natural language inference. In *EMNLP*. -- Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. [Glove: Global vectors for](http://www.aclweb.org/anthology/D14-1162) [word representation.](http://www.aclweb.org/anthology/D14-1162) In *Empirical Methods in Natural Language Processing (EMNLP)*, pages 1532– 1543. -- Matthew Peters, Waleed Ammar, Chandra Bhagavatula, and Russell Power. 2017. Semi-supervised sequence tagging with bidirectional language models. In *ACL*. -- Matthew Peters, Mark Neumann, Mohit Iyyer, Matt Gardner, Christopher Clark, Kenton Lee, and Luke Zettlemoyer. 2018a. Deep contextualized word representations. In *NAACL*. -- Matthew Peters, Mark Neumann, Luke Zettlemoyer, and Wen-tau Yih. 2018b. Dissecting contextual word embeddings: Architecture and representation. 
In *Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing*, pages 1499–1509. -- Alec Radford, Karthik Narasimhan, Tim Salimans, and Ilya Sutskever. 2018. Improving language understanding with unsupervised learning. Technical report, OpenAI. -- Pranav Rajpurkar, Jian Zhang, Konstantin Lopyrev, and Percy Liang. 2016. Squad: 100,000+ questions for machine comprehension of text. In *Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing*, pages 2383–2392. -- Minjoon Seo, Aniruddha Kembhavi, Ali Farhadi, and Hannaneh Hajishirzi. 2017. Bidirectional attention flow for machine comprehension. In *ICLR*. -- Richard Socher, Alex Perelygin, Jean Wu, Jason Chuang, Christopher D Manning, Andrew Ng, and Christopher Potts. 2013. Recursive deep models for semantic compositionality over a sentiment treebank. In *Proceedings of the 2013 conference on empirical methods in natural language processing*, pages 1631–1642. -- Fu Sun, Linyang Li, Xipeng Qiu, and Yang Liu. 2018. U-net: Machine reading comprehension with unanswerable questions. *arXiv preprint arXiv:1810.06638*. -- Wilson L Taylor. 1953. Cloze procedure: A new tool for measuring readability. *Journalism Bulletin*, 30(4):415–433. -- Erik F Tjong Kim Sang and Fien De Meulder. 2003. Introduction to the conll-2003 shared task: Language-independent named entity recognition. In *CoNLL*. -- Joseph Turian, Lev Ratinov, and Yoshua Bengio. 2010. Word representations: A simple and general method for semi-supervised learning. In *Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics*, ACL '10, pages 384–394. -- Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In *Advances in Neural Information Processing Systems*, pages 6000–6010. -- Pascal Vincent, Hugo Larochelle, Yoshua Bengio, and Pierre-Antoine Manzagol. 2008. Extracting and composing robust features with denoising autoencoders. In *Proceedings of the 25th international conference on Machine learning*, pages 1096–1103. ACM. -- Alex Wang, Amanpreet Singh, Julian Michael, Felix Hill, Omer Levy, and Samuel Bowman. 2018a. Glue: A multi-task benchmark and analysis platform - -for natural language understanding. In *Proceedings of the 2018 EMNLP Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP*, pages 353–355. - -- Wei Wang, Ming Yan, and Chen Wu. 2018b. Multigranularity hierarchical attention fusion networks for reading comprehension and question answering. In *Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)*. Association for Computational Linguistics. -- Alex Warstadt, Amanpreet Singh, and Samuel R Bowman. 2018. Neural network acceptability judgments. *arXiv preprint arXiv:1805.12471*. -- Adina Williams, Nikita Nangia, and Samuel R Bowman. 2018. A broad-coverage challenge corpus for sentence understanding through inference. In *NAACL*. -- Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. 2016. Google's neural machine translation system: Bridging the gap between human and machine translation. *arXiv preprint arXiv:1609.08144*. -- Jason Yosinski, Jeff Clune, Yoshua Bengio, and Hod Lipson. 2014. How transferable are features in deep neural networks? In *Advances in neural information processing systems*, pages 3320–3328. 
-- Adams Wei Yu, David Dohan, Minh-Thang Luong, Rui Zhao, Kai Chen, Mohammad Norouzi, and Quoc V Le. 2018. QANet: Combining local convolution with global self-attention for reading comprehension. In *ICLR*. -- Rowan Zellers, Yonatan Bisk, Roy Schwartz, and Yejin Choi. 2018. Swag: A large-scale adversarial dataset for grounded commonsense inference. In *Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing (EMNLP)*. -- Yukun Zhu, Ryan Kiros, Rich Zemel, Ruslan Salakhutdinov, Raquel Urtasun, Antonio Torralba, and Sanja Fidler. 2015. Aligning books and movies: Towards story-like visual explanations by watching movies and reading books. In *Proceedings of the IEEE international conference on computer vision*, pages 19–27. - -# Appendix for "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding" - -We organize the appendix into three sections: - -- Additional implementation details for BERT are presented in Appendix [A;](#page-11-7) -- Additional details for our experiments are presented in Appendix [B;](#page-13-2) and -- Additional ablation studies are presented in Appendix [C.](#page-15-1) - -We present additional ablation studies for BERT including: - -- Effect of Number of Training Steps; and -- Ablation for Different Masking Procedures. - -### A Additional Details for BERT - -### A.1 Illustration of the Pre-training Tasks - -We provide examples of the pre-training tasks in the following. - -Masked LM and the Masking Procedure Assuming the unlabeled sentence is my dog is hairy, and during the random masking procedure we chose the 4-th token (which corresponding to hairy), our masking procedure can be further illustrated by - -- 80% of the time: Replace the word with the [MASK] token, e.g., my dog is hairy → my dog is [MASK] -- 10% of the time: Replace the word with a random word, e.g., my dog is hairy → my dog is apple -- 10% of the time: Keep the word unchanged, e.g., my dog is hairy → my dog is hairy. The purpose of this is to bias the representation towards the actual observed word. - -The advantage of this procedure is that the Transformer encoder does not know which words it will be asked to predict or which have been replaced by random words, so it is forced to keep a distributional contextual representation of *every* input token. Additionally, because random replacement only occurs for 1.5% of all tokens (i.e., 10% of 15%), this does not seem to harm the model's language understanding capability. In Section [C.2,](#page-15-0) we evaluate the impact this procedure. - -Compared to standard langauge model training, the masked LM only make predictions on 15% of tokens in each batch, which suggests that more pre-training steps may be required for the model - -Figure 3: Differences in pre-training model architectures. BERT uses a bidirectional Transformer. OpenAI GPT uses a left-to-right Transformer. ELMo uses the concatenation of independently trained left-to-right and right-toleft LSTMs to generate features for downstream tasks. Among the three, only BERT representations are jointly conditioned on both left and right context in all layers. In addition to the architecture differences, BERT and OpenAI GPT are fine-tuning approaches, while ELMo is a feature-based approach. - -to converge. In Section [C.1](#page-15-2) we demonstrate that MLM does converge marginally slower than a leftto-right model (which predicts every token), but the empirical improvements of the MLM model far outweigh the increased training cost. 
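To make the 80%/10%/10% masking recipe described above concrete, here is a minimal sketch of the per-token decision; the token list and vocabulary are toy placeholders, not the authors' implementation.

```python
import random

MASK_TOKEN = "[MASK]"

def mask_selected_token(token: str, vocab: list[str]) -> str:
    """Apply the mixed masking strategy to one token already chosen for prediction."""
    r = random.random()
    if r < 0.8:                      # 80% of the time: replace with [MASK]
        return MASK_TOKEN
    if r < 0.9:                      # 10% of the time: replace with a random word
        return random.choice(vocab)
    return token                     # 10% of the time: keep the word unchanged

tokens = ["my", "dog", "is", "hairy"]
toy_vocab = ["apple", "banana", "cat", "run"]
# Suppose the 15% selection step picked index 3 ("hairy"); only that position is predicted.
masked = list(tokens)
masked[3] = mask_selected_token(tokens[3], toy_vocab)
print(masked)   # most often ['my', 'dog', 'is', '[MASK]']
```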
- -Next Sentence Prediction The next sentence prediction task can be illustrated in the following examples. - -Input = [CLS] the man went to [MASK] store [SEP] he bought a gallon [MASK] milk [SEP] Label = IsNext - -Input = [CLS] the man [MASK] to the store [SEP] penguin [MASK] are flight ##less birds [SEP] Label = NotNext - -#### A.2 Pre-training Procedure - -To generate each training input sequence, we sample two spans of text from the corpus, which we refer to as "sentences" even though they are typically much longer than single sentences (but can be shorter also). The first sentence receives the A embedding and the second receives the B embedding. 50% of the time B is the actual next sentence that follows A and 50% of the time it is a random sentence, which is done for the "next sentence prediction" task. They are sampled such that the combined length is ≀ 512 tokens. The LM masking is applied after WordPiece tokenization with a uniform masking rate of 15%, and no special consideration given to partial word pieces. - -We train with batch size of 256 sequences (256 sequences * 512 tokens = 128,000 tokens/batch) for 1,000,000 steps, which is approximately 40 epochs over the 3.3 billion word corpus. We use Adam with learning rate of 1e-4, ÎČ1 = 0.9, ÎČ2 = 0.999, L2 weight decay of 0.01, learning rate warmup over the first 10,000 steps, and linear decay of the learning rate. We use a dropout probability of 0.1 on all layers. We use a gelu activation [(Hendrycks and Gimpel,](#page-9-19) [2016)](#page-9-19) rather than the standard relu, following OpenAI GPT. The training loss is the sum of the mean masked LM likelihood and the mean next sentence prediction likelihood. - -Training of BERTBASE was performed on 4 Cloud TPUs in Pod configuration (16 TPU chips total).[13](#page-12-0) Training of BERTLARGE was performed on 16 Cloud TPUs (64 TPU chips total). Each pretraining took 4 days to complete. - -Longer sequences are disproportionately expensive because attention is quadratic to the sequence length. To speed up pretraing in our experiments, we pre-train the model with sequence length of 128 for 90% of the steps. Then, we train the rest 10% of the steps of sequence of 512 to learn the positional embeddings. - -#### A.3 Fine-tuning Procedure - -For fine-tuning, most model hyperparameters are the same as in pre-training, with the exception of the batch size, learning rate, and number of training epochs. The dropout probability was always kept at 0.1. The optimal hyperparameter values are task-specific, but we found the following range of possible values to work well across all tasks: - -‱ Batch size: 16, 32 - -13https://cloudplatform.googleblog.com/2018/06/Cloud-TPU-now-offers-preemptible-pricing-and-globalavailability.html - -‱ Learning rate (Adam): 5e-5, 3e-5, 2e-5 - -‱ Number of epochs: 2, 3, 4 - -We also observed that large data sets (e.g., 100k+ labeled training examples) were far less sensitive to hyperparameter choice than small data sets. Fine-tuning is typically very fast, so it is reasonable to simply run an exhaustive search over the above parameters and choose the model that performs best on the development set. - -# A.4 Comparison of BERT, ELMo ,and OpenAI GPT - -Here we studies the differences in recent popular representation learning models including ELMo, OpenAI GPT and BERT. 
The comparisons between the model architectures are shown visually in Figure [3.](#page-12-1) Note that in addition to the architecture differences, BERT and OpenAI GPT are finetuning approaches, while ELMo is a feature-based approach. - -The most comparable existing pre-training method to BERT is OpenAI GPT, which trains a left-to-right Transformer LM on a large text corpus. In fact, many of the design decisions in BERT were intentionally made to make it as close to GPT as possible so that the two methods could be minimally compared. The core argument of this work is that the bi-directionality and the two pretraining tasks presented in Section [3.1](#page-3-2) account for the majority of the empirical improvements, but we do note that there are several other differences between how BERT and GPT were trained: - -- GPT is trained on the BooksCorpus (800M words); BERT is trained on the BooksCorpus (800M words) and Wikipedia (2,500M words). -- GPT uses a sentence separator ([SEP]) and classifier token ([CLS]) which are only introduced at fine-tuning time; BERT learns [SEP], [CLS] and sentence A/B embeddings during pre-training. -- GPT was trained for 1M steps with a batch size of 32,000 words; BERT was trained for 1M steps with a batch size of 128,000 words. -- GPT used the same learning rate of 5e-5 for all fine-tuning experiments; BERT chooses a task-specific fine-tuning learning rate which performs the best on the development set. - -To isolate the effect of these differences, we perform ablation experiments in Section [5.1](#page-7-0) which demonstrate that the majority of the improvements are in fact coming from the two pre-training tasks and the bidirectionality they enable. - -# A.5 Illustrations of Fine-tuning on Different Tasks - -The illustration of fine-tuning BERT on different tasks can be seen in Figure [4.](#page-14-0) Our task-specific models are formed by incorporating BERT with one additional output layer, so a minimal number of parameters need to be learned from scratch. Among the tasks, (a) and (b) are sequence-level tasks while (c) and (d) are token-level tasks. In the figure, E represents the input embedding, Ti represents the contextual representation of token i, [CLS] is the special symbol for classification output, and [SEP] is the special symbol to separate non-consecutive token sequences. - -# B Detailed Experimental Setup - -# B.1 Detailed Descriptions for the GLUE Benchmark Experiments. - -Our GLUE results in Tabl[e1](#page-5-1) are obtained from [https://gluebenchmark.com/](https://gluebenchmark.com/leaderboard) [leaderboard](https://gluebenchmark.com/leaderboard) and [https://blog.](https://blog.openai.com/language-unsupervised) [openai.com/language-unsupervised](https://blog.openai.com/language-unsupervised). The GLUE benchmark includes the following datasets, the descriptions of which were originally summarized in [Wang et al.](#page-10-16) [(2018a)](#page-10-16): - -MNLI Multi-Genre Natural Language Inference is a large-scale, crowdsourced entailment classification task [(Williams et al.,](#page-11-0) [2018)](#page-11-0). Given a pair of sentences, the goal is to predict whether the second sentence is an *entailment*, *contradiction*, or *neutral* with respect to the first one. - -QQP Quora Question Pairs is a binary classification task where the goal is to determine if two questions asked on Quora are semantically equivalent [(Chen et al.,](#page-9-20) [2018)](#page-9-20). 
- -QNLI Question Natural Language Inference is a version of the Stanford Question Answering Dataset [(Rajpurkar et al.,](#page-10-3) [2016)](#page-10-3) which has been converted to a binary classification task [(Wang](#page-10-16) [et al.,](#page-10-16) [2018a)](#page-10-16). The positive examples are (question, sentence) pairs which do contain the correct answer, and the negative examples are (question, sentence) from the same paragraph which do not contain the answer. - -Figure 4: Illustrations of Fine-tuning BERT on Different Tasks. - -SST-2 The Stanford Sentiment Treebank is a binary single-sentence classification task consisting of sentences extracted from movie reviews with human annotations of their sentiment [(Socher](#page-10-14) [et al.,](#page-10-14) [2013)](#page-10-14). - -CoLA The Corpus of Linguistic Acceptability is a binary single-sentence classification task, where the goal is to predict whether an English sentence is linguistically "acceptable" or not [(Warstadt](#page-11-8) [et al.,](#page-11-8) [2018)](#page-11-8). - -STS-B The Semantic Textual Similarity Benchmark is a collection of sentence pairs drawn from news headlines and other sources [(Cer et al.,](#page-9-21) [2017)](#page-9-21). They were annotated with a score from 1 to 5 denoting how similar the two sentences are in terms of semantic meaning. - -MRPC Microsoft Research Paraphrase Corpus consists of sentence pairs automatically extracted from online news sources, with human annotations for whether the sentences in the pair are semantically equivalent [(Dolan and Brockett,](#page-9-3) [2005)](#page-9-3). - -RTE Recognizing Textual Entailment is a binary entailment task similar to MNLI, but with much less training data [(Bentivogli et al.,](#page-9-22) [2009)](#page-9-22).[14](#page-14-1) - -WNLI Winograd NLI is a small natural language inference dataset [(Levesque et al.,](#page-10-24) [2011)](#page-10-24). The GLUE webpage notes that there are issues with the construction of this dataset, [15](#page-14-2) and every trained system that's been submitted to GLUE has performed worse than the 65.1 baseline accuracy of predicting the majority class. We therefore exclude this set to be fair to OpenAI GPT. For our GLUE submission, we always predicted the ma- - -14Note that we only report single-task fine-tuning results in this paper. A multitask fine-tuning approach could potentially push the performance even further. For example, we did observe substantial improvements on RTE from multitask training with MNLI. - -15 - -jority class. - -### C Additional Ablation Studies - -#### C.1 Effect of Number of Training Steps - -Figure [5](#page-15-3) presents MNLI Dev accuracy after finetuning from a checkpoint that has been pre-trained for k steps. This allows us to answer the following questions: - -- 1. Question: Does BERT really need such a large amount of pre-training (128,000 words/batch * 1,000,000 steps) to achieve high fine-tuning accuracy? -Answer: Yes, BERTBASE achieves almost 1.0% additional accuracy on MNLI when trained on 1M steps compared to 500k steps. - -- 2. Question: Does MLM pre-training converge slower than LTR pre-training, since only 15% of words are predicted in each batch rather than every word? -Answer: The MLM model does converge slightly slower than the LTR model. However, in terms of absolute accuracy the MLM model begins to outperform the LTR model almost immediately. 
- -### C.2 Ablation for Different Masking Procedures - -In Section [3.1,](#page-3-2) we mention that BERT uses a mixed strategy for masking the target tokens when pre-training with the masked language model (MLM) objective. The following is an ablation study to evaluate the effect of different masking strategies. - -Figure 5: Ablation over number of training steps. This shows the MNLI accuracy after fine-tuning, starting from model parameters that have been pre-trained for k steps. The x-axis is the value of k. - -Note that the purpose of the masking strategies is to reduce the mismatch between pre-training and fine-tuning, as the [MASK] symbol never appears during the fine-tuning stage. We report the Dev results for both MNLI and NER. For NER, we report both fine-tuning and feature-based approaches, as we expect the mismatch will be amplified for the feature-based approach as the model will not have the chance to adjust the representations. - - - -| Masking Rates | | | Dev Set Results | | | | -|---------------|------|------|-------------------|-----------------------------------|------|--| -| MASK | SAME | RND | MNLI
-| MASK | SAME | RND | MNLI Fine-tune | NER Fine-tune | NER Feature-based | |
Feature-based | | | -| 80% | 10% | 10% | 84.2 | 95.4 | 94.9 | | -| 100% | 0% | 0% | 84.3 | 94.9 | 94.0 | | -| 80% | 0% | 20% | 84.1 | 95.2 | 94.6 | | -| 80% | 20% | 0% | 84.4 | 95.2 | 94.7 | | -| 0% | 20% | 80% | 83.7 | 94.8 | 94.6 | | -| 0% | 0% | 100% | 83.6 | 94.9 | 94.6 | | - -Table 8: Ablation over different masking strategies. - -The results are presented in Table [8.](#page-15-4) In the table, MASK means that we replace the target token with the [MASK] symbol for MLM; SAME means that we keep the target token as is; RND means that we replace the target token with another random token. - -The numbers in the left part of the table represent the probabilities of the specific strategies used during MLM pre-training (BERT uses 80%, 10%, 10%). The right part of the paper represents the Dev set results. For the feature-based approach, we concatenate the last 4 layers of BERT as the features, which was shown to be the best approach in Section [5.3.](#page-8-2) - -From the table it can be seen that fine-tuning is surprisingly robust to different masking strategies. However, as expected, using only the MASK strategy was problematic when applying the featurebased approach to NER. Interestingly, using only the RND strategy performs much worse than our strategy as well. diff --git a/examples/custom_output_files/files/rfc8259.md b/examples/custom_output_files/files/rfc8259.md deleted file mode 100644 index bf6c2941..00000000 --- a/examples/custom_output_files/files/rfc8259.md +++ /dev/null @@ -1,362 +0,0 @@ -Internet Engineering Task Force (IETF) T. Bray, Ed. Request for Comments: 8259 Textuality Obsoletes: 7159 December 2017 Category: Standards Track ISSN: 2070-1721 - -The JavaScript Object Notation (JSON) Data Interchange Format - -Abstract - - JavaScript Object Notation (JSON) is a lightweight, text-based, language-independent data interchange format. It was derived from the ECMAScript Programming Language Standard. JSON defines a small set of formatting rules for the portable representation of structured data. - - This document removes inconsistencies with other specifications of JSON, repairs specification errors, and offers experience-based interoperability guidance. - -Status of This Memo - -This is an Internet Standards Track document. - - This document is a product of the Internet Engineering Task Force (IETF). It represents the consensus of the IETF community. It has received public review and has been approved for publication by the Internet Engineering Steering Group (IESG). Further information on Internet Standards is available in Section 2 of RFC 7841. - - Information about the current status of this document, any errata, and how to provide feedback on it may be obtained at https://www.rfc-editor.org/info/rfc8259. - -Bray Standards Track [Page 1] - -Copyright Notice - - Copyright (c) 2017 IETF Trust and the persons identified as the document authors. All rights reserved. - - This document is subject to BCP 78 and the IETF Trust's Legal Provisions Relating to IETF Documents (https://trustee.ietf.org/license-info) in effect on the date of publication of this document. Please review these documents carefully, as they describe your rights and restrictions with respect to this document. Code Components extracted from this document must include Simplified BSD License text as described in Section 4.e of the Trust Legal Provisions and are provided without warranty as - -described in the Simplified BSD License. 
- - This document may contain material from IETF Documents or IETF Contributions published or made publicly available before November 10, 2008. The person(s) controlling the copyright in some of this material may not have granted the IETF Trust the right to allow modifications of such material outside the IETF Standards Process. Without obtaining an adequate license from the person(s) controlling the copyright in such materials, this document may not be modified outside the IETF Standards Process, and derivative works of it may not be created outside the IETF Standards Process, except to format it for publication as an RFC or to translate it into languages other than English. - -Bray Standards Track [Page 2] - -Table of Contents - -| 1.1. Conventions Used in This Document
-- 1. Introduction (page 3)
-- 1.1. Conventions Used in This Document (page 4)
-- 1.2. Specifications of JSON (page 4)
-- 1.3. Introduction to This Revision (page 5)
-- 2. JSON Grammar (page 5)
-- 3. Values (page 6)
-- 4. Objects (page 6)
-- 5. Arrays (page 7)
-- 6. Numbers (page 7)
-- 7. Strings (page 8)
-- 8. String and Character Issues (page 9)
-- 8.1. Character Encoding (page 9)
-- 8.2. Unicode Characters (page 10)
-- 8.3. String Comparison (page 10)
-- 9. Parsers (page 10)
-- 10. Generators (page 10)
-- 11. IANA Considerations (page 11)
-- 12. Security Considerations (page 12)
-- 13. Examples (page 12)
-- 14. References (page 14)
-- 14.1. Normative References (page 14)
-- 14.2. Informative References (page 14)
-- Appendix A. Changes from RFC 7159 (page 16)
-- Contributors (page 16)
-- Author's Address (page 16)
Contributors 16 | 1. Introduction | | 3 | -|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------|--|---| -| | | | | -| | | | | -| | | | | -| | | | | -| | | | | -| | | | | -| | | | | -| | | | | -| | | | | -| | | | | -| | | | | -| | | | | -| | | | | -| | | | | -| | | | | -| | | | | -| | | | | -| | | | | -| | | | | -| | | | | -| | | | | -| | | | | -| | | | | -| | Author's Address 16 | | | - -## 1. Introduction - - JavaScript Object Notation (JSON) is a text format for the serialization of structured data. It is derived from the object literals of JavaScript, as defined in the ECMAScript Programming Language Standard, Third Edition [ECMA-262]. - - JSON can represent four primitive types (strings, numbers, booleans, and null) and two structured types (objects and arrays). - - A string is a sequence of zero or more Unicode characters [UNICODE]. Note that this citation references the latest version of Unicode rather than a specific release. It is not expected that future changes in the Unicode specification will impact the syntax of JSON. - - An object is an unordered collection of zero or more name/value pairs, where a name is a string and a value is a string, number, boolean, null, object, or array. - -An array is an ordered sequence of zero or more values. - -Bray Standards Track [Page 3] - - The terms "object" and "array" come from the conventions of JavaScript. - - JSON's design goals were for it to be minimal, portable, textual, and a subset of JavaScript. - -1.1. Conventions Used in This Document - - The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in BCP 14 [RFC2119] [RFC8174] when, and only when, they appear in all capitals, as shown here. - - The grammatical rules in this document are to be interpreted as described in [RFC5234]. - -1.2. Specifications of JSON - - This document replaces [RFC7159]. [RFC7159] obsoleted [RFC4627], which originally described JSON and registered the media type "application/json". - -JSON is also described in [ECMA-404]. - - The reference to ECMA-404 in the previous sentence is normative, not with the usual meaning that implementors need to consult it in order to understand this document, but to emphasize that there are no inconsistencies in the definition of the term "JSON text" in any of its specifications. Note, however, that ECMA-404 allows several practices that this specification recommends avoiding in the interests of maximal interoperability. - - The intent is that the grammar is the same between the two documents, although different descriptions are used. If there is a difference found between them, ECMA and the IETF will work together to update both documents. 
- - If an error is found with either document, the other should be examined to see if it has a similar error; if it does, it should be fixed, if possible. - - If either document is changed in the future, ECMA and the IETF will work together to ensure that the two documents stay aligned through the change. - -Bray Standards Track [Page 4] - -- 1.3. Introduction to This Revision - In the years since the publication of RFC 4627, JSON has found very wide use. This experience has revealed certain patterns that, while allowed by its specifications, have caused interoperability problems. - - Also, a small number of errata have been reported regarding RFC 4627 (see RFC Errata IDs 607 [Err607] and 3607 [Err3607]) and regarding RFC 7159 (see RFC Errata IDs 3915 [Err3915], 4264 [Err4264], 4336 [Err4336], and 4388 [Err4388]). - - This document's goal is to apply the errata, remove inconsistencies with other specifications of JSON, and highlight practices that can lead to interoperability problems. - -- 2. JSON Grammar - A JSON text is a sequence of tokens. The set of tokens includes six structural characters, strings, numbers, and three literal names. - - A JSON text is a serialized value. Note that certain previous specifications of JSON constrained a JSON text to be an object or an array. Implementations that generate only objects or arrays where a JSON text is called for will be interoperable in the sense that all implementations will accept these as conforming JSON texts. - -JSON-text = ws value ws - -These are the six structural characters: - -| begin-array | | | | = ws %x5B ws ; [ left square bracket | -|----------------------------------------|--|--|--|---------------------------------------| -| begin-object | | | | = ws %x7B ws ; { left curly bracket | -| end-array | | | | = ws %x5D ws ; ] right square bracket | -| end-object | | | | = ws %x7D ws ; } right curly bracket | -| name-separator = ws %x3A ws ; : colon | | | | | -| value-separator = ws %x2C ws ; , comma | | | | | - -Bray Standards Track [Page 5] - - Insignificant whitespace is allowed before or after any of the six structural characters. - - ws = *( %x20 / ; Space %x09 / ; Horizontal tab %x0A / ; Line feed or New line %x0D ) ; Carriage return - -## 3. Values - - A JSON value MUST be an object, array, number, or string, or one of the following three literal names: - - false null true - - The literal names MUST be lowercase. No other literal names are allowed. - - value = false / null / true / object / array / number / string false = %x66.61.6c.73.65 ; false null = %x6e.75.6c.6c ; null true = %x74.72.75.65 ; true - -## 4. Objects - - An object structure is represented as a pair of curly brackets surrounding zero or more name/value pairs (or members). A name is a string. A single colon comes after each name, separating the name from the value. A single comma separates a value from a following name. The names within an object SHOULD be unique. - - object = begin-object [ member *( value-separator member ) ] end-object - -member = string name-separator value - - An object whose names are all unique is interoperable in the sense that all software implementations receiving that object will agree on the name-value mappings. When the names within an object are not unique, the behavior of software that receives such an object is unpredictable. Many implementations report the last name/value pair only. 
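As a concrete illustration of the "last name/value pair wins" behavior just mentioned, Python's standard json module is one such implementation (a convenience example, not part of this specification):

```python
import json

# Duplicate member names are accepted by this parser, which silently keeps
# only the last value bound to "a"; other parsers may error or keep all pairs.
print(json.loads('{"a": 1, "a": 2}'))   # {'a': 2}
```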
Other implementations report an error or fail to parse the - -Bray Standards Track [Page 6] - - object, and some implementations report all of the name/value pairs, including duplicates. - - JSON parsing libraries have been observed to differ as to whether or not they make the ordering of object members visible to calling software. Implementations whose behavior does not depend on member ordering will be interoperable in the sense that they will not be affected by these differences. - -5. Arrays - - An array structure is represented as square brackets surrounding zero or more values (or elements). Elements are separated by commas. - -array = begin-array [ value *( value-separator value ) ] end-array - - There is no requirement that the values in an array be of the same type. - -6. Numbers - - The representation of numbers is similar to that used in most programming languages. A number is represented in base 10 using decimal digits. It contains an integer component that may be prefixed with an optional minus sign, which may be followed by a fraction part and/or an exponent part. Leading zeros are not allowed. - -A fraction part is a decimal point followed by one or more digits. - - An exponent part begins with the letter E in uppercase or lowercase, which may be followed by a plus or minus sign. The E and optional sign are followed by one or more digits. - - Numeric values that cannot be represented in the grammar below (such as Infinity and NaN) are not permitted. - - number = [ minus ] int [ frac ] [ exp ] decimal-point = %x2E ; . digit1-9 = %x31-39 ; 1-9 e = %x65 / %x45 ; e E exp = e [ minus / plus ] 1*DIGIT frac = decimal-point 1*DIGIT - -Bray Standards Track [Page 7] - - int = zero / ( digit1-9 *DIGIT ) minus = %x2D ; plus = %x2B ; + zero = %x30 ; 0 - - This specification allows implementations to set limits on the range and precision of numbers accepted. Since software that implements IEEE 754 binary64 (double precision) numbers [IEEE754] is generally available and widely used, good interoperability can be achieved by implementations that expect no more precision or range than these provide, in the sense that implementations will approximate JSON numbers within the expected precision. A JSON number such as 1E400 or 3.141592653589793238462643383279 may indicate potential interoperability problems, since it suggests that the software that created it expects receiving software to have greater capabilities for numeric magnitude and precision than is widely available. - - Note that when such software is used, numbers that are integers and are in the range [-(2**53)+1, (2**53)-1] are interoperable in the sense that implementations will agree exactly on their numeric values. - -## 7. Strings - - The representation of strings is similar to conventions used in the C family of programming languages. A string begins and ends with quotation marks. All Unicode characters may be placed within the quotation marks, except for the characters that MUST be escaped: quotation mark, reverse solidus, and the control characters (U+0000 through U+001F). - - Any character may be escaped. If the character is in the Basic Multilingual Plane (U+0000 through U+FFFF), then it may be represented as a six-character sequence: a reverse solidus, followed by the lowercase letter u, followed by four hexadecimal digits that encode the character's code point. The hexadecimal letters A through F can be uppercase or lowercase. 
So, for example, a string containing only a single reverse solidus character may be represented as "\u005C". - - Alternatively, there are two-character sequence escape representations of some popular characters. So, for example, a string containing only a single reverse solidus character may be represented more compactly as "\\". - -Bray Standards Track [Page 8] - - To escape an extended character that is not in the Basic Multilingual Plane, the character is represented as a 12-character sequence, encoding the UTF-16 surrogate pair. So, for example, a string containing only the G clef character (U+1D11E) may be represented as "\uD834\uDD1E". - - string = quotation-mark *char quotation-mark char = unescaped / escape ( %x22 / ; " quotation mark U+0022 %x5C / ; \ reverse solidus U+005C %x2F / ; / solidus U+002F %x62 / ; b backspace U+0008 %x66 / ; f form feed U+000C %x6E / ; n line feed U+000A %x72 / ; r carriage return U+000D %x74 / ; t tab U+0009 %x75 4HEXDIG ) ; uXXXX U+XXXX escape = %x5C ; \ quotation-mark = %x22 ; " unescaped = %x20-21 / %x23-5B / %x5D-10FFFF - -# 8. String and Character Issues - -- 8.1. Character Encoding - JSON text exchanged between systems that are not part of a closed ecosystem MUST be encoded using UTF-8 [RFC3629]. - - Previous specifications of JSON have not required the use of UTF-8 when transmitting JSON text. However, the vast majority of JSON based software implementations have chosen to use the UTF-8 encoding, to the extent that it is the only encoding that achieves interoperability. - - Implementations MUST NOT add a byte order mark (U+FEFF) to the beginning of a networked-transmitted JSON text. In the interests of interoperability, implementations that parse JSON texts MAY ignore the presence of a byte order mark rather than treating it as an error. - -Bray Standards Track [Page 9] - -## 8.2. Unicode Characters - - When all the strings represented in a JSON text are composed entirely of Unicode characters [UNICODE] (however escaped), then that JSON text is interoperable in the sense that all software implementations that parse it will agree on the contents of names and of string values in objects and arrays. - - However, the ABNF in this specification allows member names and string values to contain bit sequences that cannot encode Unicode characters; for example, "\uDEAD" (a single unpaired UTF-16 surrogate). Instances of this have been observed, for example, when a library truncates a UTF-16 string without checking whether the truncation split a surrogate pair. The behavior of software that receives JSON texts containing such values is unpredictable; for example, implementations might return different values for the length of a string value or even suffer fatal runtime exceptions. - -## 8.3. String Comparison - - Software implementations are typically required to test names of object members for equality. Implementations that transform the textual representation into sequences of Unicode code units and then perform the comparison numerically, code unit by code unit, are interoperable in the sense that implementations will agree in all cases on equality or inequality of two strings. For example, implementations that compare strings with escaped characters unconverted may incorrectly find that "a\\b" and "a\u005Cb" are not equal. - -## 9. Parsers - - A JSON parser transforms a JSON text into another representation. A JSON parser MUST accept all texts that conform to the JSON grammar. A JSON parser MAY accept non-JSON forms or extensions. 
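A small check of the escaping and comparison rules from Sections 7 and 8, again using Python's json module purely as an illustration:

```python
import json

# "\u005C" and "\\" both denote a single reverse solidus once parsed.
assert json.loads('"\\u005C"') == json.loads('"\\\\"') == "\\"

# "a\\b" and "a\u005Cb" differ textually but are equal after unescaping,
# which is why Section 8.3 recommends comparing parsed code units.
assert json.loads('"a\\\\b"') == json.loads('"a\\u005Cb"')

# The G clef character (U+1D11E) is written as a UTF-16 surrogate pair
# and, per Section 8.1, is exchanged on the wire as UTF-8.
assert json.loads('"\\uD834\\uDD1E"') == "\U0001D11E"
print("\U0001D11E".encode("utf-8"))   # b'\xf0\x9d\x84\x9e'
```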
- - An implementation may set limits on the size of texts that it accepts. An implementation may set limits on the maximum depth of nesting. An implementation may set limits on the range and precision of numbers. An implementation may set limits on the length and character contents of strings. - -## 10. Generators - - A JSON generator produces JSON text. The resulting text MUST strictly conform to the JSON grammar. - -Bray Standards Track [Page 10] - -11. IANA Considerations - -The media type for JSON text is application/json. - -Type name: application - -Subtype name: json - -Required parameters: n/a - -Optional parameters: n/a - -Encoding considerations: binary - -Security considerations: See RFC 8259, Section 12 - -Interoperability considerations: Described in RFC 8259 - -Published specification: RFC 8259 - - Applications that use this media type: JSON has been used to exchange data between applications written in all of these programming languages: ActionScript, C, C#, Clojure, ColdFusion, Common Lisp, E, Erlang, Go, Java, JavaScript, Lua, Objective CAML, Perl, PHP, Python, Rebol, Ruby, Scala, and Scheme. - - Additional information: Magic number(s): n/a File extension(s): .json Macintosh file type code(s): TEXT - - Person & email address to contact for further information: IESG - -Intended usage: COMMON - -Restrictions on usage: none - - Author: Douglas Crockford - - Change controller: IESG - -Bray Standards Track [Page 11] - - Note: No "charset" parameter is defined for this registration. Adding one really has no effect on compliant recipients. - -- 12. Security Considerations - Generally, there are security issues with scripting languages. JSON is a subset of JavaScript but excludes assignment and invocation. - - Since JSON's syntax is borrowed from JavaScript, it is possible to use that language's "eval()" function to parse most JSON texts (but not all; certain characters such as U+2028 LINE SEPARATOR and U+2029 PARAGRAPH SEPARATOR are legal in JSON but not JavaScript). This generally constitutes an unacceptable security risk, since the text could contain executable code along with data declarations. The same consideration applies to the use of eval()-like functions in any other programming language in which JSON texts conform to that language's syntax. - -## 13. Examples - - This is a JSON object: { "Image": { "Width": 800, "Height": 600, "Title": "View from 15th Floor", "Thumbnail": { "Url": "http://www.example.com/image/481989943", "Height": 125, "Width": 100 }, "Animated" : false, "IDs": [116, 943, 234, 38793] } } - - Its Image member is an object whose Thumbnail member is an object and whose IDs member is an array of numbers. - -Bray Standards Track [Page 12] - -``` - This is a JSON array containing two objects: -[ -{ -"precision": "zip", -"Latitude": 37.7668, -"Longitude": -122.3959, -"Address": "", -"City": "SAN FRANCISCO", -"State": "CA", -"Zip": "94107", -"Country": "US" -}, -{ -"precision": "zip", -"Latitude": 37.371991, -"Longitude": -122.026020, -"Address": "", -"City": "SUNNYVALE", -"State": "CA", -"Zip": "94085", -"Country": "US" -} -] -Here are three small JSON texts containing only values: -"Hello world!" -42 -true -``` -Bray Standards Track [Page 13] - -## 14. References - -- 14.1. Normative References -- [ECMA-404] Ecma International, "The JSON Data Interchange Format", Standard ECMA-404, . -- [IEEE754] IEEE, "IEEE Standard for Floating-Point Arithmetic", IEEE 754. 
-- [RFC2119] Bradner, S., "Key words for use in RFCs to Indicate Requirement Levels", BCP 14, RFC 2119, DOI 10.17487/RFC2119, March 1997, . -- [RFC3629] Yergeau, F., "UTF-8, a transformation format of ISO 10646", STD 63, RFC 3629, DOI 10.17487/RFC3629, November 2003, . -- [RFC5234] Crocker, D., Ed. and P. Overell, "Augmented BNF for Syntax Specifications: ABNF", STD 68, RFC 5234, DOI 10.17487/RFC5234, January 2008, . -- [RFC8174] Leiba, B., "Ambiguity of Uppercase vs Lowercase in RFC 2119 Key Words", BCP 14, RFC 8174, DOI 10.17487/RFC8174, May 2017, . -- [UNICODE] The Unicode Consortium, "The Unicode Standard", . -- 14.2. Informative References -- [ECMA-262] Ecma International, "ECMAScript Language Specification", Standard ECMA-262, Third Edition, December 1999, . -- [Err3607] RFC Errata, Erratum ID 3607, RFC 4627, . -- [Err3915] RFC Errata, Erratum ID 3915, RFC 7159, . - -Bray Standards Track [Page 14] - -- [Err4264] RFC Errata, Erratum ID 4264, RFC 7159, . -- [Err4336] RFC Errata, Erratum ID 4336, RFC 7159, . -- [Err4388] RFC Errata, Erratum ID 4388, RFC 7159, . -- [Err607] RFC Errata, Erratum ID 607, RFC 4627, . -- [RFC4627] Crockford, D., "The application/json Media Type for JavaScript Object Notation (JSON)", RFC 4627, DOI 10.17487/RFC4627, July 2006, . -- [RFC7159] Bray, T., Ed., "The JavaScript Object Notation (JSON) Data Interchange Format", RFC 7159, DOI 10.17487/RFC7159, March 2014, . - -Bray Standards Track [Page 15] - -Appendix A. Changes from RFC 7159 - - This section lists changes between this document and the text in RFC 7159. - -- o Section 1.2 has been updated to reflect the removal of a JSON specification from ECMA-262, to make ECMA-404 a normative reference, and to explain the particular meaning of "normative". -- o Section 1.3 has been updated to reflect errata filed against RFC 7159, not RFC 4627. -- o Section 8.1 was changed to require the use of UTF-8 when transmitted over a network. -- o Section 12 has been updated to increase the precision of the description of the security risk that follows from using the ECMAScript "eval()" function. -- o Section 14.1 has been updated to include ECMA-404 as a normative reference. -- o Section 14.2 has been updated to remove ECMA-404, update the version of ECMA-262, and refresh the errata list. - -Contributors - - RFC 4627 was written by Douglas Crockford. This document was constructed by making a relatively small number of changes to that document; thus, the vast majority of the text here is his. 
## Author's Address

Tim Bray (editor)
Textuality

Email: tbray@textuality.com

diff --git a/examples/custom_output_files/main.py b/examples/custom_output_files/main.py
index 6fb55f39..b25c29e0 100644
--- a/examples/custom_output_files/main.py
+++ b/examples/custom_output_files/main.py
@@ -1,9 +1,10 @@
-import cocoindex
-from markdown_it import MarkdownIt
 from datetime import timedelta
 import os
 import dataclasses
+import cocoindex
+from markdown_it import MarkdownIt
+
 _markdown_it = MarkdownIt("gfm-like")
@@ -20,19 +21,29 @@ class LocalFileTargetValues:
 class LocalFileTargetConnector:
     @staticmethod
     def get_persistent_key(spec: LocalFileTarget, target_name: str) -> str:
+        """Use the directory path as the persistent key for this target."""
         return spec.directory
 
     @staticmethod
     def describe(key: str) -> str:
+        """Return a human-readable description of the target."""
         return f"Local directory {key}"
 
     @staticmethod
     def apply_setup_change(
         key: str, previous: LocalFileTarget | None, current: LocalFileTarget | None
     ) -> None:
+        """
+        Apply setup changes to the target.
+
+        Best practice: keep all actions idempotent.
+        """
+
+        # Create the directory if it doesn't exist yet.
         if previous is None and current is not None:
             os.makedirs(current.directory, exist_ok=True)
 
+        # Delete the directory with its contents if the target no longer exists.
         if previous is not None and current is None:
             if os.path.isdir(previous.directory):
                 for filename in os.listdir(previous.directory):
@@ -83,10 +94,10 @@ def custom_output_files(
     flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
 ) -> None:
     """
-    Define an example flow that embeds text into a vector database.
+    Define an example flow that exports markdown files to HTML files.
     """
     data_scope["documents"] = flow_builder.add_source(
-        cocoindex.sources.LocalFile(path="files", included_patterns=["*.md"]),
+        cocoindex.sources.LocalFile(path="data", included_patterns=["*.md"]),
         refresh_interval=timedelta(seconds=5),
     )

From 9c94f4c708990568e85413eec47ad79772c481c9 Mon Sep 17 00:00:00 2001
From: Jiangzhou He
Date: Sun, 27 Jul 2025 14:08:24 -0700
Subject: [PATCH 8/9] docs: update `README` and `pyproject.toml` for
 `custom_output_files`

---
 README.md                                   |  1 +
 examples/custom_output_files/README.md      | 29 +++++++--------------
 examples/custom_output_files/pyproject.toml |  8 ++----
 3 files changed, 13 insertions(+), 25 deletions(-)

diff --git a/README.md b/README.md
index 69451a05..bdb18464 100644
--- a/README.md
+++ b/README.md
@@ -185,6 +185,7 @@ It defines an index flow like this:
 | [Image Search with Vision API](examples/image_search) | Generates detailed captions for images using a vision model, embeds them, enables live-updating semantic search via FastAPI and served on a React frontend|
 | [Face Recognition](examples/face_recognition) | Recognize faces in images and build embedding index |
 | [Paper Metadata](examples/paper_metadata) | Index papers in PDF files, and build metadata tables for each paper |
+| [Custom Output Files](examples/custom_output_files) | Convert markdown files to HTML files and save them to a local directory, using *CocoIndex Custom Targets* |
 
 More coming and stay tuned 👀!
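The `apply_setup_change` hunk above depends on every setup action being idempotent, so a retried or replayed change cannot fail. Below is a minimal standalone sketch of that pattern, not part of the patch: the helper name `sync_directory_state` is invented for illustration, and `shutil.rmtree` stands in for the patch's file-by-file cleanup.

```python
import os
import shutil


def sync_directory_state(previous_dir: str | None, current_dir: str | None) -> None:
    """Bring a local directory in line with the desired target state.

    Both branches are safe to re-run: creating an existing directory and
    removing a missing one are no-ops, so a retried setup change cannot fail.
    """
    if previous_dir is None and current_dir is not None:
        # Create the directory; exist_ok makes repeated runs harmless.
        os.makedirs(current_dir, exist_ok=True)
    elif previous_dir is not None and current_dir is None:
        # Remove the directory and its contents; ignore_errors covers the case
        # where an earlier (interrupted) run already deleted it.
        shutil.rmtree(previous_dir, ignore_errors=True)


# Calling it twice with the same arguments leaves the filesystem unchanged,
# which is exactly the property the connector's docstring asks for.
sync_directory_state(None, "output_html")
sync_directory_state(None, "output_html")
```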
diff --git a/examples/custom_output_files/README.md b/examples/custom_output_files/README.md
index dcdf6e0c..7d1df94f 100644
--- a/examples/custom_output_files/README.md
+++ b/examples/custom_output_files/README.md
@@ -2,23 +2,17 @@
 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/cocoindex-io/cocoindex/blob/main/examples/text_embedding/Text_Embedding.ipynb)
 [![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex)
 
-In this example, we will build index flow from text embedding from local markdown files, and query the index.
+In this example, we will build an index flow that loads markdown files from a local directory, converts them to HTML, and saves the results to another local directory, powered by [CocoIndex Custom Targets](https://cocoindex.io/docs/custom_ops/custom_targets).
 
 We appreciate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/cocoindex) if this is helpful.
 
 ## Steps
-đŸŒ± A detailed step by step tutorial can be found here: [Get Started Documentation](https://cocoindex.io/docs/getting_started/quickstart)
 
 ### Indexing Flow
-Screenshot 2025-05-19 at 5 48 28 PM
-
-1. We will ingest a list of local files.
-2. For each file, perform chunking (recursively split) and then embedding.
-3. We will save the embeddings and the metadata in Postgres with PGVector.
-
-### Query
-We will match against user-provided text by a SQL query, and reuse the embedding operation in the indexing flow.
+1. We ingest a list of local markdown files from the `data/` directory.
+2. For each file, we convert it to HTML using [markdown-it-py](https://markdown-it-py.readthedocs.io/).
+3. We save the resulting HTML files to the local directory `output_html/`.
 
 
 ## Prerequisite
@@ -32,22 +26,19 @@ Install dependencies:
 pip install -e .
 ```
 
-Setup:
+Update the target:
 
 ```bash
-cocoindex setup main.py
+cocoindex update --setup main.py
 ```
 
-Update index:
-
-```bash
-cocoindex update main.py
-```
+You can add new files to the `data/` directory, or delete or update existing files.
+Each time you run the `update` command, cocoindex only re-processes the files that have changed and keeps the target in sync with the source.
 
-Run:
+You can also run the `update` command in live mode, which keeps the target in sync with the source in real time:
 
 ```bash
-python main.py
+cocoindex update --setup -L main.py
 ```
 
 ## CocoInsight
diff --git a/examples/custom_output_files/pyproject.toml b/examples/custom_output_files/pyproject.toml
index 0ecfa9c5..939389f4 100644
--- a/examples/custom_output_files/pyproject.toml
+++ b/examples/custom_output_files/pyproject.toml
@@ -1,13 +1,9 @@
 [project]
 name = "custom-output-files"
 version = "0.1.0"
-description = "Simple example for cocoindex: build embedding index based on local text files."
+description = "Simple example for cocoindex: convert markdown files to HTML files and save them to a local directory."
 requires-python = ">=3.11"
-dependencies = [
-    "cocoindex>=0.1.67",
-    "markdown",
-    "markdown-it-py[linkify,plugins]",
-]
+dependencies = ["cocoindex>=0.1.74", "markdown-it-py[linkify,plugins]"]
 
 [tool.setuptools]
 packages = []

From ccb61417cb042cdb79691cdffd7652973e7004ae Mon Sep 17 00:00:00 2001
From: Jiangzhou He
Date: Sun, 27 Jul 2025 14:13:07 -0700
Subject: [PATCH 9/9] example: update comments

---
 examples/custom_output_files/main.py | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/examples/custom_output_files/main.py b/examples/custom_output_files/main.py
index b25c29e0..5bbfa83d 100644
--- a/examples/custom_output_files/main.py
+++ b/examples/custom_output_files/main.py
@@ -9,11 +9,16 @@
 
 
 class LocalFileTarget(cocoindex.op.TargetSpec):
+    """Represents the custom target spec."""
+
+    # The directory to save the HTML files.
     directory: str
 
 
 @dataclasses.dataclass
 class LocalFileTargetValues:
+    """Represents value fields of exported data. Used in the `mutate` method below."""
+
     html: str
 
 
@@ -26,7 +31,7 @@ def get_persistent_key(spec: LocalFileTarget, target_name: str) -> str:
 
     @staticmethod
     def describe(key: str) -> str:
-        """Return a human-readable description of the target."""
+        """(Optional) Return a human-readable description of the target."""
         return f"Local directory {key}"
 
     @staticmethod
@@ -54,10 +59,10 @@ def apply_setup_change(
     @staticmethod
     def prepare(spec: LocalFileTarget) -> LocalFileTarget:
         """
-        Prepare for execution. To run common operations before applying any mutations.
+        (Optional) Prepare for execution. Run common operations before applying any mutations.
 
         The returned value will be passed as the first element of tuples in `mutate` method.
-        This is optional. If not provided, will directly pass the spec to `mutate` method.
+        If not provided, the spec will be passed directly to the `mutate` method.
         """
         return spec
 
@@ -69,16 +74,21 @@ def mutate(
         Mutate the target.
 
         The first element of the tuple is the target spec.
-        The second element is a dictionary of mutations.
-        The key is the filename, and the value is the mutation.
-        If the value is `None`, the file will be removed.
-        Otherwise, the file will be written with the content.
+        The second element is a dictionary of mutations:
+        - The key is the filename, and the value is the mutation.
+        - If the value is `None`, the file will be removed.
+          Otherwise, the file will be written with the content.
+
+        Best practice: keep all actions idempotent.
         """
         for spec, mutations in all_mutations:
             for filename, mutation in mutations.items():
                 full_path = os.path.join(spec.directory, filename) + ".html"
                 if mutation is None:
-                    os.remove(full_path)
+                    try:
+                        os.remove(full_path)
+                    except FileNotFoundError:
+                        pass
                 else:
                     with open(full_path, "w") as f:
                         f.write(mutation.html)
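To make the `mutate` contract above concrete, here is a small self-contained sketch, not part of the patch, that renders markdown with markdown-it-py and applies a toy mutations dictionary the same way the connector does. The names `Values` and `apply_mutations`, the sample filenames, and the `output_html` directory are assumptions for illustration; in the real flow, cocoindex computes the mutations and invokes the connector for you.

```python
from dataclasses import dataclass
import os

from markdown_it import MarkdownIt

# Same preset as the example flow; "gfm-like" needs the linkify extra,
# which the markdown-it-py[linkify,plugins] dependency provides.
md = MarkdownIt("gfm-like")


@dataclass
class Values:
    """Stand-in for the value fields of one exported row (here, rendered HTML)."""

    html: str


def apply_mutations(directory: str, mutations: dict[str, Values | None]) -> None:
    """Write or delete <filename>.html files, mirroring the mutate contract above."""
    os.makedirs(directory, exist_ok=True)
    for filename, value in mutations.items():
        full_path = os.path.join(directory, filename) + ".html"
        if value is None:
            # Deleting an already-missing file keeps the call idempotent.
            try:
                os.remove(full_path)
            except FileNotFoundError:
                pass
        else:
            with open(full_path, "w") as f:
                f.write(value.html)


# Toy run: upsert one document, delete another.
apply_mutations(
    "output_html",
    {
        "hello": Values(html=md.render("# Hello\n\nThis becomes **HTML**.")),
        "stale-doc": None,
    },
)
```

Running the toy example twice produces the same `output_html/hello.html` and silently skips the already-absent `stale-doc.html`, which is the idempotency the docstring recommends.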