diff --git a/README.md b/README.md
index 07f47f91b0..485abde66c 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,7 @@ modeling and other text generation tasks.
 
 ### What's New:
 
+- November 2019: [CamemBERT model and code released](examples/camembert/README.md)
 - November 2019: [BART model and code released](examples/bart/README.md)
 - November 2019: [XLM-R models and code released](examples/xlmr/README.md)
 - September 2019: [Nonautoregressive translation code released](examples/nonautoregressive_translation/README.md)
diff --git a/examples/camembert/README.md b/examples/camembert/README.md
new file mode 100644
index 0000000000..69cc11b1aa
--- /dev/null
+++ b/examples/camembert/README.md
@@ -0,0 +1,56 @@
+# CamemBERT: a French BERT
+
+## Introduction
+
+CamemBERT is a pretrained language model based on RoBERTa, trained on 138GB of French text.
+
+## Pre-trained models
+
+Model | #params | vocab size | Download
+---|---|---|---
+`CamemBERT` | 110M | 32k | [camembert.v0.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/camembert.v0.tar.gz)
+
+## Example usage
+
+##### Load CamemBERT from torch.hub (PyTorch >= 1.1):
+```python
+import torch
+camembert = torch.hub.load('pytorch/fairseq', 'camembert.v0')
+camembert.eval()  # disable dropout (or leave in train mode to finetune)
+```
+
+##### Load CamemBERT (for PyTorch 1.0 or custom models):
+```bash
+# Download the camembert model
+wget https://dl.fbaipublicfiles.com/fairseq/models/camembert.v0.tar.gz
+tar -xzvf camembert.v0.tar.gz
+```
+```python
+# Load the model in fairseq
+from fairseq.models.roberta import CamembertModel
+camembert = CamembertModel.from_pretrained('/path/to/camembert.v0')
+camembert.eval()  # disable dropout (or leave in train mode to finetune)
+```
+
+##### Filling masks:
+```python
+masked_line = 'Le camembert est <mask> :)'
+camembert.fill_mask(masked_line, topk=3)
+# [('Le camembert est délicieux :)', 0.4909118115901947, ' délicieux'),
+#  ('Le camembert est excellent :)', 0.10556942224502563, ' excellent'),
+#  ('Le camembert est succulent :)', 0.03453322499990463, ' succulent')]
+```
+
+##### Extract features from CamemBERT:
+```python
+# Extract the last layer's features
+line = "J'aime le camembert!"
+tokens = camembert.encode(line)
+last_layer_features = camembert.extract_features(tokens)
+assert last_layer_features.size() == torch.Size([1, 10, 768])
+
+# Extract all layers' features (layer 0 is the embedding layer)
+all_layers = camembert.extract_features(tokens, return_all_hiddens=True)
+assert len(all_layers) == 13
+assert torch.all(all_layers[-1] == last_layer_features)
+```
diff --git a/examples/roberta/README.md b/examples/roberta/README.md
index 15844d3e46..0c603f2002 100644
--- a/examples/roberta/README.md
+++ b/examples/roberta/README.md
@@ -8,7 +8,8 @@ RoBERTa iterates on BERT's pretraining procedure, including training the model l
 
 ### What's New:
 
-- November 2019: Multilingual encoder (XLM-RoBERTa) is available [XLM-R](https://github.com/pytorch/fairseq/examples/xlmr).
+- November 2019: French model (CamemBERT) is available: [CamemBERT](https://github.com/pytorch/fairseq/tree/master/examples/camembert).
+- November 2019: Multilingual encoder (XLM-RoBERTa) is available: [XLM-R](https://github.com/pytorch/fairseq/tree/master/examples/xlmr).
 - September 2019: TensorFlow and TPU support via the [transformers library](https://github.com/huggingface/transformers).
 - August 2019: RoBERTa is now supported in the [pytorch-transformers library](https://github.com/huggingface/pytorch-transformers).
 - August 2019: Added [tutorial for finetuning on WinoGrande](https://github.com/pytorch/fairseq/tree/master/examples/roberta/wsc#roberta-training-on-winogrande-dataset).
diff --git a/fairseq/models/roberta/hub_interface.py b/fairseq/models/roberta/hub_interface.py
index 279aba52dd..baf0bf28b9 100644
--- a/fairseq/models/roberta/hub_interface.py
+++ b/fairseq/models/roberta/hub_interface.py
@@ -58,7 +58,7 @@ def encode(self, sentence: str, *addl_sentences, no_separator=False) -> torch.Lo
         for s in addl_sentences:
             bpe_sentence += (' </s>' if not no_separator else '')
             bpe_sentence += ' ' + self.bpe.encode(s) + ' </s>'
-        tokens = self.task.source_dictionary.encode_line(bpe_sentence, append_eos=False)
+        tokens = self.task.source_dictionary.encode_line(bpe_sentence, append_eos=False, add_if_not_exist=False)
         return tokens.long()
 
     def decode(self, tokens: torch.LongTensor):
@@ -146,8 +146,9 @@ def fill_mask(self, masked_input: str, topk: int = 5):
             [self.bpe.encode(text_span.rstrip()) for text_span in text_spans]
         ).strip()
         tokens = self.task.source_dictionary.encode_line(
-            '<s> ' + text_spans_bpe,
-            append_eos=True,
+            '<s> ' + text_spans_bpe + ' </s>',
+            append_eos=False,
+            add_if_not_exist=False,
         )
 
         masked_index = (tokens == self.task.mask_idx).nonzero()
@@ -168,6 +169,9 @@ def fill_mask(self, masked_input: str, topk: int = 5):
         topk_filled_outputs = []
         for index, predicted_token_bpe in enumerate(topk_predicted_token_bpe.split(' ')):
             predicted_token = self.bpe.decode(predicted_token_bpe)
+            # Quick hack to fix https://github.com/pytorch/fairseq/issues/1306
+            if predicted_token_bpe.startswith('\u2581'):
+                predicted_token = ' ' + predicted_token
             if " {0}".format(masked_token) in masked_input:
                 topk_filled_outputs.append((
                     masked_input.replace(
diff --git a/fairseq/models/roberta/model.py b/fairseq/models/roberta/model.py
index 0ce6059328..dd92016af7 100644
--- a/fairseq/models/roberta/model.py
+++ b/fairseq/models/roberta/model.py
@@ -218,6 +218,29 @@ def from_pretrained(cls, model_name_or_path, checkpoint_file='model.pt', data_na
         return RobertaHubInterface(x['args'], x['task'], x['models'][0])
 
 
+@register_model('camembert')
+class CamembertModel(RobertaModel):
+    @classmethod
+    def hub_models(cls):
+        return {
+            'camembert.v0': 'https://dl.fbaipublicfiles.com/fairseq/models/camembert.v0.tar.gz',
+        }
+
+    @classmethod
+    def from_pretrained(cls, model_name_or_path, checkpoint_file='model.pt', data_name_or_path='.', bpe='sentencepiece', **kwargs):
+        from fairseq import hub_utils
+        x = hub_utils.from_pretrained(
+            model_name_or_path,
+            checkpoint_file,
+            data_name_or_path,
+            archive_map=cls.hub_models(),
+            bpe=bpe,
+            load_checkpoint_heads=True,
+            **kwargs,
+        )
+        return RobertaHubInterface(x['args'], x['task'], x['models'][0])
+
+
 class RobertaLMHead(nn.Module):
     """Head for masked language modeling."""
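A note on the two recurring changes in the hunks above. First, fairseq's `Dictionary.encode_line` defaults to `add_if_not_exist=True`, which grows the dictionary in place whenever it sees an unseen symbol rather than mapping it to `<unk>`; passing `add_if_not_exist=False` keeps the pretrained vocabulary fixed at inference time. Below is a minimal sketch of the difference, assuming only fairseq's public `Dictionary` API (the input string is made up for illustration):

```python
from fairseq.data import Dictionary

d = Dictionary()   # fresh dictionary containing only the special symbols
base_size = len(d)

# Default behavior: encode_line() *adds* unseen symbols to the dictionary,
# silently mutating the vocabulary.
d.encode_line('totally novel symbols', append_eos=False)
assert len(d) > base_size

# With add_if_not_exist=False the dictionary is left untouched and every
# out-of-vocabulary symbol maps to the <unk> index instead.
d2 = Dictionary()
tokens = d2.encode_line('totally novel symbols', append_eos=False,
                        add_if_not_exist=False)
assert len(d2) == base_size
assert all(t == d2.unk() for t in tokens.tolist())
```

Second, the `'\u2581'` check in `fill_mask` exists because sentencepiece (CamemBERT's BPE) marks a word-initial boundary with U+2581 ('▁'); decoding a single predicted subword in isolation drops that marker, so the leading space has to be restored by hand before splicing the prediction back into the input.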