<a href="https://colab.research.google.com/github/gmihaila/ml_things/blob/master/playground.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bert Models

In [None]:
import io

table = r"""+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| Architecture       | Shortcut name                                              | Details of the model                                                                                                                  |
+====================+============================================================+=======================================================================================================================================+
| BERT               | ``bert-base-uncased``                                      | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
|                    |                                                            | | Trained on lower-cased English text.                                                                                                |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``bert-large-uncased``                                     | | 24-layer, 1024-hidden, 16-heads, 336M parameters.                                                                                   |
|                    |                                                            | | Trained on lower-cased English text.                                                                                                |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``bert-base-cased``                                        | | 12-layer, 768-hidden, 12-heads, 109M parameters.                                                                                    |
|                    |                                                            | | Trained on cased English text.                                                                                                      |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``bert-large-cased``                                       | | 24-layer, 1024-hidden, 16-heads, 335M parameters.                                                                                   |
|                    |                                                            | | Trained on cased English text.                                                                                                      |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``bert-base-multilingual-uncased``                         | | (Original, not recommended) 12-layer, 768-hidden, 12-heads, 168M parameters.                                                        |
|                    |                                                            | | Trained on lower-cased text in the top 102 languages with the largest Wikipedias                                                    |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/google-research/bert/blob/master/multilingual.md>`__).                                              |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``bert-base-multilingual-cased``                           | | (New, **recommended**) 12-layer, 768-hidden, 12-heads, 179M parameters.                                                             |
|                    |                                                            | | Trained on cased text in the top 104 languages with the largest Wikipedias                                                          |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/google-research/bert/blob/master/multilingual.md>`__).                                              |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``bert-base-chinese``                                      | | 12-layer, 768-hidden, 12-heads, 103M parameters.                                                                                    |
|                    |                                                            | | Trained on cased Chinese Simplified and Traditional text.                                                                           |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``bert-base-german-cased``                                 | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
|                    |                                                            | | Trained on cased German text by Deepset.ai                                                                                          |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details on deepset.ai website <https://deepset.ai/german-bert>`__).                                                             |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``bert-large-uncased-whole-word-masking``                  | | 24-layer, 1024-hidden, 16-heads, 336M parameters.                                                                                   |
|                    |                                                            | | Trained on lower-cased English text using Whole-Word-Masking                                                                        |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/google-research/bert/#bert>`__).                                                                    |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``bert-large-cased-whole-word-masking``                    | | 24-layer, 1024-hidden, 16-heads, 335M parameters.                                                                                   |
|                    |                                                            | | Trained on cased English text using Whole-Word-Masking                                                                              |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/google-research/bert/#bert>`__).                                                                    |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``bert-large-uncased-whole-word-masking-finetuned-squad``  | | 24-layer, 1024-hidden, 16-heads, 336M parameters.                                                                                   |
|                    |                                                            | | The ``bert-large-uncased-whole-word-masking`` model fine-tuned on SQuAD                                                             |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see details of fine-tuning in the `example section <https://github.com/huggingface/transformers/tree/master/examples>`__).           |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``bert-large-cased-whole-word-masking-finetuned-squad``    | | 24-layer, 1024-hidden, 16-heads, 335M parameters                                                                                    |
|                    |                                                            | | The ``bert-large-cased-whole-word-masking`` model fine-tuned on SQuAD                                                               |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/transformers/examples.html>`__)                           |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``bert-base-cased-finetuned-mrpc``                         | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
|                    |                                                            | | The ``bert-base-cased`` model fine-tuned on MRPC                                                                                    |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/transformers/examples.html>`__)                           |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``bert-base-german-dbmdz-cased``                           | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
|                    |                                                            | | Trained on cased German text by DBMDZ                                                                                               |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details on dbmdz repository <https://github.com/dbmdz/german-bert>`__).                                                         |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``bert-base-german-dbmdz-uncased``                         | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
|                    |                                                            | | Trained on uncased German text by DBMDZ                                                                                             |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details on dbmdz repository <https://github.com/dbmdz/german-bert>`__).                                                         |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``cl-tohoku/bert-base-japanese``                           | | 12-layer, 768-hidden, 12-heads, 111M parameters.                                                                                    |
|                    |                                                            | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece and this requires some extra dependencies,                     |
|                    |                                                            | | `fugashi <https://github.com/polm/fugashi>`__ which is a wrapper around `MeCab <https://taku910.github.io/mecab/>`__.               |
|                    |                                                            | | Use ``pip install transformers["ja"]`` (or ``pip install -e .["ja"]`` if you install from source) to install them.                  |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``cl-tohoku/bert-base-japanese-whole-word-masking``        | | 12-layer, 768-hidden, 12-heads, 111M parameters.                                                                                    |
|                    |                                                            | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece and this requires some extra dependencies,                     |
|                    |                                                            | | `fugashi <https://github.com/polm/fugashi>`__ which is a wrapper around `MeCab <https://taku910.github.io/mecab/>`__.               |
|                    |                                                            | | Use ``pip install transformers["ja"]`` (or ``pip install -e .["ja"]`` if you install from source) to install them.                  |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``cl-tohoku/bert-base-japanese-char``                      | | 12-layer, 768-hidden, 12-heads, 90M parameters.                                                                                     |
|                    |                                                            | | Trained on Japanese text. Text is tokenized into characters.                                                                        |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``cl-tohoku/bert-base-japanese-char-whole-word-masking``   | | 12-layer, 768-hidden, 12-heads, 90M parameters.                                                                                     |
|                    |                                                            | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized into characters.                                               |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``TurkuNLP/bert-base-finnish-cased-v1``                    | | 12-layer, 768-hidden, 12-heads, 125M parameters.                                                                                    |
|                    |                                                            | | Trained on cased Finnish text.                                                                                                      |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details on turkunlp.org <http://turkunlp.org/FinBERT/>`__).                                                                     |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``TurkuNLP/bert-base-finnish-uncased-v1``                  | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
|                    |                                                            | | Trained on uncased Finnish text.                                                                                                    |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details on turkunlp.org <http://turkunlp.org/FinBERT/>`__).                                                                     |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``wietsedv/bert-base-dutch-cased``                         | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
|                    |                                                            | | Trained on cased Dutch text.                                                                                                        |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details on wietsedv repository <https://github.com/wietsedv/bertje/>`__).                                                       |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| GPT                | ``openai-gpt``                                             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
|                    |                                                            | | OpenAI GPT English model                                                                                                            |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| GPT-2              | ``gpt2``                                                   | | 12-layer, 768-hidden, 12-heads, 117M parameters.                                                                                    |
|                    |                                                            | | OpenAI GPT-2 English model                                                                                                          |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``gpt2-medium``                                            | | 24-layer, 1024-hidden, 16-heads, 345M parameters.                                                                                   |
|                    |                                                            | | OpenAI's Medium-sized GPT-2 English model                                                                                           |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``gpt2-large``                                             | | 36-layer, 1280-hidden, 20-heads, 774M parameters.                                                                                   |
|                    |                                                            | | OpenAI's Large-sized GPT-2 English model                                                                                            |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``gpt2-xl``                                                | | 48-layer, 1600-hidden, 25-heads, 1558M parameters.                                                                                  |
|                    |                                                            | | OpenAI's XL-sized GPT-2 English model                                                                                               |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| Transformer-XL     | ``transfo-xl-wt103``                                       | | 18-layer, 1024-hidden, 16-heads, 257M parameters.                                                                                   |
|                    |                                                            | | English model trained on wikitext-103                                                                                               |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| XLNet              | ``xlnet-base-cased``                                       | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
|                    |                                                            | | XLNet English model                                                                                                                 |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``xlnet-large-cased``                                      | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
|                    |                                                            | | XLNet Large English model                                                                                                           |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| XLM                | ``xlm-mlm-en-2048``                                        | | 12-layer, 2048-hidden, 16-heads                                                                                                     |
|                    |                                                            | | XLM English model                                                                                                                   |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``xlm-mlm-ende-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                       |
|                    |                                                            | | XLM English-German model trained on the concatenation of English and German wikipedia                                               |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``xlm-mlm-enfr-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                       |
|                    |                                                            | | XLM English-French model trained on the concatenation of English and French wikipedia                                               |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``xlm-mlm-enro-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                       |
|                    |                                                            | | XLM English-Romanian Multi-language model                                                                                           |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``xlm-mlm-xnli15-1024``                                    | | 12-layer, 1024-hidden, 8-heads                                                                                                      |
|                    |                                                            | | XLM Model pre-trained with MLM on the `15 XNLI languages <https://github.com/facebookresearch/XNLI>`__.                             |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``xlm-mlm-tlm-xnli15-1024``                                | | 12-layer, 1024-hidden, 8-heads                                                                                                      |
|                    |                                                            | | XLM Model pre-trained with MLM + TLM on the `15 XNLI languages <https://github.com/facebookresearch/XNLI>`__.                       |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``xlm-clm-enfr-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                       |
|                    |                                                            | | XLM English-French model trained with CLM (Causal Language Modeling) on the concatenation of English and French wikipedia           |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``xlm-clm-ende-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                       |
|                    |                                                            | | XLM English-German model trained with CLM (Causal Language Modeling) on the concatenation of English and German wikipedia           |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``xlm-mlm-17-1280``                                        | | 16-layer, 1280-hidden, 16-heads                                                                                                     |
|                    |                                                            | | XLM model trained with MLM (Masked Language Modeling) on 17 languages.                                                              |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``xlm-mlm-100-1280``                                       | | 16-layer, 1280-hidden, 16-heads                                                                                                     |
|                    |                                                            | | XLM model trained with MLM (Masked Language Modeling) on 100 languages.                                                             |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| RoBERTa            | ``roberta-base``                                           | | 12-layer, 768-hidden, 12-heads, 125M parameters                                                                                     |
|                    |                                                            | | RoBERTa using the BERT-base architecture                                                                                            |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`__)                                                   |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``roberta-large``                                          | | 24-layer, 1024-hidden, 16-heads, 355M parameters                                                                                    |
|                    |                                                            | | RoBERTa using the BERT-large architecture                                                                                           |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`__)                                                   |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``roberta-large-mnli``                                     | | 24-layer, 1024-hidden, 16-heads, 355M parameters                                                                                    |
|                    |                                                            | | ``roberta-large`` fine-tuned on `MNLI <http://www.nyu.edu/projects/bowman/multinli/>`__.                                            |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`__)                                                   |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``distilroberta-base``                                     | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
|                    |                                                            | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint.                                                 |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``roberta-base-openai-detector``                           | | 12-layer, 768-hidden, 12-heads, 125M parameters                                                                                     |
|                    |                                                            | | ``roberta-base`` fine-tuned by OpenAI on the outputs of the 1.5B-parameter GPT-2 model.                                             |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/openai/gpt-2-output-dataset/tree/master/detector>`__)                                               |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``roberta-large-openai-detector``                          | | 24-layer, 1024-hidden, 16-heads, 355M parameters                                                                                    |
|                    |                                                            | | ``roberta-large`` fine-tuned by OpenAI on the outputs of the 1.5B-parameter GPT-2 model.                                            |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/openai/gpt-2-output-dataset/tree/master/detector>`__)                                               |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| DistilBERT         | ``distilbert-base-uncased``                                | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
|                    |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint                                                   |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``distilbert-base-uncased-distilled-squad``                | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
|                    |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint, with an additional linear layer.                 |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``distilbert-base-cased``                                  | | 6-layer, 768-hidden, 12-heads, 65M parameters                                                                                       |
|                    |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-cased` checkpoint                                                     |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``distilbert-base-cased-distilled-squad``                  | | 6-layer, 768-hidden, 12-heads, 65M parameters                                                                                       |
|                    |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-cased` checkpoint, with an additional question answering layer.       |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``distilgpt2``                                             | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
|                    |                                                            | | The DistilGPT2 model distilled from the GPT2 model `gpt2` checkpoint.                                                               |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``distilbert-base-german-cased``                           | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
|                    |                                                            | | The German DistilBERT model distilled from the German DBMDZ BERT model `bert-base-german-dbmdz-cased` checkpoint.                   |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``distilbert-base-multilingual-cased``                     | | 6-layer, 768-hidden, 12-heads, 134M parameters                                                                                      |
|                    |                                                            | | The multilingual DistilBERT model distilled from the Multilingual BERT model `bert-base-multilingual-cased` checkpoint.             |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| CTRL               | ``ctrl``                                                   | | 48-layer, 1280-hidden, 16-heads, 1.6B parameters                                                                                    |
|                    |                                                            | | Salesforce's Large-sized CTRL English model                                                                                         |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| CamemBERT          | ``camembert-base``                                         | | 12-layer, 768-hidden, 12-heads, 110M parameters                                                                                     |
|                    |                                                            | | CamemBERT using the BERT-base architecture                                                                                          |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/camembert>`__)                                                 |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| ALBERT             | ``albert-base-v1``                                         | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters                                                            |
|                    |                                                            | | ALBERT base model                                                                                                                   |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``albert-large-v1``                                        | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters                                                           |
|                    |                                                            | | ALBERT large model                                                                                                                  |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``albert-xlarge-v1``                                       | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters                                                           |
|                    |                                                            | | ALBERT xlarge model                                                                                                                 |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``albert-xxlarge-v1``                                      | | 12 repeating layer, 128 embedding, 4096-hidden, 64-heads, 223M parameters                                                           |
|                    |                                                            | | ALBERT xxlarge model                                                                                                                |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``albert-base-v2``                                         | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters                                                            |
|                    |                                                            | | ALBERT base model with no dropout, additional training data and longer training                                                     |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``albert-large-v2``                                        | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters                                                           |
|                    |                                                            | | ALBERT large model with no dropout, additional training data and longer training                                                    |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``albert-xlarge-v2``                                       | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters                                                           |
|                    |                                                            | | ALBERT xlarge model with no dropout, additional training data and longer training                                                   |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``albert-xxlarge-v2``                                      | | 12 repeating layer, 128 embedding, 4096-hidden, 64-heads, 223M parameters                                                           |
|                    |                                                            | | ALBERT xxlarge model with no dropout, additional training data and longer training                                                  |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| T5                 | ``t5-small``                                               | | ~60M parameters with 6-layers, 512-hidden-state, 2048 feed-forward hidden-state, 8-heads,                                           |
|                    |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``t5-base``                                                | | ~220M parameters with 12-layers, 768-hidden-state, 3072 feed-forward hidden-state, 12-heads,                                        |
|                    |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``t5-large``                                               | | ~770M parameters with 24-layers, 1024-hidden-state, 4096 feed-forward hidden-state, 16-heads,                                       |
|                    |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``t5-3B``                                                  | | ~2.8B parameters with 24-layers, 1024-hidden-state, 16384 feed-forward hidden-state, 32-heads,                                      |
|                    |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``t5-11B``                                                 | | ~11B parameters with 24-layers, 1024-hidden-state, 65536 feed-forward hidden-state, 128-heads,                                      |
|                    |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| XLM-RoBERTa        | ``xlm-roberta-base``                                       | | ~125M parameters with 12-layers, 768-hidden-state, 3072 feed-forward hidden-state, 8-heads,                                         |
|                    |                                                            | | Trained on on 2.5 TB of newly created clean CommonCrawl data in 100 languages                                                       |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``xlm-roberta-large``                                      | | ~355M parameters with 24-layers, 1027-hidden-state, 4096 feed-forward hidden-state, 16-heads,                                       |
|                    |                                                            | | Trained on 2.5 TB of newly created clean CommonCrawl data in 100 languages                                                          |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| FlauBERT           | ``flaubert/flaubert_small_cased``                          | | 6-layer, 512-hidden, 8-heads, 54M parameters                                                                                        |
|                    |                                                            | | FlauBERT small architecture                                                                                                         |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``flaubert/flaubert_base_uncased``                         | | 12-layer, 768-hidden, 12-heads, 137M parameters                                                                                     |
|                    |                                                            | | FlauBERT base architecture with uncased vocabulary                                                                                  |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``flaubert/flaubert_base_cased``                           | | 12-layer, 768-hidden, 12-heads, 138M parameters                                                                                     |
|                    |                                                            | | FlauBERT base architecture with cased vocabulary                                                                                    |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``flaubert/flaubert_large_cased``                          | | 24-layer, 1024-hidden, 16-heads, 373M parameters                                                                                    |
|                    |                                                            | | FlauBERT large architecture                                                                                                         |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| Bart               | ``facebook/bart-large``                                    | | 24-layer, 1024-hidden, 16-heads, 406M parameters                                                                                    |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/bart>`_)                                                       |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``facebook/bart-base``                                     | | 12-layer, 768-hidden, 16-heads, 139M parameters                                                                                     |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``facebook/bart-large-mnli``                               | | Adds a 2 layer classification head with 1 million parameters                                                                        |
|                    |                                                            | | bart-large base architecture with a classification head, finetuned on MNLI                                                          |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``facebook/bart-large-cnn``                                | | 12-layer, 1024-hidden, 16-heads, 406M parameters       (same as base)                                                               |
|                    |                                                            | | bart-large base architecture finetuned on cnn summarization task                                                                    |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| DialoGPT           | ``DialoGPT-small``                                         | | 12-layer, 768-hidden, 12-heads, 124M parameters                                                                                     |
|                    |                                                            | | Trained on English text: 147M conversation-like exchanges extracted from Reddit.                                                    |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``DialoGPT-medium``                                        | | 24-layer, 1024-hidden, 16-heads, 355M parameters                                                                                    |
|                    |                                                            | | Trained on English text: 147M conversation-like exchanges extracted from Reddit.                                                    |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``DialoGPT-large``                                         | | 36-layer, 1280-hidden, 20-heads, 774M parameters                                                                                    |
|                    |                                                            | | Trained on English text: 147M conversation-like exchanges extracted from Reddit.                                                    |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| Reformer           | ``reformer-enwik8``                                        | | 12-layer, 1024-hidden, 8-heads, 149M parameters                                                                                     |
|                    |                                                            | | Trained on English Wikipedia data - enwik8.                                                                                         |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``reformer-crime-and-punishment``                          | | 6-layer, 256-hidden, 2-heads, 3M parameters                                                                                         |
|                    |                                                            | | Trained on English text: Crime and Punishment novel by Fyodor Dostoyevsky.                                                          |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| MarianMT           | ``Helsinki-NLP/opus-mt-{src}-{tgt}``                       | | 12-layer, 512-hidden, 8-heads, ~74M parameter Machine translation models. Parameter counts vary depending on vocab size.            |
|                    |                                                            | | (see `model list <https://huggingface.co/Helsinki-NLP>`_)                                                                           |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| Pegasus            | ``google/pegasus-{dataset}``                               | | 16-layer, 1024-hidden, 16-heads, ~568M parameter, 2.2 GB for summary. `model list <https://huggingface.co/models?search=pegasus>`__ |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| Longformer         | ``allenai/longformer-base-4096``                           | | 12-layer, 768-hidden, 12-heads, ~149M parameters                                                                                    |
|                    |                                                            | | Starting from RoBERTa-base checkpoint, trained on documents of max length 4,096                                                     |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``allenai/longformer-large-4096``                          | | 24-layer, 1024-hidden, 16-heads, ~435M parameters                                                                                   |
|                    |                                                            | | Starting from RoBERTa-large checkpoint, trained on documents of max length 4,096                                                    |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| MBart              | ``facebook/mbart-large-cc25``                              | | 24-layer, 1024-hidden, 16-heads, 610M parameters                                                                                    |
|                    |                                                            | | mBART (bart-large architecture) model trained on 25 languages' monolingual corpus                                                   |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``facebook/mbart-large-en-ro``                             | | 24-layer, 1024-hidden, 16-heads, 610M parameters                                                                                    |
|                    |                                                            | | mbart-large-cc25 model finetuned on WMT english romanian translation.                                                               |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| Lxmert             | ``lxmert-base-uncased``                                    | | 9-language layers, 9-relationship layers, and 12-cross-modality layers                                                              |
|                    |                                                            | | 768-hidden, 12-heads (for each layer) ~ 228M parameters                                                                             |
|                    |                                                            | | Starting from lxmert-base checkpoint, trained on over 9 million image-text couplets from COCO, VisualGenome, GQA, VQA               |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| Funnel Transformer | ``funnel-transformer/small``                               | | 14 layers: 3 blocks of 4 layers then 2 layers decoder, 768-hidden, 12-heads, 130M parameters                                        |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/laiguokun/Funnel-Transformer>`__)                                                                   |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``funnel-transformer/small-base``                          | | 12 layers: 3 blocks of 4 layers (no decoder), 768-hidden, 12-heads, 115M parameters                                                 |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/laiguokun/Funnel-Transformer>`__)                                                                   |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``funnel-transformer/medium``                              | | 14 layers: 3 blocks 6, 3x2, 3x2 layers then 2 layers decoder, 768-hidden, 12-heads, 130M parameters                                 |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/laiguokun/Funnel-Transformer>`__)                                                                   |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``funnel-transformer/medium-base``                         | | 12 layers: 3 blocks 6, 3x2, 3x2 layers(no decoder), 768-hidden, 12-heads, 115M parameters                                           |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/laiguokun/Funnel-Transformer>`__)                                                                   |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``funnel-transformer/intermediate``                        | | 20 layers: 3 blocks of 6 layers then 2 layers decoder, 768-hidden, 12-heads, 177M parameters                                        |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/laiguokun/Funnel-Transformer>`__)                                                                   |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``funnel-transformer/intermediate-base``                   | | 18 layers: 3 blocks of 6 layers (no decoder), 768-hidden, 12-heads, 161M parameters                                                 |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/laiguokun/Funnel-Transformer>`__)                                                                   |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``funnel-transformer/large``                               | | 26 layers: 3 blocks of 8 layers then 2 layers decoder, 1024-hidden, 12-heads, 386M parameters                                       |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/laiguokun/Funnel-Transformer>`__)                                                                   |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``funnel-transformer/large-base``                          | | 24 layers: 3 blocks of 8 layers (no decoder), 1024-hidden, 12-heads, 358M parameters                                                |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/laiguokun/Funnel-Transformer>`__)                                                                   |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``funnel-transformer/xlarge``                              | | 32 layers: 3 blocks of 10 layers then 2 layers decoder, 1024-hidden, 12-heads, 468M parameters                                      |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/laiguokun/Funnel-Transformer>`__)                                                                   |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``funnel-transformer/xlarge-base``                         | | 30 layers: 3 blocks of 10 layers (no decoder), 1024-hidden, 12-heads, 440M parameters                                               |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/laiguokun/Funnel-Transformer>`__)                                                                   |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| LayoutLM           | ``microsoft/layoutlm-base-uncased``                        | | 12 layers, 768-hidden, 12-heads, 113M parameters                                                                                    |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/microsoft/unilm/tree/master/layoutlm>`__)                                                           |
+                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``microsoft/layoutlm-large-uncased``                       | | 24 layers, 1024-hidden, 16-heads, 343M parameters                                                                                   |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/microsoft/unilm/tree/master/layoutlm>`__)                                                           |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| DeBERTa            | ``microsoft/deberta-base``                                 | | 12-layer, 768-hidden, 12-heads, ~125M parameters                                                                                    |
|                    |                                                            | | DeBERTa using the BERT-base architecture                                                                                            |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/microsoft/DeBERTa>`__)                                                                              |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``microsoft/deberta-large``                                | | 24-layer, 1024-hidden, 16-heads, ~390M parameters                                                                                   |
|                    |                                                            | | DeBERTa using the BERT-large architecture                                                                                           |
|                    |                                                            |                                                                                                                                       |
|                    |                                                            | (see `details <https://github.com/microsoft/DeBERTa>`__)                                                                              |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| SqueezeBERT        | ``squeezebert/squeezebert-uncased``                        | | 12-layer, 768-hidden, 12-heads, 51M parameters, 4.3x faster than bert-base-uncased on a smartphone.                                 |
|                    |                                                            | | SqueezeBERT architecture pretrained from scratch on masked language model (MLM) and sentence order prediction (SOP) tasks.          |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``squeezebert/squeezebert-mnli``                           | | 12-layer, 768-hidden, 12-heads, 51M parameters, 4.3x faster than bert-base-uncased on a smartphone.                                 |
|                    |                                                            | | This is the squeezebert-uncased model finetuned on MNLI sentence pair classification task with distillation from electra-base.      |
|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                    | ``squeezebert/squeezebert-mnli-headless``                  | | 12-layer, 768-hidden, 12-heads, 51M parameters, 4.3x faster than bert-base-uncased on a smartphone.                                 |
|                    |                                                            | | This is the squeezebert-uncased model finetuned on MNLI sentence pair classification task with distillation from electra-base.      |
|                    |                                                            | | The final classification layer is removed, so when you finetune, the final layer will be reinitialized.                             |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
"""

table = table.splitlines()

architecture = None
name = None
details = []
lines = ''

for line in table[2:]:
  if '+' in line:
    continue
  # print(line.split('|'))
  tmp_architecture = line.split('|')[1].strip()
  tmp_name = line.split('|')[2].strip()
  tmp_details = ' '.join(line.split('|')[3:])
  tmp_details = tmp_details.strip()

  details.append(tmp_details)
  

  if tmp_name:
    name = tmp_name if name is None else name
    if name != tmp_name:
      details = ' '.join(details)
      name = name[2:-2]
      lines += '%s\t%s\t%s\n'%(architecture, name, details)
      print(architecture, name, details)
      name = tmp_name
      details = []
    
  
  if tmp_architecture:
    architecture = tmp_architecture
    
io.open('transformers_pretrained_models.txt', 'w', encoding='utf-8').write(lines)

BERT bert-base-uncased 12-layer, 768-hidden, 12-heads, 110M parameters. Trained on lower-cased English text. 24-layer, 1024-hidden, 16-heads, 336M parameters.
BERT bert-large-uncased Trained on lower-cased English text. 12-layer, 768-hidden, 12-heads, 109M parameters.
BERT bert-base-cased Trained on cased English text. 24-layer, 1024-hidden, 16-heads, 335M parameters.
BERT bert-large-cased Trained on cased English text. (Original, not recommended) 12-layer, 768-hidden, 12-heads, 168M parameters.
BERT bert-base-multilingual-uncased Trained on lower-cased text in the top 102 languages with the largest Wikipedias  (see `details <https://github.com/google-research/bert/blob/master/multilingual.md>`__). (New, **recommended**) 12-layer, 768-hidden, 12-heads, 179M parameters.
BERT bert-base-multilingual-cased Trained on cased text in the top 104 languages with the largest Wikipedias  (see `details <https://github.com/google-research/bert/blob/master/multilingual.md>`__). 12-layer, 768-hidden,

20587

In [None]:
# Extra code.
import io

model_name = None
# Read all pretrained models
pretrained_models = io.open('transformers_pretrained_models.txt', mode='r', encoding='utf-8').read().splitlines()

# Find next runnign model
for model_index in range(len(pretrained_models)):
  model_info = pretrained_models[model_index].split('\t')
  if len(model_info) == 3:
    # found model not ran
    model_architecture, model_name, model_details = model_info
    break

# Make sure to break if finished running all models.
if model_name is None:
  raise ValueError('Finished running all models!')

# Add as failed. We don't know if it will succeed or not.
pretrained_models[model_index] = 'Failed\t%s\t%s\t%s'%(model_architecture, model_name, model_details)

# Update file.
io.open('transformers_pretrained_models.txt', mode='w', encoding='utf-8').write('\n'.join(pretrained_models))

ValueError: ignored

In [22]:
import io

# markdown_table = ['|%s|%s|%s|\n'%('Status', 'Architecture', 'Shortcut name'),
#                   '|:-|:-|:-|\n',
#                   ]

all_status = {'Worked':[], 'Failed':[]}
outputs_models = io.open(file='/content/transformers_pretrained_models.txt', mode='r', encoding='utf-8').read().splitlines()

for line in outputs_models:
  status, architecutre, model, details = line.split('\t')
  markdown_table.append('|%s|%s|%s|\n'%(status, architecutre, model))
  all_status[status].append(model)
  # break

print('Worked')
tmp = ["`%s`"%m for m in all_status['Worked']]

print(', '.join(tmp))

len(all_status['Failed'])

Worked
`bert-base-uncased`, `bert-large-uncased`, `bert-base-cased`, `bert-large-cased`, `bert-base-multilingual-uncased`, `bert-base-multilingual-cased`, `bert-base-chinese`, `bert-base-german-cased`, `bert-large-uncased-whole-word-masking`, `bert-large-cased-whole-word-masking`, `bert-large-uncased-whole-word-masking-finetuned-squad`, `bert-large-cased-whole-word-masking-finetuned-squad`, `bert-base-cased-finetuned-mrpc`, `bert-base-german-dbmdz-cased`, `bert-base-german-dbmdz-uncased`, `TurkuNLP/bert-base-finnish-cased-v1`, `TurkuNLP/bert-base-finnish-uncased-v1`, `wietsedv/bert-base-dutch-cased`, `xlnet-base-cased`, `xlnet-large-cased`, `xlm-mlm-en-2048`, `xlm-mlm-ende-1024`, `xlm-mlm-enfr-1024`, `xlm-mlm-enro-1024`, `xlm-mlm-xnli15-1024`, `xlm-mlm-tlm-xnli15-1024`, `xlm-clm-enfr-1024`, `xlm-clm-ende-1024`, `xlm-mlm-17-1280`, `roberta-base`, `roberta-large`, `distilroberta-base`, `roberta-base-openai-detector`, `roberta-large-openai-detector`, `distilbert-base-uncased`, `distilbert

33

Worked: `bert-base-uncased`, `bert-large-uncased`, `bert-base-cased`, `bert-large-cased`, `bert-base-multilingual-uncased`, `bert-base-multilingual-cased`, `bert-base-chinese`, `bert-base-german-cased`, `bert-large-uncased-whole-word-masking`, `bert-large-cased-whole-word-masking`, `bert-large-uncased-whole-word-masking-finetuned-squad`, `bert-large-cased-whole-word-masking-finetuned-squad`, `bert-base-cased-finetuned-mrpc`, `bert-base-german-dbmdz-cased`, `bert-base-german-dbmdz-uncased`, `TurkuNLP/bert-base-finnish-cased-v1`, `TurkuNLP/bert-base-finnish-uncased-v1`, `wietsedv/bert-base-dutch-cased`, `xlnet-base-cased`, `xlnet-large-cased`, `xlm-mlm-en-2048`, `xlm-mlm-ende-1024`, `xlm-mlm-enfr-1024`, `xlm-mlm-enro-1024`, `xlm-mlm-xnli15-1024`, `xlm-mlm-tlm-xnli15-1024`, `xlm-clm-enfr-1024`, `xlm-clm-ende-1024`, `xlm-mlm-17-1280`, `roberta-base`, `roberta-large`, `distilroberta-base`, `roberta-base-openai-detector`, `roberta-large-openai-detector`, `distilbert-base-uncased`, `distilbert-base-uncased-distilled-squad`, `distilbert-base-cased`, `distilbert-base-cased-distilled-squad`, `distilbert-base-german-cased`, `distilbert-base-multilingual-cased`, `camembert-base`, `albert-base-v1`, `albert-large-v1`, `albert-xlarge-v1`, `albert-xxlarge-v1`, `albert-base-v2`, `albert-large-v2`, `albert-xlarge-v2`, `albert-xxlarge-v2`, `xlm-roberta-base`, `xlm-roberta-large`, `flaubert/flaubert_small_cased`, `flaubert/flaubert_base_uncased`, `flaubert/flaubert_base_cased`, `flaubert/flaubert_large_cased`, `facebook/bart-large`, `facebook/bart-base`, `facebook/bart-large-cnn`, `allenai/longformer-base-4096`, `allenai/longformer-large-4096`, `funnel-transformer/small`, `funnel-transformer/small-base`, `funnel-transformer/medium`, `funnel-transformer/medium-base`, `funnel-transformer/intermediate`, `funnel-transformer/intermediate-base`, `funnel-transformer/large`, `funnel-transformer/large-base`, `funnel-transformer/xlarge`, `funnel-transformer/xlarge-base`, `microsoft/deberta-base`, `microsoft/deberta-large`, `squeezebert/squeezebert-uncased`

# Wandb

In [None]:
import wandb


# configuration of run - what parameters to use
config_run = dict(
    dataset = 'my_dataset',
    model_architecture = 'bert-base-cased',
    epchs = 2,
    batches = 32,
    learning_rate = 1e-5,
    )

# initialization wandb
init_run = dict(
    project = 'test',   # name of project
    name = 'Real Run',  # name of current run
    notes = 'Comments', # any comments
    resume = False,     # if need to resume - for any updates of same run
    config = config_run # configuration of parameters
    )

# initialize wandb
wandb.init(**init_wandb);

[34m[1mwandb[0m: Waiting for W&B process to finish, PID 1211
[34m[1mwandb[0m: Program ended successfully.





[34m[1mwandb[0m: \ 0.01MB of 0.01MB uploaded (0.00MB deduped)[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: Find user logs for this run at: wandb/run-20200928_160606-2bvkr9yv/logs/debug.log
[34m[1mwandb[0m: Find internal logs for this run at: wandb/run-20200928_160606-2bvkr9yv/logs/debug-internal.log
[34m[1mwandb[0m: Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Synced [33mReal Run[0m: [34mhttps://wandb.ai/gm0234/test/runs/2bvkr9yv[0m
[34m[1mwandb[0m: Tracking run with wandb version 0.10.2
[34m[1mwandb[0m: Run data is saved locally in wandb/run-20200928_160724-evoex792
[34m[1mwandb[0m: Syncing run [33mReal Run[0m





In [None]:
for i in range(10):
  
  wandb.log({'accuracy': 0.9*i, 'epoch': i})

wandb.log({"random": wandb.Histogram([i])})

In [None]:
# Visualize single plot
wandb.sklearn.plot_confusion_matrix(y_true, y_pred, target_names)

In [None]:
wandb.log(report)

In [None]:
wandb.run.save()



True

In [None]:
from sklearn.metrics import classification_report
y_true = [0, 1, 2, 2, 2]
y_pred = [0, 1, 2, 2, 1]
target_names = ['class 0', 'class 1', 'class 2']

report = classification_report(y_true, y_pred, target_names=target_names, output_dict=True)

print(report)

report['class 0']

{'class 0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 1}, 'class 1': {'precision': 0.5, 'recall': 1.0, 'f1-score': 0.6666666666666666, 'support': 1}, 'class 2': {'precision': 1.0, 'recall': 0.6666666666666666, 'f1-score': 0.8, 'support': 3}, 'accuracy': 0.8, 'macro avg': {'precision': 0.8333333333333334, 'recall': 0.8888888888888888, 'f1-score': 0.8222222222222223, 'support': 5}, 'weighted avg': {'precision': 0.9, 'recall': 0.8, 'f1-score': 0.8133333333333332, 'support': 5}}


{'f1-score': 1.0, 'precision': 1.0, 'recall': 1.0, 'support': 1}

In [None]:
def make_archive(source, destination):
    base_name = '.'.join(destination.split('.')[:-1])
    format = destination.split('.')[-1]
    root_dir = os.path.dirname(source)
    base_dir = os.path.basename(source.strip(os.sep))
    print(base_name, format, root_dir, base_dir)
    shutil.make_archive(base_name, format, root_dir, base_dir)
    return destination

make_archive('/content/mlruns/0/%s'%run_dir, '/content/%s.zip'%run_dir)


/content/34ff5c1757d545e18b32fe64c3246f5f zip /content/mlruns/0 34ff5c1757d545e18b32fe64c3246f5f


'/content/34ff5c1757d545e18b32fe64c3246f5f.zip'

In [None]:
import mlflow
import shutil
import os
from google.colab import files

# end any previous mlflow started
mlflow.end_run()

# remove privous mlflow folder
shutil.rmtree('/content/mlruns') if os.path.isdir('/content/mlruns') else None;

print('Starting new mlflow run!')
# strating new mlflow
mlflow.start_run()

Starting new mlflow run!


<ActiveRun: >

In [None]:
# log tag
mlflow.set_tag("version", "tf-keras")

# log parameters
mlflow.log_param("epochs", 2)
mlflow.log_param("batch", 2)
mlflow.log_param("valid_split", 1)
mlflow.log_param("optimizer", 2)
mlflow.log_param("loss", 2)


[mlflow.log_metric(key='loss', value=value, step=index) for index, value in enumerate(range(10))];

# log artifacts
# mlflow.log_artifact("tf_keras_mlflow.py")


In [None]:
# stop mlflow
mlflow.end_run()
# get run directory name
run_dir = os.listdir('/content/mlruns/0'); run_dir.remove('meta.yaml'); run_dir = run_dir[0]
# make archive
path_zip = shutil.make_archive('/content/mlruns/%s'%run_dir, 'zip')



In [None]:
make_archive('mlruns/%s'%run_dir, '/content/%s.zip'%run_dir)
    
# download locally
files.download(path_zip)

# copy to drive

FileNotFoundError: ignored

In [None]:




run_dir

'60f9acfce02a483a912c06d52d11d6c5'

In [None]:
a

# Lecture Students

In [None]:
# lecture sections
section_paths = {'001':'/content/2020-09-18T1837_Grades-CSCE_2110.001_(7210).csv',
         '002':'/content/2020-09-19T1013_Grades-CSCE_2110.002_(15048).csv',
         '004':'/content/2020-09-19T1013_Grades-CSCE_2110.004_(18759).csv',
         '005':'/content/2020-09-19T1014_Grades-CSCE_2110.005_(18760).csv',
}


student_sec = {}
euid_sec = {}

for sec_num, sec_path in section_paths.items():

  tmp_df = pd.read_csv(sec_path)

  students = tmp_df['Student'].values[1:]
  euids = tmp_df['SIS Login ID'].values[1:]

  for student, euid in zip(students, euids):
    student_sec[student] = sec_num
    euid_sec[euid] = sec_num


# add missing / innactive
student_sec['Demoss, Harrison'] = '002'
euid_sec['hd0162'] = '002'

len(student_sec)

203

# Sec 281 Recitation

In [None]:
# zoom attendance
participants = pd.read_csv('/content/participants_99210279764.csv')
student_present = [', '.join(reversed(name.split())) for name in participants['Name (Original Name)'].values]

# project master list
df = pd.read_csv('/content/2110_sec281_proj1.csv')

sec_281 = {'Student':df['Name'].values[0:],
           'EUID':df['Unnamed: 1'].values[0:],
           'Group ID':df['Group ID'].values[0:],
           'Lecture Section':[student_sec[student] for student in df['Name'].values[0:]],
           'Project 1 Expectation':['present']*len(df['Name'].values[0:]),
           'Project 1 Design':['present' if name in student_present else 'absent' for name in df['Name'].values[0:]]}

pd.DataFrame(sec_281).to_csv('2110_sec281_proj1_master.csv', index=False)

# student missing from list - maybe misspelled name
missing_student = [student for student in student_present if student not in df['Name'].values[0:]]
missing_student

['Mihaila, George',
 'Alan',
 'vanshikaganga',
 'Aryan, Agarwal',
 'Soto, Edwin',
 'sterzenbach, ryan',
 'abdelhamid, waleed',
 'Ryan, Spencer#',
 'Katta, Kumar, Sai, Tarun']

Alan - Alan Mateo

vanshikaganga - Vanshika Ganga

 Aryan Agarwal 

 Soto-Villela, Edwin



# Sec 205

# DB

In [None]:
def create_db(db_name):
  '''
    CREATE DATABASE IF NOT EXISTS.
  '''
 
  try: 
    # CHECK IF DB EXISTS OR NOT
    if not os.path.isfile(db_name):
      # DB DOES NOT EXIST - CREATE DB
      conn = sqlite3.connect(db_name)
      # DB CONNECTION
      c = conn.cursor()
      # CREATE TABLE IN DB
      c.execute('''CREATE TABLE jupyter_talon
                  (euid, first_login, last_login, local_port, talon_port, login_node, count_logins, pid_session , state_session)''')
      # SAVE (COMMIT) THE CHANGES
      conn.commit()
      # CLOSE CONNECITON
      conn.close()
      return True
  except Exception as e:
    print("DB FAILED!", e)
    return False


def add_db(db_name, euid, last_login, local_port, talon_port, login_node, pid_session, state_session):
  '''
    ADD INSTANCE IN DATABASE
    COLUMNS: 'euid', 'first_login', 'last_login', 'local_port', 'talon_port', 'login_node', 'count_logins', 'pid_session', 'state_session'
    state_session:  'initiated' [login is started]
                    'running'   [login is successfull]
                    'ended'     [session ended]
  '''
  # DB CONNECTION
  conn = None
  # DB CURSOR
  c = None
  try: 
    # CHECK IF DB EXISTS
    if os.path.isfile(db_name):
      # DB EXISTS - JUST ESTABLISH CONNECTION
      print('DataBase %s found!' % db_name)
      # CONNECT TO DB
      conn = sqlite3.connect(db_name)
      # DB CONNECTION
      c = conn.cursor()
    else:
      # DB DOES NOT EXIST
      print("DataBase %s NOT found! Please run: 'create_db(db_name)'" % db_name)
      return

    # CHECK IF EUID ALREADY EXISTS
    euids = list(c.execute('SELECT euid FROM jupyter_talon ORDER BY euid').fetchall())
    if (euid,) in euids:
      # EUID ALREADY IN DB
      print('USER IN DB')
      # GRAB NUMBER OF LOGINS
      count_logins = (c.execute('SELECT count_logins FROM jupyter_talon WHERE euid=?',(euid,)).fetchall())[0][0]
      # ONLY COUNT AS LOGIN WHEN RUNNING
      if state_session == 'running':
        # INCREMENT LOGINS
        count_logins += 1
      # INCREMENT count_login
      c.execute('UPDATE jupyter_talon SET last_login=?,\
                                          local_port=?,\
                                          talon_port=?,\
                                          login_node=?,\
                                          count_logins=?,\
                                          pid_session=?,\
                                          state_session=? WHERE euid=?',(
                                          last_login, 
                                          local_port, 
                                          talon_port, 
                                          login_node,
                                          count_logins, 
                                          pid_session, 
                                          state_session, 
                                          euid))
    else:
      # EUID NOT IN DB
      # ADD EUID IN DB
      first_login = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
      c.execute('INSERT INTO jupyter_talon VALUES (?,?,?,?,?,?,?,?,?)',(euid, 
                                                                      first_login, 
                                                                      last_login, 
                                                                      local_port, 
                                                                      talon_port, 
                                                                      login_node, 
                                                                      0,
                                                                      pid_session, 
                                                                      state_session))
    # SAVE (COMMIT) THE CHANGES
    conn.commit()
    # CLOSE CONNECITON
    conn.close()
  except Exception as e:
    print("DB FAILED!", e)
  return


def from_db(db_name, euid):
  row = None
  columns = ['first_login', 'last_login', 'local_port', 'talon_port', 'login_node', 'count_logins', 'pid_session', 'state_session']
  try:
    # CHECK IF DB EXISTS OR NOT
    if os.path.isfile(db_name):
      # CONNECT TO DB
      conn = sqlite3.connect(db_name)
      # DB CONNECTION
      c = conn.cursor()
      # CHECK IF EUID EXISTS
      euids = list(c.execute('SELECT euid FROM jupyter_talon ORDER BY euid').fetchall())
      if (euid,) in euids:
        # EXTRACT ROW
        row = list(c.execute('SELECT first_login,\
                                    last_login,\
                                    local_port,\
                                    talon_port,\
                                    login_node,\
                                    count_logins,\
                                    pid_session,\
                                    state_session FROM jupyter_talon WHERE euid=?',(euid,)).fetchall())
        # DICTIONARY FORMAT
        row = {k:v for k,v in zip(columns, row[0])}
      else:
        print('EUID %s NOT ADDED TO DB! ' % euid)
    else:
      print("DB %s does not exist!" % db_name)
  except Exception as e:
    print("DB FAILED!", e)
  return row

In [None]:
import sqlite3
import os
from datetime import datetime

db_name = 'jupyter_talon_usage.db'

create_db(db_name=db_name)

add_db( db_name='jupyter_talon_usage.db',
        euid = 'gm0234',
        last_login = datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        local_port = 8888,
        talon_port = 8888,
        login_node = 'vis.acs.unt.edu',
        pid_session = 356,
        state_session = 'running')


euid_log = from_db(db_name=db_name, euid='gm0234')

euid_log

DataBase jupyter_talon_usage.db found!
USER IN DB


{'count_logins': 7,
 'first_login': '2020-02-24 23:20:53',
 'last_login': '2020-02-24 23:26:14',
 'local_port': 8888,
 'login_node': 'vis.acs.unt.edu',
 'pid_session': 356,
 'state_session': 'running',
 'talon_port': 8888}

In [None]:
add_db( db_name='jupyter_talon_usage.db',
        euid = 'gm0234',
        last_login = datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        local_port = 82888,
        talon_port = 8888,
        login_node = 'vis.acs.unt.edu',
        pid_session = 356,
        state_session = 'running')

DataBase jupyter_talon_usage.db found!
USER IN DB


In [None]:
get_ports(db_name='jupyter_talon_usage.db')

[82888, 8888, 8888, 8808, 8808, 8808, 8808, 8208]

# Ports

In [None]:
def get_ports(db_name):
  # CHECK IF DB EXISTS OR NOT
  if os.path.isfile(db_name):
    try:
      # CONNECT TO DB
      conn = sqlite3.connect(db_name)
      # DB CONNECTION
      c = conn.cursor()
      # CHECK IF EUID EXISTS
      ports = list(c.execute('SELECT local_port FROM jupyter_talon').fetchall())
      return [port[0] for port in ports]

    except Exception as e:
      print("DB READ FAILED!", e)
      return None

  else:
    print("DB %s DOES NOT EXIST!" % db_name)
    return None

In [None]:
import os
from datetime import datetime

def logger(user, message, level, fname='logs.log', verbose=True, extra_log=True):
  """Logging function

  Args:
    user: user id
    message: text needed logged
    level: 'CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG', 'NOTSET'
    fname: file name to save all logs
    verbose: if print to stdou
    extra_log: create 'logs/' and write individual logs for each user

  Source: https://docs.python.org/2/howto/logging.html
  """
  
  # get time of log
  time_log = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
  # check input arguments types
  assert level in ['CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG', 'NOTSET']
  assert str(user) and str(message)
  # create log line from message and date
  line = '%s %s %s %s'%(time_log, str(user), level, str(message))
  # print to stdout if veborse
  if verbose: print(line)
  # append to main log file
  with open(fname, 'a') as f:
    f.write(line + '\n')
  if extra_log:
    # create if folder does not exist
    if os.path.isdir('logs') is False: os.mkdir('logs')
    # append to user log file
    with open('logs/%s.log'%user, 'a') as f:
      f.write(line + '\n')
  return


logger(user='gm0234', message='This is a test', level='DEBUG', verbose=True)

2020-02-10 16:26:02 gm0234 DEBUG This is a test


In [None]:
if os.path.isdir('logs') is False:
  os.mkdir('logs')

sdad


In [None]:
import logging
if __name__ == '__main__':

  logging.basicConfig(filename='/content/my_logs.log', filemode='a', format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.DEBUG)

  logging.debug('This message should appear on the console')
  logging.info('So should this')
  logging.warning('And this, too')
  logging.error('this will')

my_logs.log
my_logs.log
my_logs.log
my_logs.log


In [None]:
def tryPort(port):
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    result = False
    try:
        sock.bind(("0.0.0.0", port))
        result = True
    except:
        print("Port is in use")
    sock.close()
    return result

tryPort(8080)

Port is in use


False

In [None]:
def find_free_port(min_port=None, max_port=None):
  if min_port and max_port:
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    for port in range(min_port,max_port,1):
      try:
        sock.bind(("0.0.0.0", port))
        sock.close()
        return port
        break
      except:
        continue
  else:
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        s.bind(('', 0))
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        return s.getsockname()[1]

find_free_port(min_port=9000,max_port=10000)

9001

In [None]:
min_port=8080
max_port=50000

sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
for port in range(min_port,max_port,1):
  print('use',port)
  try:
    sock.bind(("0.0.0.0", port))
    print(port)
    sock.close()
    break
  except:
    continue
sock.close()

use 8080
use 8081
8081


In [None]:
import socket
from contextlib import closing


def find_free_port(min_port=None, max_port=None):
  if min_port and max_port:
    for port in range(min_port,max_port,1):
      print(port)
      with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        s.bind(('', 0))
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        if s.getsockname()[1] == port:
          return port

  else:
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        s.bind(('', 0))
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        return s.getsockname()[1]

a = find_free_port(min_port=40000, max_port=50000)

a

In [None]:
import socket
from contextlib import closing

def find_free_port():
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        s.bind(('', 0))
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        return s.getsockname()[1]

a = find_free_port()

a

56873

In [None]:
'djn%s'%None

'djnNone'

In [None]:
def is_port_in_use(port):
  import socket
  with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
    return s.connect_ex(('localhost', port)) == 0

In [None]:
def port_used(port):
  import socket, errno
  s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
  try:
      s.bind(("127.0.0.1", port))
  except socket.error as e:
      if e.errno == errno.EADDRINUSE:
          print("Port is already in use")
      else:
          # something else raised the socket.error exception
          print(e)
  s.close()

In [None]:
python -c 'import socket; s = socket.socket(socket.AF_INET, socket.SOCK_STREAM); print("port_used",s.connect_ex(("localhost", 39634)) == 0)'

# Encrypt

In [None]:
import base64
print(base64.b64encode("jupyter20$".encode("utf-8")))
print(base64.b64decode("anVweXRlcjIwJA==").decode("utf-8"))

b'anVweXRlcjIwJA=='
jupyter20$
