deeppavlov · Kolpnick · Nov 8, 2022 · Nov 22, 2022 · Jan 10, 2023 · Feb 17, 2023
diff --git a/deeppavlov/configs/classifiers/emotions_xlm_roberta_base.json b/deeppavlov/configs/classifiers/emotions_xlm_roberta_base.json
@@ -0,0 +1,98 @@
+{
+  "dataset_reader": {
+    "class_name": "huggingface_dataset_reader",
+    "path": "cedr",
+    "name": "main",
+    "train": "train",
+    "test": "test"
+  },
+  "dataset_iterator": {
+    "class_name": "huggingface_dataset_iterator",
+    "features": "text",
+    "label": "labels",
+    "seed": 42
+  },
+  "chainer": {
+    "in": ["x"],
+    "in_y": ["y"],
+    "pipe": [
+      {
+        "class_name": "torch_transformers_preprocessor",
+        "vocab_file": "{BASE_MODEL}",
+        "do_lower_case": false,
+        "max_seq_length": 128,
+        "in": ["x"],
+        "out": ["bert_features"]
+      },
+      {
+        "id": "classes_vocab",
+        "class_name": "simple_vocab",
+        "fit_on": ["y"],
+        "save_path": "{MODEL_PATH}/classes.dict",
+        "load_path": "{MODEL_PATH}/classes.dict",
+        "in": ["y"],
+        "out": ["y_ids"]
+      },
+      {
+        "class_name": "torch_transformers_classifier",
+        "n_classes": "#classes_vocab.len",
+        "return_probas": true,
+        "pretrained_bert": "{BASE_MODEL}",
+        "save_path": "{MODEL_PATH}/model",
+        "load_path": "{MODEL_PATH}/model",
+        "optimizer": "AdamW",
+        "optimizer_parameters": {
+          "lr": 1e-05
+        },
+        "learning_rate_drop_patience": 3,
+        "learning_rate_drop_div": 2.0,
+        "in": ["bert_features"],
+        "in_y": ["y_ids"],
+        "out": ["y_pred_probas"]
+      },
+      {
+        "in": ["y_pred_probas"],
+        "out": ["y_pred_ids"],
+        "class_name": "proba2labels",
+        "max_proba": true
+      },
+      {
+        "in": ["y_pred_ids"],
+        "out": ["y_pred_labels"],
+        "ref": "classes_vocab"
+      }
+    ],
+    "out": ["y_pred_labels"]
+  },
+  "train": {
+    "batch_size": 64,
+    "metrics": [
+      "accuracy",
+      "f1_weighted",
+      "f1_macro"
+    ],
+    "validation_patience": 10,
+    "val_every_n_epochs": 1,
+    "log_every_n_epochs": 1,
+    "show_examples": false,
+    "evaluation_targets": ["train", "test"],
+    "class_name": "torch_trainer",
+    "tensorboard_log_dir": "{MODEL_PATH}/",
+    "pytest_max_batches": 2
+  },
+  "metadata": {
+    "variables": {
+      "BASE_MODEL": "xlm-roberta-base",
+      "ROOT_PATH": "~/.deeppavlov",
+      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
+      "MODELS_PATH": "{ROOT_PATH}/models",
+      "MODEL_PATH": "{MODELS_PATH}/emotions_classifier/{BASE_MODEL}"
+    },
+    "download": [
+      {
+        "url": "http://files.deeppavlov.ai/v1/classifiers/emotions/emotions_xlm_roberta_base.tar.gz",
+        "subdir": "{MODEL_PATH}"
+      }
+    ]
+  }
+}
diff --git a/deeppavlov/configs/classifiers/insults_xlm_roberta_base.json b/deeppavlov/configs/classifiers/insults_xlm_roberta_base.json
@@ -0,0 +1,99 @@
+{
+  "dataset_reader": {
+    "class_name": "basic_classification_reader",
+    "x": "Comment",
+    "y": "Class",
+    "data_path": "{DOWNLOADS_PATH}/insults_data"
+  },
+  "dataset_iterator": {
+    "class_name": "basic_classification_iterator",
+    "seed": 42
+  },
+  "chainer": {
+    "in": ["x"],
+    "in_y": ["y"],
+    "pipe": [
+      {
+        "class_name": "torch_transformers_preprocessor",
+        "vocab_file": "{BASE_MODEL}",
+        "do_lower_case": false,
+        "max_seq_length": 128,
+        "in": ["x"],
+        "out": ["bert_features"]
+      },
+      {
+        "id": "classes_vocab",
+        "class_name": "simple_vocab",
+        "fit_on": ["y"],
+        "save_path": "{MODEL_PATH}/classes.dict",
+        "load_path": "{MODEL_PATH}/classes.dict",
+        "in": ["y"],
+        "out": ["y_ids"]
+      },
+      {
+        "class_name": "torch_transformers_classifier",
+        "n_classes": "#classes_vocab.len",
+        "return_probas": true,
+        "pretrained_bert": "{BASE_MODEL}",
+        "save_path": "{MODEL_PATH}/model",
+        "load_path": "{MODEL_PATH}/model",
+        "optimizer": "AdamW",
+        "optimizer_parameters": {
+          "lr": 1e-05
+        },
+        "learning_rate_drop_patience": 3,
+        "learning_rate_drop_div": 2.0,
+        "in": ["bert_features"],
+        "in_y": ["y_ids"],
+        "out": ["y_pred_probas"]
+      },
+      {
+        "in": ["y_pred_probas"],
+        "out": ["y_pred_ids"],
+        "class_name": "proba2labels",
+        "max_proba": true
+      },
+      {
+        "in": ["y_pred_ids"],
+        "out": ["y_pred_labels"],
+        "ref": "classes_vocab"
+      }
+    ],
+    "out": ["y_pred_labels"]
+  },
+  "train": {
+    "batch_size": 256,
+    "metrics": [
+      "accuracy",
+      "f1_weighted",
+      "f1_macro"
+    ],
+    "validation_patience": 10,
+    "val_every_n_epochs": 1,
+    "log_every_n_epochs": 1,
+    "show_examples": false,
+    "evaluation_targets": ["train", "valid"],
+    "class_name": "torch_trainer",
+    "tensorboard_log_dir": "{MODEL_PATH}/",
+    "pytest_max_batches": 2
+  },
+  "metadata": {
+    "variables": {
+      "BASE_MODEL": "xlm-roberta-base",
+      "ROOT_PATH": "~/.deeppavlov",
+      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
+      "MODELS_PATH": "{ROOT_PATH}/models",
+      "MODEL_PATH": "{MODELS_PATH}/insults_classifier/{BASE_MODEL}"
+    },
+    "download": [
+      {
+        "url": "http://files.deeppavlov.ai/datasets/insults_data.tar.gz",
+        "subdir": "{DOWNLOADS_PATH}"
+      },
+      {
+        "url": "http://files.deeppavlov.ai/v1/classifiers/insults/insults_xlm_roberta_base.tar.gz",
+        "subdir": "{MODEL_PATH}"
+      }
+    ]
+  }
+}
diff --git a/deeppavlov/configs/classifiers/intents_distilbert_base_multi.json b/deeppavlov/configs/classifiers/intents_distilbert_base_multi.json
@@ -0,0 +1,98 @@
+{
+  "dataset_reader": {
+    "class_name": "huggingface_dataset_reader",
+    "path": "AmazonScience/massive",
+    "name": "all",
+    "train": "train",
+    "valid": "validation",
+    "test": "test"
+  },
+  "dataset_iterator": {
+    "class_name": "huggingface_dataset_iterator",
+    "features": "utt",
+    "label": "intent",
+    "seed": 42
+  },
+  "chainer": {
+    "in": ["x"],
+    "in_y": ["y"],
+    "pipe": [
+      {
+        "class_name": "torch_transformers_preprocessor",
+        "vocab_file": "{BASE_MODEL}",
+        "do_lower_case": false,
+        "max_seq_length": 128,
+        "in": ["x"],
+        "out": ["bert_features"]
+      },
+      {
+        "id": "classes_vocab",
+        "class_name": "simple_vocab",
+        "fit_on": ["y"],
+        "save_path": "{MODEL_PATH}/classes.dict",
+        "load_path": "{MODEL_PATH}/classes.dict",
+        "in": ["y"],
+        "out": ["y_ids"]
+      },
+      {
+        "class_name": "torch_transformers_classifier",
+        "n_classes": "#classes_vocab.len",
+        "return_probas": true,
+        "pretrained_bert": "{BASE_MODEL}",
+        "save_path": "{MODEL_PATH}/model",
+        "load_path": "{MODEL_PATH}/model",
+        "optimizer": "AdamW",
+        "optimizer_parameters": {
+          "lr": 2e-05
+        },
+        "learning_rate_drop_patience": 3,
+        "learning_rate_drop_div": 2.0,
+        "in": ["bert_features"],
+        "in_y": ["y_ids"],
+        "out": ["y_pred_probas"]
+      },
+      {
+        "in": ["y_pred_probas"],
+        "out": ["y_pred_ids"],
+        "class_name": "proba2labels",
+        "max_proba": true
+      },
+      {
+        "in": ["y_pred_ids"],
+        "out": ["y_pred_labels"],
+        "ref": "classes_vocab"
+      }
+    ],
+    "out": ["y_pred_labels"]
+  },
+  "train": {
+    "batch_size": 128,
+    "metrics": [
+      "accuracy",
+      "f1_weighted"
+    ],
+    "validation_patience": 10,
+    "val_every_n_epochs": 1,
+    "log_every_n_epochs": 1,
+    "show_examples": false,
+    "evaluation_targets": ["train", "valid"],
+    "class_name": "torch_trainer",
+    "tensorboard_log_dir": "{MODEL_PATH}/",
+    "pytest_max_batches": 2
+  },
+  "metadata": {
+    "variables": {
+      "BASE_MODEL": "distilbert-base-multilingual-cased",
+      "ROOT_PATH": "~/.deeppavlov",
+      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
+      "MODELS_PATH": "{ROOT_PATH}/models",
+      "MODEL_PATH": "{MODELS_PATH}/intents_classification/{BASE_MODEL}"
+    },
+    "download": [
+      {
+        "url": "http://files.deeppavlov.ai/v1/classifiers/intents/intents_classification_distilbert_multi.tar.gz",
+        "subdir": "{MODEL_PATH}"
+      }
+    ]
+  }
+}