Document magic data loading in TextClassificationProcessor PR #383

Merged: 10 commits, Jun 3, 2020
3 changes: 2 additions & 1 deletion examples/doc_classification.py
@@ -41,7 +41,8 @@ def doc_classifcation():
tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
-# Here we load GermEval 2018 Data.
+# Here we load GermEval 2018 data automatically if it is not available.
+# GermEval 2018 only provides train.tsv and test.tsv splits - no dev.tsv.

label_list = ["OTHER", "OFFENSE"]
metric = "f1_macro"
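For readers skimming the diff, here is roughly what the documented behaviour looks like from the caller's side. A minimal sketch, assuming the values used in the example script; `dev_split` is shown only because GermEval 2018 ships no dev.tsv:

```python
from pathlib import Path

from farm.data_handler.processor import TextClassificationProcessor
from farm.modeling.tokenization import Tokenizer

tokenizer = Tokenizer.load(pretrained_model_name_or_path="bert-base-german-cased",
                           do_lower_case=False)

# "germeval18" matches a key in DOWNSTREAM_TASK_MAP, so if ../data/germeval18
# does not exist yet, the dataset is fetched and unpacked there automatically.
processor = TextClassificationProcessor(tokenizer=tokenizer,
                                        max_seq_len=128,
                                        data_dir=Path("../data/germeval18"),
                                        label_list=["OTHER", "OFFENSE"],
                                        metric="f1_macro",
                                        label_column_name="coarse_label",
                                        dev_split=0.1)  # no dev.tsv, so carve one off train
```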
3 changes: 2 additions & 1 deletion examples/doc_classification_cola.py
@@ -37,7 +37,8 @@ def doc_classification_cola():
tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
-# Here we load Cola 2018 Data.
+# Here we load the CoLA data automatically if it is not available.
+# Unlike GermEval 2018, CoLA provides a dev.tsv split.

label_list = ["0", "1"]
metric = "mcc"
3 changes: 2 additions & 1 deletion examples/doc_classification_crossvalidation.py
@@ -78,7 +78,8 @@ def mymetrics(preds, labels):
metric = 'mymetrics'

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
-# Here we load GermEval 2018 Data.
+# Here we load GermEval 2018 data automatically if it is not available.
+# GermEval 2018 only provides train.tsv and test.tsv splits - no dev.tsv.

# The processor wants to know the possible labels ...
label_list = ["OTHER", "OFFENSE"]
3 changes: 2 additions & 1 deletion examples/doc_classification_custom_optimizer.py
@@ -67,7 +67,8 @@ def doc_classifcation():
do_lower_case=do_lower_case)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
-# Here we load GermEval 2018 Data.
+# Here we load GermEval 2018 data automatically if it is not available.
+# GermEval 2018 only provides train.tsv and test.tsv splits - no dev.tsv.

label_list = ["OTHER", "OFFENSE"]
metric = "f1_macro"
3 changes: 2 additions & 1 deletion examples/doc_classification_fasttext_LM.py
@@ -55,7 +55,8 @@ def doc_classifcation():
tokenizer = Tokenizer.load(pretrained_model_name_or_path=ft_converter.output_path, do_lower_case=do_lower_case)

# 3. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
-# Here we load GermEval 2018 Data.
+# Here we load GermEval 2018 data automatically if it is not available.
+# GermEval 2018 only provides train.tsv and test.tsv splits - no dev.tsv.
label_list = ["OTHER", "OFFENSE"]
metric = "f1_macro"

2 changes: 1 addition & 1 deletion examples/doc_classification_multilabel.py
@@ -40,7 +40,7 @@ def doc_classification_multilabel():
do_lower_case=do_lower_case)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
-# Here we load GermEval 2018 Data.
+# Here we load the Toxic Comments data automatically if it is not available.

label_list = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
metric = "acc"
2 changes: 1 addition & 1 deletion examples/doc_classification_multilabel_roberta.py
@@ -41,7 +41,7 @@ def doc_classification_multilabel_roberta():
)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
-# Here we load GermEval 2018 Data.
+# Here we load the Toxic Comments data automatically if it is not available.

label_list = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
metric = "acc"
3 changes: 2 additions & 1 deletion examples/doc_classification_with_earlystopping.py
@@ -44,7 +44,8 @@ def doc_classification_with_earlystopping():
do_lower_case=do_lower_case)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
-# Here we load GermEval 2018 Data.
+# Here we load GermEval 2018 data automatically if it is not available.
+# GermEval 2018 only provides train.tsv and test.tsv splits - no dev.tsv.

# The processor wants to know the possible labels ...
label_list = ["OTHER", "OFFENSE"]
3 changes: 2 additions & 1 deletion examples/doc_classification_word_embedding_LM.py
@@ -42,7 +42,8 @@ def doc_classifcation():
tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
-# Here we load GermEval 2018 Data.
+# Here we load GermEval 2018 data automatically if it is not available.
+# GermEval 2018 only provides train.tsv and test.tsv splits - no dev.tsv.
label_list = ["OTHER", "OFFENSE"]
metric = "f1_macro"

4 changes: 3 additions & 1 deletion examples/evaluation.py
@@ -26,6 +26,8 @@ def evaluate_classification():
do_lower_case=do_lower_case)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
+# Here we load GermEval 2017 data automatically if it is not available.

processor = TextClassificationProcessor(
tokenizer=tokenizer,
max_seq_len=384,
@@ -128,4 +130,4 @@ def evaluate_question_answering():

if __name__ == "__main__":
#evaluate_classification()
-evaluate_question_answering()
\ No newline at end of file
+evaluate_question_answering()
36 changes: 30 additions & 6 deletions farm/data_handler/processor.py
@@ -389,7 +389,11 @@ def __init__(
:param tokenizer: Used to split a sentence (str) into tokens.
:param max_seq_len: Samples are truncated after this many tokens.
:type max_seq_len: int
-:param data_dir: The directory in which the train and dev files can be found. Squad has a private test file
+:param data_dir: The directory in which the train and dev files can be found.
+                 If the data is not available there, it is downloaded automatically,
+                 provided the last directory in the path has the same name as a predefined dataset.
+                 The predefined dataset names are the keys of the dict at
+                 `farm.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/FARM/blob/master/farm/data_handler/utils.py>`_.
:type data_dir: str
:param label_list: list of labels to predict (strings). For most cases this should be: ["start_token", "end_token"]
:type label_list: list
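The docstring states the rule in prose; the sketch below shows roughly how such a lookup can work. The helper `maybe_download` and its body are illustrative, not FARM's actual internals (which live in `farm.data_handler.utils`):

```python
import os
import tarfile
import urllib.request

# Trimmed copy of the map defined in farm/data_handler/utils.py.
DOWNSTREAM_TASK_MAP = {
    "germeval18": "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-downstream/germeval18.tar.gz",
}

def maybe_download(data_dir):
    """Illustrative helper: fetch a predefined dataset if data_dir is missing."""
    # The last component of the path doubles as the dataset name.
    task_name = os.path.basename(os.path.normpath(data_dir))
    if os.path.exists(data_dir) or task_name not in DOWNSTREAM_TASK_MAP:
        return data_dir
    parent = os.path.dirname(os.path.normpath(data_dir)) or "."
    os.makedirs(parent, exist_ok=True)
    archive_path, _ = urllib.request.urlretrieve(DOWNSTREAM_TASK_MAP[task_name])
    with tarfile.open(archive_path) as archive:
        # Assumes the tarball unpacks into a "<task_name>/" directory.
        archive.extractall(parent)
    return data_dir
```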
@@ -640,7 +644,11 @@ def __init__(
:param tokenizer: Used to split a sentence (str) into tokens.
:param max_seq_len: Samples are truncated after this many tokens.
:type max_seq_len: int
-:param data_dir: The directory in which the train and dev files can be found. Squad has a private test file
+:param data_dir: The directory in which the train and dev files can be found.
+                 If the data is not available there, it is downloaded automatically,
+                 provided the last directory in the path has the same name as a predefined dataset.
+                 The predefined dataset names are the keys of the dict at
+                 `farm.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/FARM/blob/master/farm/data_handler/utils.py>`_.
:type data_dir: str
:param label_list: list of labels to predict (strings). For most cases this should be: ["start_token", "end_token"]
:type label_list: list
@@ -740,7 +748,11 @@ def __init__(
:param tokenizer: Used to split a sentence (str) into tokens.
:param max_seq_len: Samples are truncated after this many tokens.
:type max_seq_len: int
-:param data_dir: The directory in which the train and dev files can be found. Squad has a private test file
+:param data_dir: The directory in which the train and dev files can be found.
+                 If the data is not available there, it is downloaded automatically,
+                 provided the last directory in the path has the same name as a predefined dataset.
+                 The predefined dataset names are the keys of the dict at
+                 `farm.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/FARM/blob/master/farm/data_handler/utils.py>`_.
:type data_dir: str
:param label_list: list of labels to predict (strings). For most cases this should be: ["start_token", "end_token"]
:type label_list: list
@@ -981,7 +993,11 @@ def __init__(
:param tokenizer: Used to split a sentence (str) into tokens.
:param max_seq_len: Samples are truncated after this many tokens.
:type max_seq_len: int
-:param data_dir: The directory in which the train and dev files can be found. Squad has a private test file
+:param data_dir: The directory in which the train and dev files can be found.
+                 If the data is not available there, it is downloaded automatically,
+                 provided the last directory in the path has the same name as a predefined dataset.
+                 The predefined dataset names are the keys of the dict at
+                 `farm.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/FARM/blob/master/farm/data_handler/utils.py>`_.
:type data_dir: str
:param label_list: list of labels to predict (strings). For most cases this should be: ["start_token", "end_token"]
:type label_list: list
@@ -1167,7 +1183,11 @@ def __init__(
:param tokenizer: Used to split a sentence (str) into tokens.
:param max_seq_len: Samples are truncated after this many tokens.
:type max_seq_len: int
-:param data_dir: The directory in which the train and dev files can be found. Squad has a private test file
+:param data_dir: The directory in which the train and dev files can be found.
+                 If the data is not available there, it is downloaded automatically,
+                 provided the last directory in the path has the same name as a predefined dataset.
+                 The predefined dataset names are the keys of the dict at
+                 `farm.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/FARM/blob/master/farm/data_handler/utils.py>`_.
:type data_dir: str
:param train_filename: The name of the file containing training data.
:type train_filename: str
@@ -1514,7 +1534,11 @@ def __init__(
:param tokenizer: Used to split a sentence (str) into tokens.
:param max_seq_len: Samples are truncated after this many tokens.
:type max_seq_len: int
-:param data_dir: The directory in which the train and dev files can be found. Squad has a private test file
+:param data_dir: The directory in which the train and dev files can be found.
+                 If the data is not available there, it is downloaded automatically,
+                 provided the last directory in the path has the same name as a predefined dataset.
+                 The predefined dataset names are the keys of the dict at
+                 `farm.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/FARM/blob/master/farm/data_handler/utils.py>`_.
:type data_dir: str
:param label_list: list of labels to predict (strings). For most cases this should be: ["start_token", "end_token"]
:type label_list: list
3 changes: 3 additions & 0 deletions farm/data_handler/utils.py
@@ -22,7 +22,10 @@
DOWNSTREAM_TASK_MAP = {
"gnad": "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-downstream/gnad.tar.gz",
"germeval14": "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-downstream/germeval14.tar.gz",

+# germeval18 only provides train.tsv and test.tsv splits - no dev.tsv
"germeval18": "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-downstream/germeval18.tar.gz",

"squad20": "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-downstream/squad20.tar.gz",
"conll03detrain": "https://raw.githubusercontent.com/MaviccPRP/ger_ner_evals/master/corpora/conll2003/deu.train",
"conll03dedev": "https://raw.githubusercontent.com/MaviccPRP/ger_ner_evals/master/corpora/conll2003/deu.testa", #https://www.clips.uantwerpen.be/conll2003/ner/000README says testa is dev data
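Because the map is a plain module-level dict, extending the magic loading to another corpus is just one more entry whose key matches the final directory name passed as `data_dir`. A hypothetical example (placeholder URL, not a real dataset):

```python
# A processor pointed at e.g. data_dir="../data/mycorpus" would then
# download and unpack this archive on first use.
DOWNSTREAM_TASK_MAP["mycorpus"] = "https://example.com/datasets/mycorpus.tar.gz"
```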