Document magic data loading in TextClassificationProcessor PR #383

Merged: 10 commits, Jun 3, 2020
3 changes: 2 additions & 1 deletion examples/doc_classification.py
@@ -41,7 +41,8 @@ def doc_classifcation():
tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
-# Here we load GermEval 2018 Data.
+# Here we load GermEval 2018 data automatically if it is not available.
+# GermEval 2018 only provides train.tsv and test.tsv splits - no dev.tsv.

label_list = ["OTHER", "OFFENSE"]
metric = "f1_macro"
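For readers skimming the diff, here is roughly what the documented behaviour looks like from the caller's side. A minimal sketch, assuming the values used in the example script; `dev_split` is shown only because GermEval 2018 ships no dev.tsv:

```python
from pathlib import Path

from farm.data_handler.processor import TextClassificationProcessor
from farm.modeling.tokenization import Tokenizer

tokenizer = Tokenizer.load(pretrained_model_name_or_path="bert-base-german-cased",
                           do_lower_case=False)

# "germeval18" matches a key in DOWNSTREAM_TASK_MAP, so if ../data/germeval18
# does not exist yet, the dataset is fetched and unpacked there automatically.
processor = TextClassificationProcessor(tokenizer=tokenizer,
                                        max_seq_len=128,
                                        data_dir=Path("../data/germeval18"),
                                        label_list=["OTHER", "OFFENSE"],
                                        metric="f1_macro",
                                        label_column_name="coarse_label",
                                        dev_split=0.1)  # no dev.tsv, so carve one off train
```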
3 changes: 2 additions & 1 deletion examples/doc_classification_cola.py
@@ -37,7 +37,8 @@ def doc_classification_cola():
tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
-# Here we load Cola 2018 Data.
+# Here we load the CoLA data automatically if it is not available.
+# Unlike GermEval 2018, CoLA provides a dev.tsv split.

label_list = ["0", "1"]
metric = "mcc"
3 changes: 2 additions & 1 deletion examples/doc_classification_crossvalidation.py
@@ -78,7 +78,8 @@ def mymetrics(preds, labels):
metric = 'mymetrics'

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
-# Here we load GermEval 2018 Data.
+# Here we load GermEval 2018 data automatically if it is not available.
+# GermEval 2018 only provides train.tsv and test.tsv splits - no dev.tsv.

# The processor wants to know the possible labels ...
label_list = ["OTHER", "OFFENSE"]
3 changes: 2 additions & 1 deletion examples/doc_classification_custom_optimizer.py
@@ -67,7 +67,8 @@ def doc_classifcation():
do_lower_case=do_lower_case)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
-# Here we load GermEval 2018 Data.
+# Here we load GermEval 2018 data automatically if it is not available.
+# GermEval 2018 only provides train.tsv and test.tsv splits - no dev.tsv.

label_list = ["OTHER", "OFFENSE"]
metric = "f1_macro"
3 changes: 2 additions & 1 deletion examples/doc_classification_fasttext_LM.py
@@ -55,7 +55,8 @@ def doc_classifcation():
tokenizer = Tokenizer.load(pretrained_model_name_or_path=ft_converter.output_path, do_lower_case=do_lower_case)

# 3. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
-# Here we load GermEval 2018 Data.
+# Here we load GermEval 2018 data automatically if it is not available.
+# GermEval 2018 only provides train.tsv and test.tsv splits - no dev.tsv.
label_list = ["OTHER", "OFFENSE"]
metric = "f1_macro"

2 changes: 1 addition & 1 deletion examples/doc_classification_multilabel.py
@@ -40,7 +40,7 @@ def doc_classification_multilabel():
do_lower_case=do_lower_case)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
-# Here we load GermEval 2018 Data.
+# Here we load the Toxic Comments data automatically if it is not available.

label_list = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
metric = "acc"
2 changes: 1 addition & 1 deletion examples/doc_classification_multilabel_roberta.py
@@ -41,7 +41,7 @@ def doc_classification_multilabel_roberta():
)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
-# Here we load GermEval 2018 Data.
+# Here we load the Toxic Comments data automatically if it is not available.

label_list = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
metric = "acc"
3 changes: 2 additions & 1 deletion examples/doc_classification_with_earlystopping.py
@@ -44,7 +44,8 @@ def doc_classification_with_earlystopping():
do_lower_case=do_lower_case)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
-# Here we load GermEval 2018 Data.
+# Here we load GermEval 2018 data automatically if it is not available.
+# GermEval 2018 only provides train.tsv and test.tsv splits - no dev.tsv.

# The processor wants to know the possible labels ...
label_list = ["OTHER", "OFFENSE"]
3 changes: 2 additions & 1 deletion examples/doc_classification_word_embedding_LM.py
@@ -42,7 +42,8 @@ def doc_classifcation():
tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
-# Here we load GermEval 2018 Data.
+# Here we load GermEval 2018 data automatically if it is not available.
+# GermEval 2018 only provides train.tsv and test.tsv splits - no dev.tsv.
label_list = ["OTHER", "OFFENSE"]
metric = "f1_macro"

4 changes: 3 additions & 1 deletion examples/evaluation.py
@@ -26,6 +26,8 @@ def evaluate_classification():
do_lower_case=do_lower_case)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
+# Here we load GermEval 2017 data automatically if it is not available.

processor = TextClassificationProcessor(
tokenizer=tokenizer,
max_seq_len=384,
@@ -128,4 +130,4 @@ def evaluate_question_answering():

if __name__ == "__main__":
#evaluate_classification()
-evaluate_question_answering()
\ No newline at end of file
+evaluate_question_answering()
36 changes: 30 additions & 6 deletions farm/data_handler/processor.py
@@ -389,7 +389,11 @@ def __init__(
:param tokenizer: Used to split a sentence (str) into tokens.
:param max_seq_len: Samples are truncated after this many tokens.
:type max_seq_len: int
-:param data_dir: The directory in which the train and dev files can be found. Squad has a private test file
+:param data_dir: The directory in which the train and dev files can be found.
+                 If the data is not available there, it is downloaded automatically,
+                 provided the last directory in the path has the same name as a predefined dataset.
+                 The predefined dataset names are the keys of the dict at
+                 `farm.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/FARM/blob/master/farm/data_handler/utils.py>`_.
:type data_dir: str
:param label_list: list of labels to predict (strings). For most cases this should be: ["start_token", "end_token"]
:type label_list: list
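The docstring states the rule in prose; the sketch below shows roughly how such a lookup can work. The helper `maybe_download` and its body are illustrative, not FARM's actual internals (which live in `farm.data_handler.utils`):

```python
import os
import tarfile
import urllib.request

# Trimmed copy of the map defined in farm/data_handler/utils.py.
DOWNSTREAM_TASK_MAP = {
    "germeval18": "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-downstream/germeval18.tar.gz",
}

def maybe_download(data_dir):
    """Illustrative helper: fetch a predefined dataset if data_dir is missing."""
    # The last component of the path doubles as the dataset name.
    task_name = os.path.basename(os.path.normpath(data_dir))
    if os.path.exists(data_dir) or task_name not in DOWNSTREAM_TASK_MAP:
        return data_dir
    parent = os.path.dirname(os.path.normpath(data_dir)) or "."
    os.makedirs(parent, exist_ok=True)
    archive_path, _ = urllib.request.urlretrieve(DOWNSTREAM_TASK_MAP[task_name])
    with tarfile.open(archive_path) as archive:
        # Assumes the tarball unpacks into a "<task_name>/" directory.
        archive.extractall(parent)
    return data_dir
```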
@@ -640,7 +644,11 @@ def __init__(
:param tokenizer: Used to split a sentence (str) into tokens.
:param max_seq_len: Samples are truncated after this many tokens.
:type max_seq_len: int
-:param data_dir: The directory in which the train and dev files can be found. Squad has a private test file
+:param data_dir: The directory in which the train and dev files can be found.
+                 If the data is not available there, it is downloaded automatically,
+                 provided the last directory in the path has the same name as a predefined dataset.
+                 The predefined dataset names are the keys of the dict at
+                 `farm.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/FARM/blob/master/farm/data_handler/utils.py>`_.
:type data_dir: str
:param label_list: list of labels to predict (strings). For most cases this should be: ["start_token", "end_token"]
:type label_list: list
@@ -740,7 +748,11 @@ def __init__(
:param tokenizer: Used to split a sentence (str) into tokens.
:param max_seq_len: Samples are truncated after this many tokens.
:type max_seq_len: int
-:param data_dir: The directory in which the train and dev files can be found. Squad has a private test file
+:param data_dir: The directory in which the train and dev files can be found.
+                 If the data is not available there, it is downloaded automatically,
+                 provided the last directory in the path has the same name as a predefined dataset.
+                 The predefined dataset names are the keys of the dict at
+                 `farm.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/FARM/blob/master/farm/data_handler/utils.py>`_.
:type data_dir: str
:param label_list: list of labels to predict (strings). For most cases this should be: ["start_token", "end_token"]
:type label_list: list
@@ -981,7 +993,11 @@ def __init__(
:param tokenizer: Used to split a sentence (str) into tokens.
:param max_seq_len: Samples are truncated after this many tokens.
:type max_seq_len: int
-:param data_dir: The directory in which the train and dev files can be found. Squad has a private test file
+:param data_dir: The directory in which the train and dev files can be found.
+                 If the data is not available there, it is downloaded automatically,
+                 provided the last directory in the path has the same name as a predefined dataset.
+                 The predefined dataset names are the keys of the dict at
+                 `farm.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/FARM/blob/master/farm/data_handler/utils.py>`_.
:type data_dir: str
:param label_list: list of labels to predict (strings). For most cases this should be: ["start_token", "end_token"]
:type label_list: list
@@ -1167,7 +1183,11 @@ def __init__(
:param tokenizer: Used to split a sentence (str) into tokens.
:param max_seq_len: Samples are truncated after this many tokens.
:type max_seq_len: int
-:param data_dir: The directory in which the train and dev files can be found. Squad has a private test file
+:param data_dir: The directory in which the train and dev files can be found.
+                 If the data is not available there, it is downloaded automatically,
+                 provided the last directory in the path has the same name as a predefined dataset.
+                 The predefined dataset names are the keys of the dict at
+                 `farm.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/FARM/blob/master/farm/data_handler/utils.py>`_.
:type data_dir: str
:param train_filename: The name of the file containing training data.
:type train_filename: str
@@ -1514,7 +1534,11 @@ def __init__(
:param tokenizer: Used to split a sentence (str) into tokens.
:param max_seq_len: Samples are truncated after this many tokens.
:type max_seq_len: int
-:param data_dir: The directory in which the train and dev files can be found. Squad has a private test file
+:param data_dir: The directory in which the train and dev files can be found.
+                 If the data is not available there, it is downloaded automatically,
+                 provided the last directory in the path has the same name as a predefined dataset.
+                 The predefined dataset names are the keys of the dict at
+                 `farm.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/FARM/blob/master/farm/data_handler/utils.py>`_.
:type data_dir: str
:param label_list: list of labels to predict (strings). For most cases this should be: ["start_token", "end_token"]
:type label_list: list
3 changes: 3 additions & 0 deletions farm/data_handler/utils.py
@@ -22,7 +22,10 @@
DOWNSTREAM_TASK_MAP = {
"gnad": "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-downstream/gnad.tar.gz",
"germeval14": "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-downstream/germeval14.tar.gz",

+# germeval18 only provides train.tsv and test.tsv splits - no dev.tsv
"germeval18": "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-downstream/germeval18.tar.gz",

"squad20": "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-downstream/squad20.tar.gz",
"conll03detrain": "https://raw.githubusercontent.com/MaviccPRP/ger_ner_evals/master/corpora/conll2003/deu.train",
"conll03dedev": "https://raw.githubusercontent.com/MaviccPRP/ger_ner_evals/master/corpora/conll2003/deu.testa", #https://www.clips.uantwerpen.be/conll2003/ner/000README says testa is dev data
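Because the map is a plain module-level dict, extending the magic loading to another corpus is just one more entry whose key matches the final directory name passed as `data_dir`. A hypothetical example (placeholder URL, not a real dataset):

```python
# A processor pointed at e.g. data_dir="../data/mycorpus" would then
# download and unpack this archive on first use.
DOWNSTREAM_TASK_MAP["mycorpus"] = "https://example.com/datasets/mycorpus.tar.gz"
```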