Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support FARM on Windows OS #172

Merged
merged 5 commits into from
Jan 16, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
198 changes: 103 additions & 95 deletions examples/doc_classification.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# fmt: off
import logging
from pathlib import Path

from farm.data_handler.data_silo import DataSilo
from farm.data_handler.processor import TextClassificationProcessor
Expand All @@ -12,100 +13,107 @@
from farm.train import Trainer
from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings

logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO)

ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_classification")

##########################
########## Settings
##########################
set_all_seeds(seed=42)
n_epochs = 1
batch_size = 32
evaluate_every = 100
lang_model = "bert-base-german-cased"
use_amp = None

device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

# 1.Create a tokenizer
tokenizer = Tokenizer.load(
pretrained_model_name_or_path=lang_model,
do_lower_case=False)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
# Here we load GermEval 2018 Data.

label_list = ["OTHER", "OFFENSE"]
metric = "f1_macro"

processor = TextClassificationProcessor(tokenizer=tokenizer,
max_seq_len=128,
data_dir="../data/germeval18",
label_list=label_list,
metric=metric,
label_column_name="coarse_label"
)

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(
processor=processor,
batch_size=batch_size)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)
# b) and a prediction head on top that is suited for our task => Text classification
prediction_head = TextClassificationHead(layer_dims=[768, len(processor.tasks["text_classification"]["label_list"])],
class_weights=data_silo.calculate_class_weights(task_name="text_classification"))



model = AdaptiveModel(
language_model=language_model,
prediction_heads=[prediction_head],
embeds_dropout_prob=0.1,
lm_output_types=["per_sequence"],
device=device)

# 5. Create an optimizer
model, optimizer, lr_schedule = initialize_optimizer(
model=model,
learning_rate=2e-5,
device=device,
n_batches=len(data_silo.loaders["train"]),
n_epochs=n_epochs,
use_amp=use_amp)

# 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time
trainer = Trainer(
optimizer=optimizer,
data_silo=data_silo,
epochs=n_epochs,
n_gpu=n_gpu,
lr_schedule=lr_schedule,
evaluate_every=evaluate_every,
device=device,)

# 7. Let it grow
model = trainer.train(model)

# 8. Hooray! You have a model. Store it:
save_dir = "saved_models/bert-german-doc-tutorial"
model.save(save_dir)
processor.save(save_dir)

# 9. Load it & harvest your fruits (Inference)
basic_texts = [
{"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
{"text": "Martin Müller spielt Handball in Berlin"},
]
model = Inferencer.load(save_dir)
result = model.inference_from_dicts(dicts=basic_texts)
print(result)

def doc_classifcation():
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO)

ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_classification")

##########################
########## Settings
##########################
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
n_epochs = 1
batch_size = 32
evaluate_every = 100
lang_model = "bert-base-german-cased"
use_amp = None

device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

# 1.Create a tokenizer
tokenizer = Tokenizer.load(
pretrained_model_name_or_path=lang_model,
do_lower_case=False)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
# Here we load GermEval 2018 Data.

label_list = ["OTHER", "OFFENSE"]
metric = "f1_macro"

processor = TextClassificationProcessor(tokenizer=tokenizer,
max_seq_len=128,
data_dir=Path("../data/germeval18"),
label_list=label_list,
metric=metric,
label_column_name="coarse_label"
)

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a
# few descriptive statistics of our datasets
data_silo = DataSilo(
processor=processor,
batch_size=batch_size)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)
# b) and a prediction head on top that is suited for our task => Text classification
prediction_head = TextClassificationHead(
layer_dims=[768, len(processor.tasks["text_classification"]["label_list"])],
class_weights=data_silo.calculate_class_weights(task_name="text_classification"))

model = AdaptiveModel(
language_model=language_model,
prediction_heads=[prediction_head],
embeds_dropout_prob=0.1,
lm_output_types=["per_sequence"],
device=device)

# 5. Create an optimizer
model, optimizer, lr_schedule = initialize_optimizer(
model=model,
learning_rate=2e-5,
device=device,
n_batches=len(data_silo.loaders["train"]),
n_epochs=n_epochs,
use_amp=use_amp)

# 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time
trainer = Trainer(
optimizer=optimizer,
data_silo=data_silo,
epochs=n_epochs,
n_gpu=n_gpu,
lr_schedule=lr_schedule,
evaluate_every=evaluate_every,
device=device)

# 7. Let it grow
model = trainer.train(model)

# 8. Hooray! You have a model. Store it:
save_dir = Path("saved_models/bert-german-doc-tutorial")
model.save(save_dir)
processor.save(save_dir)

# 9. Load it & harvest your fruits (Inference)
basic_texts = [
{"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
{"text": "Martin Müller spielt Handball in Berlin"},
]
model = Inferencer.load(save_dir)
result = model.inference_from_dicts(dicts=basic_texts)
print(result)


if __name__ == "__main__":
doc_classifcation()

# fmt: on
Loading