In [None]:
%%capture
import os

from class_sub_task_labelisation import SubTaskLabelisator
from class_nllfg_training import NLLFGeneratorTraining
from class_nllf_generation import NLLFGeneratorInAction
from class_nllfg_integration import NLLFIntergration

# Plug and play

## Step 1: Zero-shot Sub-task Labelisation

Create a labelisator

In [None]:
# Build the zero-shot sub-task labelisator from the BSQ dictionary and the
# training dataset.
labelisator = SubTaskLabelisator(
    # OpenAI API key (https://platform.openai.com/account/api-keys).
    # Read from the OPENAI_API_KEY environment variable so the secret is never
    # stored in the notebook; the placeholder is only used when it is unset.
    api_key=os.environ.get("OPENAI_API_KEY", "<OPENAI-API-KEY>"),
    # File name of your JSON with BSQs
    file_name_dict_bsqs="data/dict_bsqs.json",
    # File name of your .xlsx Training Dataset
    file_name_data_train="data/data_train.xlsx",
    # Column name of your text-to-classify
    sentence_col_name="abstract",
    # Sample size for zero-shot labelisation:
    # integer number, or fraction between 0 and 1
    sample_size=100,
    # Random seed
    seed=42,
)

Run the labelisator

In [None]:
%%capture
# Run the zero-shot labelisation and store the resulting weak-labels.
# NOTE: this cell starts with %%capture, so the status printed because of
# `verbose=True` is not displayed in the notebook.
labelisator.run_labeling(
    # Root folder to save the weak-labels
    root_labels="01_labels",
    # Temperature of GPT-3.5-turbo
    temp=0,
    # Max. number of output-tokens
    max_t=5,
    # Print status
    verbose=True,
)

## Step 2: Training of NLLF Generator

Create the training object

In [None]:
# Set up the NLLF generator training from the BSQs and the weak-labels
# stored under `root_labels`.
training = NLLFGeneratorTraining(
    # File name of your JSON with BSQs
    file_name_dict_bsqs="data/dict_bsqs.json",
    # Root folder of the weak-labels
    root_labels="01_labels",
    # Column name of your text-to-classify
    sentence_col_name="abstract",
    # Base model name for your NLLF generator
    # (this version: only for BERT models from HuggingFace)
    model_name="bert-base-uncased",
    # Max. number of tokens for your tokenized text-to-classify
    maxlen_s=489,
    # Max. number of tokens for your tokenized BSQs
    maxlen_bsq=20,
    # Batch size for the training
    batch_size=8,
)

Train the generator

In [None]:
# Train the NLLF generator.
training.train(
    # Number of epochs for the training
    epochs=5,
    # Learning rate for the training
    lr=2e-5,
    # Print status
    verbose=True,
)

Save trained model

In [None]:
%%capture
# Save the trained NLLF generator to the given Hugging Face repository.
training.save(
    # Hugging Face User Access Token (https://huggingface.co/settings/tokens).
    # Read from the HF_TOKEN environment variable so the secret is never
    # stored in the notebook; the placeholder is only used when it is unset.
    hf_token=os.environ.get("HF_TOKEN", "<HF-TOKEN>"),
    # Repo. name for your NLLF generator
    repo_name="example_juke",
    # Hugging Face Username
    username="<HF-USERNAME>",
)

## Step 3.1: NLLF Generation

Prepare generator

In [None]:
%%capture
# Prepare the NLLF generator for the new BSQs and the train/val/test datasets.
generator = NLLFGeneratorInAction(
    # File name of your JSON with new BSQs
    file_name_new_dict_bsqs="data/new_dict_bsqs.json",
    # Max. number of tokens for your tokenized text-to-classify
    maxlen_s=489,
    # Max. number of tokens for your tokenized BSQs
    maxlen_bsq=20,
    # Hugging Face Username
    username="<HF-USERNAME>",
    # Repo. name for your NLLF generator
    repo_name="example_juke",
    # File name of your .xlsx Training Dataset
    file_name_data_train="data/data_train.xlsx",
    # File name of your .xlsx Validation Dataset
    file_name_data_val="data/data_val.xlsx",
    # File name of your .xlsx Testing Dataset
    file_name_data_test="data/data_test.xlsx",
    # Column name of your text-to-classify
    sentence_col_name="abstract",
)

Apply generator

In [None]:
%%capture
# Apply the generator and store the NLL features.
# NOTE: this cell starts with %%capture, so the status printed because of
# `verbose=True` is not displayed in the notebook.
generator.apply(
    # Root folder to save the NLL (Natural Language Learned) features
    root_labels="02_labels",
    # Print status
    verbose=True,
)

## Step 3.2: NLLF Integration

Prepare integrator

In [None]:
# Prepare the integrator that combines the NLL features with a Decision Tree.
integrator = NLLFIntergration(
    # Root folder of the NLL (Natural Language Learned) features
    root_labels="02_labels",
    # File name of your .txt with support NLLF
    file_name_support="data/support.txt",
    # Column name of your task-label
    label_col_name="label",
    # Max. depth for the Decision Tree (DT)
    dt_max_depth=5,
)

Save the Decision Tree (DT) and the predictions of the integrator

In [None]:
%%capture
# Store the integrator's predictions and model parameters.
integrator.save_predict(
    # Root folder for predictions and model parameters
    root_labels="03_model_predictions",
)