In [None]:
# Install the simpletransformers package into the notebook environment.
!pip install simpletransformers 

Run the following cell, as the simpletransformers package requires NVIDIA Apex.

In [None]:
!git clone https://github.com/NVIDIA/apex
!cd apex
!pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" /content/apex

In [None]:
from google.colab import files
# Open Colab's file-upload prompt; the data-loading cell further down reads
# 'ssoctrain_updated.csv', so that file is presumably what should be uploaded here.
files.upload()

Import the cleaned SSOC data and convert the .csv into a pandas DataFrame with the correct labels to be fed into the model. We removed minor group 336 from the DataFrame because the input data only had 2 observations for it, which would be inadequate for the subsequent splitting. We encourage the user to inspect the .csv before removing minor groups with limited observations.

In [None]:
import pandas as pd
import torch
import io
import logging
import wandb
from simpletransformers.classification import ClassificationArgs, ClassificationModel
from statistics import mean, mode
from sklearn.metrics import accuracy_score

# Load the cleaned SSOC data and keep only the occupation description and its
# code, renamed to the `text` / `labels` columns used by the rest of the notebook.
ssoc_data = pd.read_csv('ssoctrain_updated.csv',encoding='iso-8859-1')
ssoc_data = ssoc_data[["E_OCC_Desc","E_OCC"]]
ssoc_data = ssoc_data.rename(columns={'E_OCC_Desc':'text','E_OCC':'labels'})

#remove minor group 336 because only 2 entries
trunc_indices = ssoc_data.index[ssoc_data['labels'] == 336].tolist()
ssoc_data_trunc = ssoc_data.drop(trunc_indices)

# Map every remaining label to a contiguous integer id (0, 1, 2, ...) assigned
# in sorted label order. `dummy` is the {original label: id} lookup.
dummy = {label: index for index, label in enumerate(sorted(ssoc_data_trunc['labels'].unique()))}

# Re-encode only the `labels` column; the `text` column is left untouched.
new_ssoc_data_trunc = ssoc_data_trunc.assign(labels=ssoc_data_trunc['labels'].map(dummy))

We split the cleaned data into training, evaluation and testing sets.

In [None]:
from sklearn.model_selection import train_test_split
# First split: hold out 20% of all rows as the test set, stratified by label.
x, y = new_ssoc_data_trunc['text'], new_ssoc_data_trunc['labels']
train_x, test_x, train_y, test_y = train_test_split(
    x, y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)
test_data = pd.concat([test_x, test_y], axis=1)
train1_data = pd.concat([train_x, train_y], axis=1)

# Second split: 25% of the remaining rows become the evaluation set,
# again stratified by label; the rest is the final training set.
var1, var2 = train1_data['text'], train1_data['labels']
train_train_x, eval_train_x, train_train_y, eval_train_y = train_test_split(
    var1, var2,
    test_size=0.25,
    random_state=1,
    stratify=var2,
)
eval_data = pd.concat([eval_train_x, eval_train_y], axis=1)
train_data = pd.concat([train_train_x, train_train_y], axis=1)

This is the code to train the model and then evaluate it on the test data.

In [None]:
# Log at INFO level in general, but keep the "transformers" logger at WARNING.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Model configuration: start from the defaults (with sliding_window enabled)
# and apply the overrides below one attribute at a time.
model_args = ClassificationArgs(sliding_window = True)
for option_name, option_value in {
    # Training setup
    "num_train_epochs": 5,
    "learning_rate": 1e-4,
    "train_batch_size": 64,
    "overwrite_output_dir": True,
    "reprocess_input_data": True,
    "no_save": True,
    # Early stopping, monitored on "mcc" (not minimised, i.e. higher is better)
    "use_early_stopping": True,
    "early_stopping_delta": 0.01,
    "early_stopping_metric": "mcc",
    "early_stopping_metric_minimize": False,
    "early_stopping_patience": 5,
    # Evaluation during training
    "evaluate_during_training_steps": 1000,
    "evaluate_during_training": True,
}.items():
    setattr(model_args, option_name, option_value)

# Create the ELECTRA classifier with 66 output labels.
model = ClassificationModel(
    "electra",
    "google/electra-base-discriminator",
    num_labels=66,
    args=model_args,
)

# Train on the training split, evaluating on the evaluation split, and report
# an extra "accuracy" metric computed from the rounded predictions.
model.train_model(
    train_data,
    eval_df=eval_data,
    accuracy=lambda truth, predictions: accuracy_score(truth, [round(p) for p in predictions]),
)

# Evaluate the trained model on the held-out test split.
result, model_outputs, wrong_predictions = model.eval_model(test_data, verbose=True)

We make use of the `wandb` library for visualization and fine-tuning of the model. The following cell authorizes wandb in this notebook. Should you wish to use wandb, you need to create an account on their webpage.

In [None]:
# Authorize wandb for this notebook session.
wandb.login()

This code is needed to prevent errors in the subsequent fine-tuning

In [None]:
from torch.multiprocessing import Pool, Process, set_start_method
# Switch torch's multiprocessing start method to 'spawn'. force=True asks for
# the method to be replaced even if one has already been chosen; any
# RuntimeError that is still raised is deliberately ignored (best effort).
try:
    torch.multiprocessing.set_start_method('spawn', force=True)
except RuntimeError:
    pass

In [None]:
# Sweep definition: Bayesian search that maximises "accuracy" over the number
# of training epochs and the learning rate.
sweep_config = {
    "name":"sweep-test",
    "method": "bayes",  # alternatives: grid, random
    "metric": {"name": "accuracy", "goal": "maximize"},
    "parameters": {
        "num_train_epochs": {"min":1,"max":3},
        "learning_rate": {"min": 0, "max": 1e-4},
    },
    # W&B Sweeps can also speed up the hyperparameter optimization by
    # terminating any poorly performing runs.
    "early_terminate": {"type": "hyperband", "min_iter": 6,}
}

# Register the sweep with W&B and keep its id for the agent.
sweep_id = wandb.sweep(sweep_config, project="ELECTRA_Hyperparameter_Optimization")

# Log at INFO level in general, but keep the "transformers" logger at WARNING.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Base arguments shared by every sweep run: start from the defaults and apply
# the overrides below one attribute at a time.
model_args = ClassificationArgs()
for option_name, option_value in {
    "eval_batch_size": 8,
    "evaluate_during_training": True,
    "evaluate_during_training_silent": False,
    "evaluate_during_training_steps": 1000,
    "learning_rate": 4e-4,
    "manual_seed": 4,
    "max_seq_length": 256,
    "multiprocessing_chunksize": 5000,
    "no_cache": True,
    "no_save": True,
    "num_train_epochs": 3,
    "overwrite_output_dir": True,
    "reprocess_input_data": True,
    "train_batch_size": 16,
    "gradient_accumulation_steps": 2,
    "train_custom_parameters_only": False,
    "wandb_project": "ELECTRA_Hyperparameter_Optimization",
}.items():
    setattr(model_args, option_name, option_value)

def train():
    """Run a single sweep trial.

    Starts a wandb run, builds an ELECTRA classifier from the shared
    `model_args` plus the run's sweep configuration, trains it on
    `train_data` (evaluating on `eval_data`), evaluates it on `test_data`,
    and then closes the wandb run.
    """
    # Initialize a new wandb run
    wandb.init()
    # The sweep defines `num_train_epochs` (there is no `epochs` parameter),
    # so that is the key to read from the run's config.
    print("HyperParams=>>", wandb.config.num_train_epochs)

    # Create the classification model. num_labels matches the 66-label model
    # built earlier in the notebook for the same data.
    model = ClassificationModel(
        "electra",
        "google/electra-base-discriminator",
        num_labels=66,
        use_cuda = True,
        args=model_args,
        sweep_config=wandb.config,
    )

    # Train the model, reporting "accuracy" (the metric the sweep optimises)
    model.train_model(train_data, eval_df=eval_data, 
                      accuracy=lambda truth, predictions: accuracy_score(truth, [round(p) for p in predictions]), 
    )

    # Evaluate the model
    model.eval_model(test_data,verbose=True)

    # Sync wandb
    wandb.join()


# Start a wandb agent for the sweep registered above, using `train` as the
# function it runs for each trial.
wandb.agent(sweep_id, train)

As Google allocates a limited amount of GPU resources, we often run into CUDA errors when running multiple ML training jobs. As such, we force-restart the notebook with the following 2 code cells.

In [None]:
# List the running processes and keep only the lines that mention "python".
!ps -aux|grep python

Select the 3-digit number (the process ID) which represents the .ipynb launcher. It was 322 previously.

In [None]:
# Force-kill process 322; replace 322 with the process ID found in the
# listing from the previous cell.
!kill -9 322