# Predicting Success: Machine Learning Models for CBT Outcomes

# Introduction

This study uses machine learning techniques to predict the likelihood of success in Cognitive Behavioral Therapy (CBT) from patient data. The dataset encompasses various demographic, treatment-related, and psychological assessment variables, offering insights into patient profiles and treatment outcomes. By applying a range of analytical techniques, the resulting predictive models estimate the probability that CBT will succeed for an individual patient.

Contents

1.   Walkthrough
2.   Traditional Machine Learning
3.   Large Language Model fine-tuning




In [None]:
from google.colab import drive
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import missingno as msno

# Mount Google Drive inside the Colab runtime at /content/drive.
drive.mount('/content/drive')
# Put the notebook folder on sys.path before importing the four helper modules
# below (presumably they are stored in that Drive folder - confirm).
notebook_path = '/content/drive/MyDrive/Colab Notebooks/'
sys.path.append(notebook_path)
import cleaning_functions as cleanf
import eda_functions as edaf
import preparation_functions as prepf
import modelling_functions as modelf

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Large Language Models

## Preparation

In [None]:
# Build the processed dataset used by the language-model experiments below.
# load_data() returns two objects; only `data` is cleaned and prepared here,
# `raw_df` is kept as returned.
data, raw_df = cleanf.load_data()
data = cleanf.Clean_Data(data)

# NOTE(review): Prepare_Data is defined in preparation_functions, so the exact
# meaning of each threshold/method below cannot be verified from this
# notebook - confirm against that module before changing values.
prep1_df = prepf.Prepare_Data(
    df=data,
    quasi_thresh=0.999,
    corr_thresh=0.999,
    load_matrices=True,
    col_thresh=0.9,
    row_thresh=0.9,
    scale_method='normalise',
    imputation_method='iterative',
    # n_neighbours is left disabled for this run
    #n_neighbours=3,
    max_iter=10,
    k_features=200)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


KeyboardInterrupt: 

**Convert targets to integer, fix imputed values**

In [None]:
# Store the prepared frame under the name 'prep1_df', then read it back into
# `temp`.
# NOTE(review): manage_data lives in cleaning_functions; the 'save'/'load'
# modes are assumed to write and read the frame by name - confirm.
cleanf.manage_data(prep1_df, 'prep1_df', 'save')
temp = cleanf.manage_data(None, 'prep1_df', 'load')

Saved!


## Simple Transformers Hyperparameter-Tuning

In [None]:
from simpletransformers.classification import ClassificationModel
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score
import pandas as pd

def BERT_ModelSelection(df, target, train_args, k=5):
    """Estimate DistilBERT accuracy with nested, stratified cross-validation.

    Every row of ``df`` is serialised to a JSON string and classified with a
    Simple Transformers ``ClassificationModel``. For each of the ``k`` outer
    folds the hyperparameter grid is searched on the training part only
    (inner ``k``-fold CV), the best combination is refitted on the whole
    training part, and its accuracy on the held-out part is recorded.

    ``ClassificationModel`` is not a scikit-learn estimator, so it cannot be
    passed to ``GridSearchCV``; the search is therefore done by hand.

    Args:
        df: Prepared data frame containing the explanatory columns, the
            outcome columns and ``target``.
        target: Name of the column used as the label.
        train_args: Simple Transformers training arguments. A value that is a
            list or tuple is treated as the candidate values for that
            argument; any other value is used as a fixed setting.
        k: Number of folds for both the outer and the inner split.

    Returns:
        List with one test accuracy per outer fold.

    Note:
        With more than one grid combination this trains
        ``k * (combinations * k + 1)`` models, which is slow on CPU.
    """
    from itertools import product

    def serialize_data(df, target):
        # Outcome columns and every 'EndDesc' column are excluded from the text.
        EndDesc_cols = [col for col in df.columns if 'EndDesc' in col]
        explanatory_df = df.drop(['ReliableChangeDesc', 'ReliableRecovery', 'Recovery'] + EndDesc_cols, axis=1)
        text = explanatory_df.apply(lambda row: row.to_json(), axis=1)
        text_df = pd.DataFrame({
            'text': text,
            'label': df[target]})
        text_df = text_df.dropna()
        return text_df

    def expand_grid(args):
        # Turn {'a': [1, 2], 'b': 3} into [{'a': 1, 'b': 3}, {'a': 2, 'b': 3}].
        names = list(args)
        choices = [
            list(value) if isinstance(value, (list, tuple)) else [value]
            for value in args.values()]
        return [dict(zip(names, combo)) for combo in product(*choices)]

    def fit_and_score(params, train_frame, eval_frame):
        # A fresh model per fit so no weights leak between folds or candidates.
        # The model type has to match the checkpoint family; "bert" with a
        # DistilBERT checkpoint discards the pretrained weights.
        model = ClassificationModel(
            "distilbert", "distilbert-base-cased",
            num_labels=2,
            use_cuda=False,
            # Repeated fits reuse the output directory; caller settings win.
            args={"overwrite_output_dir": True, **params})
        model.train_model(train_frame.reset_index(drop=True))
        result, _, _ = model.eval_model(
            eval_frame.reset_index(drop=True), acc=accuracy_score)
        return result['acc']

    def to_frame(positions):
        return pd.DataFrame({
            'text': [X[i] for i in positions],
            'label': [y[i] for i in positions]})

    # NOTE(review): labels are passed through unchanged; Simple Transformers
    # presumably expects integer class ids - confirm the dtype of `target`.
    text_df = serialize_data(df, target)
    X = text_df['text'].tolist()
    y = text_df['label'].tolist()
    candidates = expand_grid(train_args)
    scores = []

    kf = StratifiedKFold(n_splits=k, shuffle=True)
    for train_index, test_index in kf.split(X, y):
        train_frame = to_frame(train_index)
        test_frame = to_frame(test_index)

        # Inner search: only needed when there is something to choose between.
        best_params = candidates[0]
        if len(candidates) > 1:
            inner = StratifiedKFold(n_splits=k, shuffle=True)
            # Materialise once so every candidate is scored on the same splits.
            inner_splits = list(inner.split(train_frame['text'], train_frame['label']))
            best_mean = float('-inf')
            for params in candidates:
                fold_accs = []
                for fit_idx, val_idx in inner_splits:
                    fold_accs.append(fit_and_score(
                        params,
                        train_frame.iloc[fit_idx],
                        train_frame.iloc[val_idx]))
                mean_acc = sum(fold_accs) / len(fold_accs)
                if mean_acc > best_mean:
                    best_mean, best_params = mean_acc, params

        # Refit the winning configuration on the full training fold and score
        # it on data that played no part in the selection.
        acc = fit_and_score(best_params, train_frame, test_frame)

        scores.append(acc)
        print(acc)
        print('--------------------------------------------------')

    return scores


In [None]:
# Candidate values for each training hyperparameter
# (3 x 3 x 3 x 2 = 54 combinations in total).
train_args = {
    "num_train_epochs": [3, 4, 5],
    "learning_rate": [1e-5, 2e-5, 3e-5],
    "train_batch_size": [8, 16, 32],
    "max_seq_length": [128, 256],}

# Run model selection on the prepared frame with 'Recovery' as the label.
BERT_ModelSelection(prep1_df, 'Recovery', train_args, k=5)

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encod

TypeError: estimator should be an estimator implementing 'fit' method, <simpletransformers.classification.classification_model.ClassificationModel object at 0x7dfedad82740> was passed

## Simple Transformers

In [None]:
!pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.70.1-py3-none-any.whl (316 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/316.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/316.3 kB[0m [31m4.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.3/316.3 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from simpletransformers)
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorboardx (from simpl

In [None]:
# dataset
def serialise_data(df, target):
    """Serialise each row of ``df`` to JSON text and pair it with its label.

    The three outcome columns and every column whose name contains
    'EndDesc' are left out of the text. Rows with a missing text or label
    are dropped.

    Args:
        df: Data frame holding the explanatory columns, the outcome columns
            and ``target``.
        target: Name of the column to use as the label.

    Returns:
        Data frame with the columns 'text' (one JSON string per row) and
        'label'.
    """
    # Columns that must not be written into the serialised text.
    excluded = ['ReliableChangeDesc', 'ReliableRecovery', 'Recovery']
    excluded = excluded + list(filter(lambda name: 'EndDesc' in name, df.columns))
    features = df.drop(columns=excluded)

    def row_as_json(row):
        return row.to_json()

    serialised = pd.DataFrame({
        'text': features.apply(row_as_json, axis=1),
        'label': df[target]})
    return serialised.dropna()

text_df = serialise_data(prep1_df, 'Recovery')

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the serialised rows for evaluation.
train_df, test_df = train_test_split(text_df, test_size=0.10)

from simpletransformers.classification import ClassificationModel
# define hyperparameters
train_args = {"reprocess_input_data": True,
              'learning_rate': 1e-5,
              "fp16": False,
              "num_train_epochs": 4,
              "overwrite_output_dir": True}  # avoid the existing-output-directory error

# ClassificationModel
# The model type must match the checkpoint family. Pairing "bert" with the
# DistilBERT checkpoint makes the loader re-initialise every weight, so
# training would start from a random model rather than a pretrained one.
model = ClassificationModel(
    "distilbert", "distilbert-base-cased",
    num_labels=2,
    use_cuda=False,
    args=train_args)

model.train_model(train_df)

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encod

0it [00:00, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/44 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/44 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/44 [00:00<?, ?it/s]

Running Epoch 4 of 4:   0%|          | 0/44 [00:00<?, ?it/s]

(176, 0.7087173216383565)

In [None]:
from sklearn.metrics import f1_score, accuracy_score
# Evaluate on the held-out split. The extra keyword adds accuracy to the
# returned metrics dict under the key 'acc'.
result, model_outputs, wrong_predictions = model.eval_model(test_df, acc=accuracy_score)

  self.pid = os.fork()


0it [00:00, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
result

{'mcc': 0.0,
 'accuracy': 0.475,
 'f1_score': 0.3220338983050847,
 'tp': 0,
 'tn': 19,
 'fp': 0,
 'fn': 21,
 'auroc': 0.47994987468671685,
 'auprc': 0.515092981496282,
 'acc': 0.475,
 'eval_loss': 0.7218290567398071}

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the serialised rows for evaluation.
train_df, test_df = train_test_split(text_df, test_size=0.10)

from simpletransformers.classification import ClassificationModel
# define hyperparameters (higher learning rate, batch size 4)
train_args = {"reprocess_input_data": True,
              'learning_rate': 5e-5,
              'train_batch_size': 4,
              "fp16": False,
              "num_train_epochs": 4,
              "overwrite_output_dir": True}  # avoid the existing-output-directory error

# ClassificationModel
# The model type must match the checkpoint family. Pairing "bert" with the
# DistilBERT checkpoint makes the loader re-initialise every weight, so
# training would start from a random model rather than a pretrained one.
model = ClassificationModel(
    "distilbert", "distilbert-base-cased",
    num_labels=2,
    use_cuda=False,
    args=train_args)

model.train_model(train_df)

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encod

0it [00:00, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/88 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/88 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/88 [00:00<?, ?it/s]

Running Epoch 4 of 4:   0%|          | 0/88 [00:00<?, ?it/s]

(352, 0.7284841140393506)

In [None]:
from sklearn.metrics import f1_score, accuracy_score
# Evaluate on the held-out split. The extra keyword adds accuracy to the
# returned metrics dict under the key 'acc'.
result, model_outputs, wrong_predictions = model.eval_model(test_df, acc=accuracy_score)
# Display the metrics dict as the cell output.
result

  self.pid = os.fork()


0it [00:00, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

{'mcc': 0.0,
 'accuracy': 0.5,
 'f1_score': 0.3333333333333333,
 'tp': 0,
 'tn': 20,
 'fp': 0,
 'fn': 20,
 'auroc': 0.47250000000000003,
 'auprc': 0.5144719478948154,
 'acc': 0.5,
 'eval_loss': 0.7050856947898865}

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the serialised rows for evaluation.
train_df, test_df = train_test_split(text_df, test_size=0.10)

from simpletransformers.classification import ClassificationModel
# define hyperparameters (lower learning rate, batch size 8, 5 epochs)
train_args = {"reprocess_input_data": True,
              'learning_rate': 5e-6,
              'train_batch_size': 8,
              "fp16": False,
              "num_train_epochs": 5,
              "overwrite_output_dir": True}  # avoid the existing-output-directory error

# ClassificationModel
# The model type must match the checkpoint family. Pairing "bert" with the
# DistilBERT checkpoint makes the loader re-initialise every weight, so
# training would start from a random model rather than a pretrained one.
model = ClassificationModel(
    "distilbert", "distilbert-base-cased",
    num_labels=2,
    use_cuda=False,
    args=train_args)

model.train_model(train_df)

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encod

0it [00:00, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/44 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/44 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/44 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/44 [00:00<?, ?it/s]

Running Epoch 5 of 5:   0%|          | 0/44 [00:00<?, ?it/s]

(220, 0.6997420394962485)

In [None]:
from sklearn.metrics import f1_score, accuracy_score
# Evaluate on the held-out split. The extra keyword adds accuracy to the
# returned metrics dict under the key 'acc'.
result, model_outputs, wrong_predictions = model.eval_model(test_df, acc=accuracy_score)
# Display the metrics dict as the cell output.
result

  self.pid = os.fork()


0it [00:00, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

{'mcc': 0.0,
 'accuracy': 0.475,
 'f1_score': 0.3220338983050847,
 'tp': 0,
 'tn': 19,
 'fp': 0,
 'fn': 21,
 'auroc': 0.7719298245614036,
 'auprc': 0.8139943572887346,
 'acc': 0.475,
 'eval_loss': 0.7058476209640503}

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the serialised rows for evaluation.
train_df, test_df = train_test_split(text_df, test_size=0.10)

from simpletransformers.classification import ClassificationModel
# define hyperparameters
# NOTE(review): max_seq_length=32 presumably truncates most of each
# serialised JSON row - confirm this is intended.
train_args = {"reprocess_input_data": True,
              'learning_rate': 5e-6,
              'train_batch_size': 8,
              'max_seq_length': 32,
              "fp16": False,
              "num_train_epochs": 5,
              "overwrite_output_dir": True}  # avoid the existing-output-directory error

# ClassificationModel
# The model type must match the checkpoint family. Pairing "bert" with the
# DistilBERT checkpoint makes the loader re-initialise every weight, so
# training would start from a random model rather than a pretrained one.
model = ClassificationModel(
    "distilbert", "distilbert-base-cased",
    num_labels=2,
    use_cuda=False,
    args=train_args)

model.train_model(train_df)

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encod

0it [00:00, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/44 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/44 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/44 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/44 [00:00<?, ?it/s]

Running Epoch 5 of 5:   0%|          | 0/44 [00:00<?, ?it/s]

(220, 0.70657849826596)

In [None]:
from sklearn.metrics import f1_score, accuracy_score
# Evaluate on the held-out split. The extra keyword adds accuracy to the
# returned metrics dict under the key 'acc'.
result, model_outputs, wrong_predictions = model.eval_model(test_df, acc=accuracy_score)
# Display the metrics dict as the cell output.
result

  self.pid = os.fork()


0it [00:00, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

{'mcc': 0.0,
 'accuracy': 0.55,
 'f1_score': 0.3548387096774194,
 'tp': 0,
 'tn': 22,
 'fp': 0,
 'fn': 18,
 'auroc': 0.5808080808080809,
 'auprc': 0.518851094026699,
 'acc': 0.55,
 'eval_loss': 0.6867173910140991}

In [None]:
temp.to_string()

'     Recovery  ReliableRecovery  ReliableChangeDesc  EndDesc_Mutually agreed completion of treatment  EndDesc_Not suitable for IAPT service - no action taken or directed back to referrer  EndDesc_Referred to another therapy service by mutual agreement  EndDesc_Termition of treatment earlier than Care Professiol planned   Item124     Item82    Item156    Item211    Item81     Gender    Item38    Item89       Item151   Item108    Item57  EthnicCodeShort    Item12       Item150    Item125    Item90    Item136   Total16    Item163    Item31    Item202   Item115    Item91     Item61    Item42    Item213  AgeAtReferralRequest    Item85   Item133    Item144    Item56   Item190    Item75   Item109    Item98    Item128  Threshold5   Total15    Item205     Item36   Item186    Item10    Item86       Item155    Item22    Item16   Item103     Item1  Threshold1     Item9  CareContacts   Item102      Item2    Item41   Item195    Item11    CaseID    Item19    Item83   Item107     Item203    Item13   

## Cross-Validation BERT

In [None]:
!pip install datasets;
!pip install evaluate;
!pip install -U accelerate;
!pip install -U transformers;

import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import GPT2Tokenizer
from transformers import GPT2ForSequenceClassification
import evaluate
from transformers import TrainingArguments, Trainer

In [None]:
def BERT_Train(df, target='Recovery', k=5):
    """Fine-tune and evaluate BERT with stratified k-fold cross-validation.

    Each row of ``df`` is serialised to a JSON string, tokenised, and used to
    fine-tune ``bert-base-uncased`` as a two-class classifier. A fresh model
    is trained for every fold and evaluated on that fold's held-out rows.

    Args:
        df: Prepared data frame containing the explanatory columns, the
            outcome columns and ``target``.
        target: Name of the column used as the label.
        k: Number of cross-validation folds.

    Returns:
        List with one metrics dict (as returned by ``Trainer.evaluate``) per
        fold.
    """
    # Imported here because the notebook only imports these names in other
    # cells, which may not have been run before this one.
    from sklearn.model_selection import StratifiedKFold
    from transformers import BertForSequenceClassification, BertTokenizer

    # dataset
    EndDesc_cols = [col for col in df.columns if 'EndDesc' in col]
    explanatory_df = df.drop(['ReliableChangeDesc', 'ReliableRecovery', 'Recovery'] + EndDesc_cols, axis=1)
    text = explanatory_df.apply(lambda row: row.to_json(), axis=1)
    text_df = pd.DataFrame({
        'text': text,
        'label': df[target]})
    text_df = text_df.dropna()
    # NOTE(review): labels are passed through unchanged; the classification
    # head presumably needs integer class ids - confirm the dtype of `target`.
    dataset = Dataset.from_pandas(text_df)

    # tokenizer
    # BERT already defines its own padding token, so it is left untouched.
    # Its eos_token is unset, and assigning that to pad_token would remove
    # the padding token that padding='max_length' relies on.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def tokenize_function(examples):
        return tokenizer(examples['text'], padding='max_length', truncation=True)
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # evaluation
    metric = evaluate.load("accuracy")

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predictions, references=labels)

    # hyperparameters
    training_args = TrainingArguments(
        output_dir="test_trainer",
        #evaluation_strategy="epoch",
        per_device_train_batch_size=1,  # Reduce batch size here
        per_device_eval_batch_size=1,   # Optionally, reduce for evaluation as well
        gradient_accumulation_steps=4)

    # cross validation
    cv = StratifiedKFold(n_splits=k, shuffle=True)
    splits = cv.split(text_df['text'], text_df['label'])
    fold_metrics = []
    for fold, (train_index, test_index) in enumerate(splits):

        # train and test sets (positional indices into the tokenised dataset)
        train_split = tokenized_datasets.select(train_index.tolist())
        test_split = tokenized_datasets.select(test_index.tolist())

        # model: reloaded for every fold so that weights fine-tuned on one
        # fold never see another fold's test rows
        model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

        # trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_split,
            eval_dataset=test_split,
            compute_metrics=compute_metrics,)

        trainer.train()

        metrics = trainer.evaluate()
        fold_metrics.append(metrics)
        print(f"Fold {fold + 1}/{k}: {metrics}")
        print("--------------------------------------------------")

    return fold_metrics


In [None]:
BERT_Train(prep1_df, target='Recovery', k=5)

## Individual GPT2 and BERT

In [None]:
!pip install datasets;
!pip install evaluate;
!pip install -U accelerate;
!pip install -U transformers;

In [None]:
# Build the text/label frame: drop the three outcome columns and every column
# whose name contains 'EndDesc', then serialise each remaining row to JSON.
EndDesc_cols = [col for col in prep1_df.columns if 'EndDesc' in col]
explanatory_df = prep1_df.drop(['ReliableChangeDesc', 'ReliableRecovery', 'Recovery'] + EndDesc_cols, axis = 1)

text = explanatory_df.apply(lambda row: row.to_json(), axis=1)
text_df = pd.DataFrame({
    'text': text,
    'label': prep1_df['Recovery']})

# Discard rows with a missing text or label.
text_df = text_df.dropna()

#!pip install datasets;

from datasets import Dataset, DatasetDict
import pandas as pd

dataset = Dataset.from_pandas(text_df)

# Positional 80/20 split: the first 80% of rows become the training set and
# the remainder the test set. No shuffling or stratification is applied here.
train_size = int((text_df.shape[0]) * 0.8)
train_dataset = dataset.select(range(train_size))
test_dataset = dataset.select(range(train_size, len(dataset)))

# `dataset` is rebound from the flat Dataset to a train/test DatasetDict.
dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset})

In [None]:
### Tokenizer ###

from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Reuse the end-of-sequence token for padding so padding='max_length' works.
tokenizer.pad_token = tokenizer.eos_token
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# small train and evaluation sets

# NOTE(review): the original read `range(79Z)`, which is a syntax error; 79 is
# assumed to be the intended size (it matches the evaluation subset) - confirm.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(79))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(79))

### Initialise base model ###

from transformers import GPT2ForSequenceClassification
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2)
# The classifier reads the last non-padding token of each sequence, which it
# can only locate when the model config knows the padding token id. Without
# this it would classify from a padding position.
model.config.pad_token_id = tokenizer.pad_token_id

### Evaluate method ###

#!pip install evaluate;
import evaluate
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    # Convert logits to class ids before scoring accuracy.
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

### Fine-tune (Trainer method) ###

#!pip install -U accelerate;
#!pip install -U transformers;
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(
    output_dir="test_trainer",
    #evaluation_strategy="epoch",
    per_device_train_batch_size=1,  # Reduce batch size here
    per_device_eval_batch_size=1,   # Optionally, reduce for evaluation as well
    gradient_accumulation_steps=4)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,)

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

In [None]:
# trainer.save_model("test_trainer")
# tokenizer.save_pretrained("test_trainer")

In [None]:
# tokenize data
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def tokenize_function(examples):
    # Pad or truncate every text to a fixed length.
    return tokenizer(examples['text'], padding='max_length', truncation=True)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# small train and evaluation sets
# (seeded shuffle, then the first 50 examples of each split)
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(50))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(50))

# Two-class classification head on top of bert-base-uncased.
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# !pip install evaluate
import evaluate
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    # Convert logits to class ids before scoring accuracy.
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# !pip install -U accelerate transformers
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(
    output_dir="test_trainer",
    per_device_train_batch_size=1,  # Reduce batch size here
    per_device_eval_batch_size=1,   # Optionally, reduce for evaluation as well
    gradient_accumulation_steps=4)

# This rebinds `trainer`, replacing any trainer created by an earlier cell.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics)

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

In [None]:
pip install optuna

In [None]:
# Hyperparameter search for the BERT classifier using the Trainer's Optuna
# backend.
# NOTE(review): `load_dataset` is imported but not used in this cell.
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
import numpy as np
import optuna
import evaluate

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def tokenize_function(examples):
    # Pad or truncate every text to a fixed length.
    return tokenizer(examples['text'], padding='max_length', truncation=True)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Create small training and evaluation datasets
# (seeded shuffle, then the first 50 examples of each split)
small_train_dataset = tokenized_datasets["train"].shuffle(seed=2001).select(range(50))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=2001).select(range(50))

# Load model
# NOTE(review): this instance is not passed to the Trainer below, which is
# given `model_init` instead - confirm whether this load is still needed.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Metric for evaluation
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    # Convert logits to class ids before scoring accuracy.
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Function to initialize the model
def model_init():
    return BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Training arguments template
# Evaluation, checkpoint saving and logging are all set to once per epoch.
training_args = TrainingArguments(
    output_dir="test_trainer",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch"
)

# Trainer
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics
)

# Define hyperparameter search space
# Learning rate is sampled on a log scale between 5e-5 and 5e-4; the train
# batch size is chosen from 1, 2 or 4.
def hyperparameter_search_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 5e-5, 5e-4, log=True),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [1, 2, 4])
    }

# Conduct hyperparameter search
# 10 trials, maximising the 'eval_accuracy' entry of the evaluation metrics.
best_trial = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    n_trials=10,
    compute_objective=lambda metrics: metrics["eval_accuracy"],
    hp_space=hyperparameter_search_space
)

print("Best Hyperparameters:\n", best_trial.hyperparameters)


## ChatGPT Hyperparameter-Tuning

**Consider whether the order of features matters - the input is sequential**

In [None]:
# NOTE(review): this cell looks out of sync with the rest of the notebook.
# - BERT_ModelSelection as defined in this notebook takes
#   (df, target, train_args, k=5); the call below passes four positional
#   arguments plus k.
# - `serialised_df` is not defined in any cell shown here.
# - The grid key 'classifier__max_length' does not match the argument names
#   used in the other cells (e.g. 'max_seq_length').
# Confirm which function and inputs were intended before running.
bert_param_grid = {
    'classifier__max_length': [128, 256, 512]
    # Add more hyperparameters if needed
}

bert1_model = BERT_ModelSelection(serialised_df, 'serialised_data', 'Recovery', bert_param_grid, k=5)

In [None]:
ModelSelection_Summary(bert1_model)