Using the mdeberta embeddings in an automl classifier.

In [1]:
!pip install autogluon



In [2]:
from transformers import TFAutoModel, AutoTokenizer
import tensorflow as tf
from autogluon.tabular import TabularPredictor
from tqdm.auto import tqdm

In [3]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
data = pd.read_csv('/content/drive/MyDrive/multitude_split/dataset_all.csv')

Mounted at /content/drive


In [None]:
tokenizer = AutoTokenizer.from_pretrained('microsoft/mdeberta-v3-base')
model = TFAutoModel.from_pretrained('/content/drive/MyDrive/multitude_split/ne1',  output_hidden_states=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Some layers from the model checkpoint at /content/drive/MyDrive/multitude_split/ne1 were not used when initializing TFDebertaV2Model: ['pooler', 'cls_dropout', 'classifier']
- This IS expected if you are initializing TFDebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDebertaV2Model were initialized from the model checkpoint at /content/drive/MyDrive/multitude_split/ne1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDebertaV2Model for predictions without further training.


In [None]:
def extract_embeddings(df, text_column='text', batch_size=32):
    # Tokenize the text and prepare DataLoader
    tokens = tokenizer(df[text_column].tolist(), max_length=300, truncation=True, padding='max_length', return_tensors="tf")

    # Initialize dataset from tensors
    dataset = tf.data.Dataset.from_tensor_slices((tokens['input_ids'], tokens['attention_mask']))
    dataset = dataset.batch(batch_size)

    # Extract embeddings with progress bar
    embeddings = []
    for batch in tqdm(dataset, desc="Extracting embeddings"):
        input_ids, attention_mask = batch
        outputs = model(input_ids, attention_mask=attention_mask)
        hidden_states = outputs.hidden_states
        batch_embeddings = tf.reduce_mean(hidden_states[-1], axis=1)  # Mean pooling
        embeddings.append(batch_embeddings)

    # Concatenate all batch embeddings
    embeddings = tf.concat(embeddings, axis=0)
    return embeddings.numpy()

In [4]:
train = data[data.split == "train"]
test = data[data.split == "test"]

In [None]:
train_embeddings = extract_embeddings(train, 'text')
test_embeddings = extract_embeddings(test, 'text')

Extracting embeddings:   0%|          | 0/1400 [00:00<?, ?it/s]

Extracting embeddings:   0%|          | 0/916 [00:00<?, ?it/s]

In [None]:
print(train_embeddings.shape)
print(test_embeddings.shape)
print(train_embeddings.dtype)
print(test_embeddings.dtype)

In [None]:
# save the embeddings
import pickle
with open('/content/drive/MyDrive/multitude_split/train_embeddingsmdeberta.pkl', 'wb') as f:
    pickle.dump(train_embeddings, f)
with open('/content/drive/MyDrive/multitude_split/test_embeddingsmdeberta.pkl', 'wb') as f:
    pickle.dump(test_embeddings, f)

In [5]:
# load the embeddings from saved file
import pickle
with open('/content/drive/MyDrive/multitude_split/train_embeddingsmdeberta.pkl', 'rb') as f:
    train_embeddings = pickle.load(f)

with open('/content/drive/MyDrive/multitude_split/test_embeddingsmdeberta.pkl', 'rb') as f:
    test_embeddings = pickle.load(f)

In [6]:
print(train_embeddings.shape)
print(test_embeddings.shape)
print(train_embeddings.dtype)
print(test_embeddings.dtype)

(44786, 768)
(29295, 768)
float32
float32


In [7]:
# Convert embeddings into DataFrame
train_embeddings_df = pd.DataFrame(train_embeddings).reset_index(drop=True)
test_embeddings_df = pd.DataFrame(test_embeddings).reset_index(drop=True)

In [8]:
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

In [9]:
# Add labels to the embeddings DataFrame
train_embeddings_df['label'] = train['label']
test_embeddings_df['label'] = test['label']

In [10]:
# count the number of values on each label
label_counts = train_embeddings_df['label'].value_counts()
print(label_counts)

label
1    40030
0     4756
Name: count, dtype: int64


In [11]:
# Save path for the AutoGluon models
save_path = 'autogluon_models'

# AutoGluon Tabular Predictor
label_column = 'label'
predictor = TabularPredictor(label='label', path=save_path, eval_metric='f1_macro',
                             problem_type='binary',  # Adjust according to your specific problem, e.g., 'binary', 'multiclass'
                             verbosity=2)


In [12]:
train_embeddings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44786 entries, 0 to 44785
Columns: 769 entries, 0 to label
dtypes: float32(768), int64(1)
memory usage: 131.6 MB


In [None]:
predictor.fit(train_data=train_embeddings_df, presets='best_quality', ag_args_fit={'num_gpus': 1})

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Sun Apr 28 14:29:16 UTC 2024
CPU Count:          8
Memory Avail:       47.95 GB / 50.99 GB (94.0%)
Disk Space Avail:   169.44 GB / 201.23 GB (84.2%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked overfitting.
	Running DyStack for up to 900s of 

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x79295c119fc0>

In [None]:
# Evaluate and print the leaderboard
performance = predictor.evaluate(test_embeddings_df, auxiliary_metrics=True)
print("Model performance on test data:", performance)

Model performance on test data: {'f1_macro': 0.8898478073696339, 'accuracy': 0.9575695511179382, 'balanced_accuracy': 0.8820997372084863, 'mcc': 0.7799747464262567, 'roc_auc': 0.9745834379064983, 'f1': 0.9762173538697024, 'precision': 0.973479355872701, 'recall': 0.9789707970374918}


In [None]:
leaderboard = predictor.leaderboard(
    test_embeddings_df,
    silent=True,
    extra_metrics=['accuracy', 'roc_auc', 'f1_macro', 'f1_weighted']
)
print(leaderboard)

                      model  score_test  accuracy   roc_auc  f1_macro  \
0         LightGBMXT_BAG_L2    0.890542  0.958559  0.977833  0.890542   
1   RandomForestGini_BAG_L2    0.890231  0.958389  0.965193  0.890231   
2       WeightedEnsemble_L2    0.889848  0.957570  0.974583  0.889848   
3       WeightedEnsemble_L3    0.889848  0.957570  0.974583  0.889848   
4     ExtraTreesGini_BAG_L2    0.889456  0.958047  0.969004  0.889456   
5    NeuralNetFastAI_BAG_L1    0.889385  0.957296  0.971171  0.889385   
6   RandomForestEntr_BAG_L2    0.889296  0.957945  0.968951  0.889296   
7           LightGBM_BAG_L1    0.889237  0.958047  0.975860  0.889237   
8         LightGBMXT_BAG_L1    0.888820  0.957740  0.978194  0.888820   
9           CatBoost_BAG_L2    0.888661  0.957638  0.974787  0.888661   
10          LightGBM_BAG_L2    0.887176  0.956819  0.970272  0.887176   
11          CatBoost_BAG_L1    0.885292  0.957023  0.977543  0.885292   
12  RandomForestEntr_BAG_L1    0.883874  0.956204  

In [None]:
best_model = predictor.get_model_best()


  best_model = predictor.get_model_best()


In [None]:
print("Best model according to primary metric (default is accuracy):", best_model)


Best model according to primary metric (default is accuracy): WeightedEnsemble_L2


In [None]:
model_name = leaderboard.iloc[0]['model']
print("Selected model name:", model_name)

Selected model name: LightGBMXT_BAG_L2


In [None]:
predictions = predictor.predict(test_embeddings_df.drop(columns=['label']), model='WeightedEnsemble_L2')
predictions_prob = predictor.predict_proba(test_embeddings_df.drop(columns=['label']), model='WeightedEnsemble_L2')

In [None]:

true_labels = true_labels.tolist()

NameError: name 'true_labels' is not defined

In [None]:
pred_prob = []
for i in range(len(predictions_prob)):
  pred_prob.append(predictions_prob[true_labels[i]][i])

In [None]:
print(pred_prob)


In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
true_labels = test_embeddings_df['label']
cr = classification_report(true_labels, predictions, digits=4, zero_division=0)
cm = confusion_matrix(true_labels, predictions)
TN = cm[0][0]
FN = cm[1][0]
TP = cm[1][1]
FP = cm[0][1]
# Fall out or false positive rate
FPR = FP/(FP+TN) if (FP+TN) > 0 else 0
# False negative rate
FNR = FN/(TP+FN) if (TP+FN) > 0 else 0

roc = roc_auc_score(true_labels, predictions)
roc_prob = roc_auc_score(true_labels, pred_prob)

print(cm)
print(cr)
print(f"FPR: {FPR}")
print(f"FNR: {FNR}")
print(f"ROC: {roc}")
print(f"ROC_prob: {roc_prob}")

In [None]:
from sklearn.metrics import classification_report

# Ground truth labels
true_labels = test_embeddings_df['label']

# Generate the classification report
report = classification_report(true_labels, predictions, digits=4, zero_division=0)  # Replace `your_target_names` with actual class names
print("Classification Report:\n", report)

In [None]:
# roc auc score
from sklearn.metrics import roc_auc_score
roc_auc = roc_auc_score(true_labels, predictions)
print("ROC AUC Score:", roc_auc)

ROC AUC Score: 0.8746356738922434


In [None]:
predictor.info()

{'path': 'autogluon_models',
 'label': 'label',
 'random_state': 0,
 'version': '1.1.1',
 'features': ['0',
  '1',
  '2',
  '3',
  '4',
  '5',
  '6',
  '7',
  '8',
  '9',
  '10',
  '11',
  '12',
  '13',
  '14',
  '15',
  '16',
  '17',
  '18',
  '19',
  '20',
  '21',
  '22',
  '23',
  '24',
  '25',
  '26',
  '27',
  '28',
  '29',
  '30',
  '31',
  '32',
  '33',
  '34',
  '35',
  '36',
  '37',
  '38',
  '39',
  '40',
  '41',
  '42',
  '43',
  '44',
  '45',
  '46',
  '47',
  '48',
  '49',
  '50',
  '51',
  '52',
  '53',
  '54',
  '55',
  '56',
  '57',
  '58',
  '59',
  '60',
  '61',
  '62',
  '63',
  '64',
  '65',
  '66',
  '67',
  '68',
  '69',
  '70',
  '71',
  '72',
  '73',
  '74',
  '75',
  '76',
  '77',
  '78',
  '79',
  '80',
  '81',
  '82',
  '83',
  '84',
  '85',
  '86',
  '87',
  '88',
  '89',
  '90',
  '91',
  '92',
  '93',
  '94',
  '95',
  '96',
  '97',
  '98',
  '99',
  '100',
  '101',
  '102',
  '103',
  '104',
  '105',
  '106',
  '107',
  '108',
  '109',
  '110',
  '111',
 

In [None]:
# Get all information available for the predictor
info = predictor.info()

# Retrieve detailed model information
model_details = info['model_info'][model_name]

# Print detailed model info and hyperparameters
print("Model Details:", model_details)
print("Hyperparameters of the model:", model_details['hyperparameters'])

Model Details: {'name': 'RandomForestGini_BAG_L2', 'model_type': 'StackerEnsembleModel', 'problem_type': 'binary', 'eval_metric': 'accuracy', 'stopping_metric': 'accuracy', 'fit_time': 463.8261682987213, 'num_classes': 2, 'quantile_levels': None, 'predict_time': 14.032022714614868, 'val_score': 0.9941276291698299, 'hyperparameters': {'use_orig_features': True, 'max_base_models': 25, 'max_base_models_per_type': 5, 'save_bag_folds': True, 'use_child_oof': True}, 'hyperparameters_fit': {}, 'hyperparameters_nondefault': ['use_child_oof'], 'ag_args_fit': {'max_memory_usage_ratio': 1.0, 'max_time_limit_ratio': 1.0, 'max_time_limit': None, 'min_time_limit': 0, 'valid_raw_types': None, 'valid_special_types': None, 'ignored_type_group_special': None, 'ignored_type_group_raw': None, 'get_features_kwargs': None, 'get_features_kwargs_extra': None, 'predict_1_batch_size': None, 'temperature_scalar': None, 'drop_unique': False}, 'num_features': 774, 'features': ['387', '548', '262', '342', '348', '5