Tensorflow implementation of the gated architecture.

In [1]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
data = pd.read_csv('/content/drive/MyDrive/multitude_split/dataset_all.csv')
data.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,text,label,multi_label,split,language,length,source,word_count,unique_word_count,char_count,...,question_mark_count,exclamation_mark_count,flesch_reading_ease,gunning_fog_index,first_person_pronoun_count,person_entity_count,date_entity_count,uniqueness_bigram,uniqueness_trigram,syntax_variety
0,Der Ausbruch des Coronavirus hat die Entwicklu...,1,text-davinci-003,test,de,174,MULTITuDE_MassiveSumm_spiegel,199.0,118.0,1067.0,...,0.0,0.0,-272.02217,11.15603,0.0,0.0,0.0,0.90404,0.979695,12.0
1,Alex Azar was officially sworn in as the U.S. ...,1,text-davinci-003,train,en,57,MULTITuDE_MassiveSumm_voanews,70.0,54.0,311.0,...,0.0,1.0,-186.793214,8.714286,0.0,5.0,2.0,1.0,1.0,11.0
2,Європейський союз вимагає зупинити розтрату ко...,1,gpt-3.5-turbo,test,uk,105,MULTITuDE_MassiveSumm_interfax,130.0,82.0,691.0,...,0.0,0.0,-269.236538,11.015385,0.0,0.0,0.0,0.860465,0.929688,14.0
3,"Yesterday, hundreds of Zambian university stud...",1,text-davinci-003,train,en,254,MULTITuDE_MassiveSumm_voanews,292.0,149.0,1419.0,...,0.0,0.0,-231.229869,11.4401,1.0,1.0,1.0,0.876289,0.965517,13.0
4,"In a narrow and highly watched vote, the US Se...",1,gpt-4,train,en,416,MULTITuDE_MassiveSumm_voanews,476.0,242.0,2259.0,...,0.0,0.0,-224.855788,13.160504,1.0,2.0,2.0,0.871579,0.974684,15.0


In [2]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

statistical_features = ['word_count', 'unique_word_count', 'char_count', 'avg_word_length',
       'ttr', 'hapax_legomenon', 'sentence_count', 'avg_sentence_length',
       'avg_sentence_complexity', 'punctuation_count', 'noun_count',
       'stopword_count', 'verb_count', 'adj_count', 'adv_count',
       'complex_sentence_count', 'question_mark_count',
       'exclamation_mark_count', 'flesch_reading_ease', 'gunning_fog_index',
       'first_person_pronoun_count', 'person_entity_count',
       'date_entity_count', 'uniqueness_bigram', 'uniqueness_trigram',
       'syntax_variety']

data[statistical_features] = scaler.fit_transform(data[statistical_features])

In [3]:
data['numerical'] = data[statistical_features].apply(lambda row: row.tolist(), axis=1)
data = data.drop(columns=statistical_features)
data.head()

Unnamed: 0,text,label,multi_label,split,language,length,source,numerical
0,Der Ausbruch des Coronavirus hat die Entwicklu...,1,text-davinci-003,test,de,174,MULTITuDE_MassiveSumm_spiegel,"[0.3060278207109737, 0.3556231003039514, 0.361..."
1,Alex Azar was officially sworn in as the U.S. ...,1,text-davinci-003,train,en,57,MULTITuDE_MassiveSumm_voanews,"[0.10664605873261206, 0.16109422492401215, 0.1..."
2,Європейський союз вимагає зупинити розтрату ко...,1,gpt-3.5-turbo,test,uk,105,MULTITuDE_MassiveSumm_interfax,"[0.19938176197836166, 0.24620060790273557, 0.2..."
3,"Yesterday, hundreds of Zambian university stud...",1,text-davinci-003,train,en,254,MULTITuDE_MassiveSumm_voanews,"[0.4497681607418856, 0.44984802431610943, 0.48..."
4,"In a narrow and highly watched vote, the US Se...",1,gpt-4,train,en,416,MULTITuDE_MassiveSumm_voanews,"[0.7341576506955177, 0.7325227963525837, 0.767..."


In [4]:
data_test = data[data["split"] == "test"]
data_val = data[data["split"] == "test"]
# select 100 samples from each language following same label distribution
data_val = data_val.groupby("language").apply(lambda x: x.sample(100)).reset_index(drop=True)
data_train = data[data["split"] == "train"]

In [5]:
# import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('microsoft/mdeberta-v3-base')
text_model = TFAutoModel.from_pretrained('/content/drive/MyDrive/multitude_split/ne1')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some layers from the model checkpoint at /content/drive/MyDrive/multitude_split/ne1 were not used when initializing TFDebertaV2Model: ['classifier', 'pooler', 'cls_dropout']
- This IS expected if you are initializing TFDebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassificatio

In [6]:
# Assuming text_model is the transformer part of your model
for layer in text_model.layers:
    layer.trainable = False

In [8]:
from tqdm.auto import tqdm
import numpy as np
import tensorflow as tf

# Function to tokenize and prepare data
def tokenize_and_prepare_data(text, numerical, label):
    # Tokenize text
    tokens = tokenizer(text, max_length=300, padding='max_length', truncation=True, return_tensors='np')
    # Convert numerical data and label into tensors
    numerical_data = tf.convert_to_tensor(numerical, dtype=tf.float32)
    label_data = tf.convert_to_tensor(label, dtype=tf.float32)
    return tokens['input_ids'], tokens['attention_mask'], numerical_data, label_data

# Function to process the DataFrame and add a progress bar
def create_dataset(df):
    inputs_ids, attention_masks, numericals, labels = [], [], [], []
    # Process each row in the DataFrame with progress bar
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Tokenizing and preparing data"):
        input_id, attention_mask, numerical, label = tokenize_and_prepare_data(row['text'], row['numerical'], row['label'])
        inputs_ids.append(input_id)
        attention_masks.append(attention_mask)
        numericals.append(numerical)
        labels.append(label)
    # Create TensorFlow dataset
    return tf.data.Dataset.from_tensor_slices(({
        'input_ids': np.vstack(inputs_ids),
        'attention_mask': np.vstack(attention_masks),
        'numerical': np.vstack(numericals)
    }, np.array(labels)))

# # Example of how to use it with a DataFrame
# train_dataset = create_dataset(data_train)
# val_dataset = create_dataset(data_val)
test_dataset = create_dataset(data_test)


Tokenizing and preparing data:   0%|          | 0/29295 [00:00<?, ?it/s]

In [9]:
import tensorflow as tf
# Define input layers
text_input = tf.keras.layers.Input(shape=(300,), dtype=tf.int32, name='input_ids')
attention_mask_input = tf.keras.layers.Input(shape=(300,), dtype=tf.int32, name='attention_mask')
numerical_input = tf.keras.layers.Input(shape=(26,), dtype=tf.float32, name='numerical')

# Processing text input through the transformer
x_text = text_model({'input_ids': text_input, 'attention_mask': attention_mask_input})[0]
x_text = tf.keras.layers.GlobalAveragePooling1D()(x_text)

# Numerical input processing
x_numerical = tf.keras.layers.Dense(128, activation='relu')(numerical_input)
x_numerical = tf.keras.layers.Dense(64, activation='relu')(x_numerical)

# Concatenate and classification layer
concatenated = tf.keras.layers.Concatenate()([x_text, x_numerical])

gating_weights = tf.keras.layers.Dense(concatenated.shape[-1], activation='sigmoid')(concatenated)
gated_features = tf.keras.layers.Multiply()([concatenated, gating_weights])
x = tf.keras.layers.Dense(64, activation='relu')(concatenated)
x = tf.keras.layers.Dense(32, activation='relu')(x)
x = tf.keras.layers.Dense(16, activation='relu')(x)
output = tf.keras.layers.Dense(1, activation='sigmoid')(x)

In [10]:
# Build and compile the model
model = tf.keras.Model(inputs=[text_input, attention_mask_input, numerical_input], outputs=output)
model.load_weights("/content/drive/MyDrive/multitude_split/ne4")
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7e8377fc3460>

In [11]:
# prompt: print classification report
print('classification report for gated mode')
from sklearn.metrics import classification_report

# Get predicted labels
pred_labels = model.predict(test_dataset.batch(128))

# Convert predicted labels to binary values
pred_labels = np.where(pred_labels > 0.5, 1, 0)

# Get true labels
true_labels = data_test['label'].values

# Print classification report
print(classification_report(true_labels, pred_labels))

classification report for gated mode
              precision    recall  f1-score   support

           0       0.87      0.53      0.66      3236
           1       0.94      0.99      0.97     26059

    accuracy                           0.94     29295
   macro avg       0.91      0.76      0.81     29295
weighted avg       0.94      0.94      0.93     29295



In [None]:
# Train the model
history = model.fit(
    train_dataset.batch(128).prefetch(tf.data.AUTOTUNE),
    validation_data=val_dataset.batch(128).prefetch(tf.data.AUTOTUNE),
    epochs=3  # You can adjust the number of epochs based on your observations of training and validation performance
)

Epoch 1/3


Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.


Epoch 2/3
Epoch 3/3


In [None]:
# prompt: save the model

model.save("/content/drive/MyDrive/multitude_split/ne4")

In [None]:
import matplotlib.pyplot as plt

# Plotting the training and validation loss
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

# Plotting the training and validation accuracy
plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.show()


In [None]:
# prompt: print classification report

from sklearn.metrics import classification_report

# Get predicted labels
pred_labels = model.predict(test_dataset.batch(128))

# Convert predicted labels to binary values
pred_labels = np.where(pred_labels > 0.5, 1, 0)

# Get true labels
true_labels = data_test['label'].values

# Print classification report
print(classification_report(true_labels, pred_labels))


              precision    recall  f1-score   support

           0       0.87      0.53      0.66      3236
           1       0.94      0.99      0.97     26059

    accuracy                           0.94     29295
   macro avg       0.91      0.76      0.81     29295
weighted avg       0.94      0.94      0.93     29295



In [None]:
from sklearn.metrics import roc_auc_score

roc_auc_score(true_labels, pred_labels)

0.7597666849558037

In [None]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(true_labels, pred_labels)

TN = cm[0, 0]
FP = cm[0, 1]
FPR = FP / (FP + TN)

In [None]:
FPR

0.47064276885043266