In [1]:
import numpy as np
import pandas as pd

import logging

# Hide logging messages.
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)
%env CATALYST_LOG_LEVEL = 15
#!pip install tensorflow-macos
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel


env: CATALYST_LOG_LEVEL=15


In [2]:
train_df = pd.read_csv('train.csv')
val_df = pd.read_csv('val.csv')
test_df = pd.read_csv('test.csv')
#set(val_df['author'].tolist()+train_df['author'].tolist())

In [40]:
bert_path = 'bert-base-uncased'


def encode(input_text, max_len):
    tokenizer = BertTokenizer.from_pretrained(bert_path)

    inputs = tokenizer.batch_encode_plus(
        input_text,
        padding='max_length',
        max_length=max_len,
        truncation=True)

    return inputs

In [41]:
#train_df.drop(['index', 'id'], axis=1, inplace=True)
max_len = max(len(x.split()) for x in train_df['text'])
#max_len

In [45]:


def create_model(max_len, num_outputs=47, embeddings=""):
    input_word_ids = tf.keras.Input(
        shape=(max_len,),
        dtype=tf.int32,
        name="input_word_ids",
    )

    bert_encoder = TFBertModel.from_pretrained(bert_path, output_hidden_states=True)
    bert_output = bert_encoder(input_word_ids)


    x = None

    if embeddings == "last_4":
        # Concat hidden states from the last 4 layers instead of just the last 1.
        bert_embeddings = tf.concat(bert_output[2][-4:], -1)
        x = tf.keras.layers.GlobalAveragePooling1D()(bert_embeddings)
    elif embeddings == "pooler":
        x = bert_output[1]
    else:
        # Hidden states from the last layer.
        bert_embeddings = bert_output[0]
        x = tf.keras.layers.GlobalAveragePooling1D()(bert_embeddings)

    x = tf.keras.layers.LayerNormalization()(x)
    output = tf.keras.layers.Dense(num_outputs, )(x)

    model = tf.keras.models.Model(inputs=input_word_ids, outputs=output)
    model.compile(optimizer=tf.keras.optimizers.Adam(1e-5),
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                  metrics=[
                      tf.keras.losses.BinaryCrossentropy(
                          from_logits=True, name='binary_crossentropy'),
                      'accuracy'])

    model.summary()

    return model


In [46]:
#train_input = encode(train_df['text'].values.tolist(), max_len)["input_ids"]
train_sample_x = train_df['text'].tolist()
train_y = train_df.drop('text',axis=1)
val_sample_x = val_df['text'].tolist()
inputs=encode(train_sample_x, 512)
train_x = inputs["input_ids"]
val_x = encode(val_sample_x, 512)["input_ids"]
val_y=val_df.drop('text',axis=1)
mask= inputs['attention_mask']
train_ds = (
    tf.data.Dataset
    .from_tensor_slices((train_x, train_y))
    .batch(batch_size=2)
    .repeat()
    .prefetch(tf.data.experimental.AUTOTUNE)
)
val_ds = (
    tf.data.Dataset
    .from_tensor_slices((val_x, val_y))
    .batch(2)
    .prefetch(tf.data.experimental.AUTOTUNE)
)
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_binary_crossentropy',
    patience=5,  # No early stopping
    restore_best_weights=True,  # Despite no early stopping, use the best weights
)
model = create_model(512)



Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_word_ids (InputLayer)  [(None, 512)]            0         
                                                                 
 tf_bert_model_4 (TFBertMode  TFBaseModelOutputWithPoo  109482240
 l)                          lingAndCrossAttentions(l            
                             ast_hidden_state=(None,             
                             512, 768),                          
                              pooler_output=(None, 76            
                             8),                                 
                              past_key_values=None, h            
                             idden_states=((None, 512            
                             , 768),                             
                              (None, 512, 768),                  
                              (None, 512, 768),            

In [47]:
history = model.fit(
    train_ds,
    steps_per_epoch=1,
    batch_size=2,
    epochs=5,
    verbose=1,
    shuffle=True,
    callbacks=[early_stopping],
    validation_data=val_ds,
    validation_batch_size=2,
)
train_loss_epochs = []
val_loss_epochs = []
for tl in history.history["loss"]:
    train_loss_epochs.append(tl)
for vl in history.history["val_loss"]:
    val_loss_epochs.append(vl)

Epoch 1/5








Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [4]:
from transformers import BertModel, BertTokenizer
import torch

# Load the pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Freeze the weights of the first layers
for param in model.parameters():
    param.requires_grad = False
for param in model.pooler.parameters():
    param.requires_grad = True

model.pooler = torch.nn.AdaptiveMaxPool1d(47)

# Replace the classification layer with a linear layer
model.classifier = torch.nn.Sequential(
    torch.nn.Linear(model.config.hidden_size, 47)
)


# Unfreeze the weights of the last 4 layers
for param in model.encoder.layer[-4:].parameters():
    param.requires_grad = True

# Fine-tune the model on your downstream task
train_dataset = train_df
dev_dataset = val_df
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
for epoch in range(3):
    inputs = tokenizer(train_dataset['text'].tolist(), padding=True, truncation=True, return_tensors='pt')
    labels = torch.tensor(train_dataset.drop('text',axis=1).values)
    outputs = model(inputs['input_ids'], attention_mask=inputs['attention_mask'], token_type_ids=inputs['token_type_ids'])
    pooled_output = outputs.pooler_output
    #print(pooled_output.shape)
    #print(labels.shape)
    logits = model.pooler(pooled_output)
    loss_fn = torch.nn.CrossEntropyLoss()
    loss = loss_fn(logits, labels)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    # Evaluate the model on the dev set
    with torch.no_grad():
        inputs = tokenizer(dev_dataset['text'].tolist(), padding=True, truncation=True, return_tensors='pt')
        labels = torch.tensor(dev_dataset.drop('text',axis=1).values)
        outputs = model(inputs['input_ids'], attention_mask=inputs['attention_mask'], token_type_ids=inputs['token_type_ids'])
        pooled_output = outputs.pooler_output
        logits = model.pooler(pooled_output)
        preds = torch.argmax(logits, dim=1)
        acc = (preds == labels).float().mean().item()
    print(f"Epoch {epoch+1}: Dev accuracy = {acc}")

# Use the fine-tuned model to make predictions on new data
test_dataset = test_df
with torch.no_grad():
    inputs = tokenizer(test_dataset['text'].tolist(), padding=True, truncation=True, return_tensors='pt')
    outputs = model(inputs['input_ids'], attention_mask=inputs['attention_mask'], token_type_ids=inputs['token_type_ids'])
    pooled_output = outputs.pooler_output
    logits = model.pooler(pooled_output)
    preds = torch.argmax(logits, dim=1)
    test_dataset['pred'] = preds.tolist()



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


torch.Size([47, 47])
Epoch 1: Dev accuracy = 0.02308736927807331
torch.Size([47, 47])
Epoch 2: Dev accuracy = 0.020823901519179344
torch.Size([47, 47])
Epoch 3: Dev accuracy = 0.023540062829852104
