from https://www.kaggle.com/code/imvision12/tensorflow-feedback-bert-baseline/notebook

# imports

In [1]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from transformers import TFBertModel


# inputs

In [2]:
# Read the competition training table and preview its first rows.
train_csv_path = "../input/feedback-prize-effectiveness/train.csv"
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate


In [3]:
# List the distinct discourse types present in the training data.
df['discourse_type'].unique()

array(['Lead', 'Position', 'Claim', 'Evidence', 'Counterclaim',
       'Rebuttal', 'Concluding Statement'], dtype=object)

# configuration

In [4]:
# Let tf.data pick the prefetch buffer size at runtime.
AUTO = tf.data.experimental.AUTOTUNE
# Configuration
SEED = 42         # seeds NumPy and TensorFlow so repeated runs are comparable
EPOCHS = 5        # kept equal to the `epochs` value passed to model.fit
BATCH_SIZE = 16   # examples per batch for both training and validation
MAX_LEN = 256     # every text is padded / truncated to this many tokens

# Seed the global generators before any dataset shuffling or weight
# initialisation happens further down the notebook.
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Bert encoder

In [5]:
def bert_encode(texts, tokenizer, max_len=MAX_LEN):
    """Tokenize every text and stack the three BERT input fields as arrays.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Callable invoked once per text with ``max_length``,
            ``truncation``, ``padding`` and ``add_special_tokens`` keywords.
            Its result must be indexable by ``'input_ids'``,
            ``'token_type_ids'`` and ``'attention_mask'``.
        max_len: Length every encoded text is padded or truncated to.

    Returns:
        Tuple ``(input_ids, token_type_ids, attention_mask)`` of NumPy
        arrays, each holding one row per input text.
    """
    fields = ('input_ids', 'token_type_ids', 'attention_mask')
    collected = {field: [] for field in fields}

    for text in texts:
        encoded = tokenizer(
            text,
            max_length=max_len,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
        )
        for field in fields:
            collected[field].append(encoded[field])

    return tuple(np.array(collected[field]) for field in fields)

# loading bert tokenizer

In [6]:

# Load the cased BERT tokenizer from the local Kaggle dataset, then write a
# copy of its files into the working directory.
tokenizer = transformers.BertTokenizer.from_pretrained(
    '../input/huggingface-bert-variants/bert-base-cased/bert-base-cased'
)
tokenizer.save_pretrained('.')

('./tokenizer_config.json',
 './special_tokens_map.json',
 './vocab.txt',
 './added_tokens.json')

In [7]:
# Separator token of the loaded tokenizer; used below to join the discourse
# type and the discourse text into a single input string.
sep = tokenizer.sep_token
sep

'[SEP]'

# adding discourse type to the input

In [8]:
# Prefix each discourse text with its discourse type, joined by the
# tokenizer's separator token.
df['inputs'] = df['discourse_type'] + sep + df['discourse_text']

In [9]:
# Preview the frame with the newly added 'inputs' column.
df.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,inputs
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,"Lead[SEP]Hi, i'm Isaac, i'm going to be writin..."
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,"Position[SEP]On my perspective, I think that t..."
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,Claim[SEP]I think that the face is a natural l...
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,"Evidence[SEP]If life was on Mars, we would kno..."
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,Counterclaim[SEP]People thought that the face ...


# converting the discourse effectiveness label from text to an integer class id

In [10]:
# Map the three effectiveness classes to integer ids, then expose the
# column under the name 'label'.
new_label = {
    "discourse_effectiveness": {
        "Ineffective": 0,
        "Adequate": 1,
        "Effective": 2,
    }
}
df = df.replace(new_label).rename(columns={"discourse_effectiveness": "label"})

In [11]:
# Preview the frame after the label column has been converted and renamed.
df.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,label,inputs
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,1,"Lead[SEP]Hi, i'm Isaac, i'm going to be writin..."
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,1,"Position[SEP]On my perspective, I think that t..."
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,1,Claim[SEP]I think that the face is a natural l...
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,1,"Evidence[SEP]If life was on Mars, we would kno..."
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,1,Counterclaim[SEP]People thought that the face ...


# train / validation split

In [12]:
# Hold out 12% of the rows for validation; the fixed random_state keeps the
# split identical between runs. (train_test_split is imported in the
# notebook's top import cell.)
X_train, X_valid, y_train, y_valid = train_test_split(
    df['inputs'],
    df['label'],
    test_size=0.12,
    random_state=42,
)

# BERT-encoding the train and validation data

BERT does not take raw strings: each text is converted to token ids, token type ids and an attention mask.

In [13]:
# Tokenize both splits; each X_* becomes a tuple of
# (input_ids, token_type_ids, attention_mask) arrays.
X_train, X_valid = (
    bert_encode(split.astype(str), tokenizer) for split in (X_train, X_valid)
)

# Labels as plain NumPy arrays.
y_train, y_valid = y_train.values, y_valid.values

In [14]:
# Training pipeline: repeat indefinitely, shuffle within a 2048-element
# buffer, batch, and prefetch.
train_slices = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_dataset = (
    train_slices.repeat().shuffle(2048).batch(BATCH_SIZE).prefetch(AUTO)
)

# Validation pipeline: one ordered pass over the held-out rows, with the
# batches cached and prefetched.
valid_slices = tf.data.Dataset.from_tensor_slices((X_valid, y_valid))
valid_dataset = valid_slices.batch(BATCH_SIZE).cache().prefetch(AUTO)

2022-06-09 01:08:09.206591: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-09 01:08:09.207653: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-09 01:08:09.208398: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-09 01:08:09.210682: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

# building model

In [15]:
def build_model(bert_model, max_len=MAX_LEN):
    """Wrap a BERT encoder with a 3-class softmax head and compile it.

    Args:
        bert_model: Encoder callable (a ``TFBertModel`` in this notebook)
            that accepts ``input_ids`` positionally plus ``token_type_ids``
            and ``attention_mask`` keywords; element ``[0]`` of its output
            is used as the per-token sequence output.
        max_len: Fixed sequence length of the three integer inputs.

    Returns:
        A compiled Keras ``Model`` that takes
        ``[input_ids, token_type_ids, attention_mask]`` and outputs three
        softmax probabilities per example.
    """
    input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    token_type_ids = Input(shape=(max_len,), dtype=tf.int32, name="token_type_ids")
    attention_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

    sequence_output = bert_model(
        input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_mask,
    )[0]
    # Use the representation at the first token position as the summary
    # vector for classification.
    clf_output = sequence_output[:, 0, :]
    clf_output = Dropout(.1)(clf_output)
    out = Dense(3, activation='softmax')(clf_output)

    model = Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=out)
    # `learning_rate` replaces the deprecated `lr` keyword, which triggers a
    # deprecation warning and is rejected by newer Keras releases.
    model.compile(
        Adam(learning_rate=1e-5),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [16]:
%%time
transformer_layer = (TFBertModel.from_pretrained('../input/huggingface-bert-variants/bert-base-cased/bert-base-cased'))
model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

Some layers from the model checkpoint at ../input/huggingface-bert-variants/bert-base-cased/bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at ../input/huggingface-bert-variants/bert-base-cased/bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
  "The `lr` argument is deprecated, use `learning_rate` instead."

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 256)]        0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 256)]        0                                            
__________________________________________________________________________________________________
token_type_ids (InputLayer)     [(None, 256)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     TFBaseModelOutputWit 108310272   input_ids[0][0]                  
                                                                 attention_mask[0][0]         

# training

In [17]:
# The training dataset repeats indefinitely, so the length of an epoch is
# set explicitly: 5 epochs of 200 steps each, with metrics also reported on
# the validation dataset.
fit_options = dict(
    steps_per_epoch=200,
    validation_data=valid_dataset,
    epochs=5,
)
train_history = model.fit(train_dataset, **fit_options)

Epoch 1/5


2022-06-09 01:08:38.415291: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# getting the test data

In [18]:
# Load the test table and build its model input the same way as for the
# training rows: discourse type, separator token, discourse text.
test_csv_path = "../input/feedback-prize-effectiveness/test.csv"
test = pd.read_csv(test_csv_path)
test['text'] = test['discourse_type'] + sep + test['discourse_text']
test.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,text
0,a261b6e14276,D72CB1C11673,Making choices in life can be very difficult. ...,Lead,Lead[SEP]Making choices in life can be very di...
1,5a88900e7dc1,D72CB1C11673,Seeking multiple opinions can help a person ma...,Position,Position[SEP]Seeking multiple opinions can hel...
2,9790d835736b,D72CB1C11673,it can decrease stress levels,Claim,Claim[SEP]it can decrease stress levels
3,75ce6d68b67b,D72CB1C11673,a great chance to learn something new,Claim,Claim[SEP]a great chance to learn something new
4,93578d946723,D72CB1C11673,can be very helpful and beneficial.,Claim,Claim[SEP]can be very helpful and beneficial.


# readying for test predictions

In [19]:
# Tokenize the test inputs into (input_ids, token_type_ids, attention_mask).
test_text = bert_encode(test['text'].astype(str), tokenizer)

In [20]:
# Load the sample submission to see the expected columns.
sample_submission_path = "../input/feedback-prize-effectiveness/sample_submission.csv"
sub = pd.read_csv(sample_submission_path)
sub.head()

Unnamed: 0,discourse_id,Ineffective,Adequate,Effective
0,a261b6e14276,0.2,0.6,0.4
1,5a88900e7dc1,3.0,6.0,1.0
2,9790d835736b,1.0,2.0,3.0
3,75ce6d68b67b,0.33,0.34,0.33
4,93578d946723,0.01,0.24,0.47


# prediction

In [21]:
# Score the encoded test rows; each output row holds the three softmax
# probabilities produced by the model's final Dense layer.
preds = model.predict(test_text, verbose=1)
preds



array([[0.03153076, 0.6097489 , 0.3587203 ],
       [0.02448042, 0.59593344, 0.3795861 ],
       [0.03012707, 0.6410126 , 0.3288603 ],
       [0.08343229, 0.69677824, 0.21978942],
       [0.0515845 , 0.70249474, 0.24592078],
       [0.05449399, 0.54054034, 0.40496573],
       [0.05874411, 0.53515947, 0.40609637],
       [0.03773963, 0.58636755, 0.37589288],
       [0.10370583, 0.60517573, 0.29111844],
       [0.0234286 , 0.6395666 , 0.33700478]], dtype=float32)

In [22]:
# Column order follows the integer label mapping used for training
# (0 = Ineffective, 1 = Adequate, 2 = Effective).
LABEL_COLUMNS = ['Ineffective', 'Adequate', 'Effective']

# Build the submission from the ids of the rows that were actually scored,
# so every prediction is tied to its own discourse_id instead of relying on
# sample_submission.csv listing the rows in the same order as test.csv.
sub = pd.DataFrame(preds, columns=LABEL_COLUMNS)
sub.insert(0, 'discourse_id', test['discourse_id'].values)
sub

Unnamed: 0,discourse_id,Ineffective,Adequate,Effective
0,a261b6e14276,0.031531,0.609749,0.35872
1,5a88900e7dc1,0.02448,0.595933,0.379586
2,9790d835736b,0.030127,0.641013,0.32886
3,75ce6d68b67b,0.083432,0.696778,0.219789
4,93578d946723,0.051584,0.702495,0.245921
5,2e214524dbe3,0.054494,0.54054,0.404966
6,84812fc2ab9f,0.058744,0.535159,0.406096
7,c668ff840720,0.03774,0.586368,0.375893
8,739a6d00f44a,0.103706,0.605176,0.291118
9,bcfae2c9a244,0.023429,0.639567,0.337005


In [23]:
# Write the submission file without the DataFrame index column.
submission_path = "submission.csv"
sub.to_csv(submission_path, index=False)