In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional,LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer
from sklearn.preprocessing import LabelEncoder

# Load data from CSV file
data = pd.read_csv('/content/augmented_data (1) - Copy.csv')

# Preprocess data
X = data['Questions'].values
y = data['Blooms Taxonomy'].values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
num_classes = len(label_encoder.classes_)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Convert input text to BERT token IDs and pad sequences
max_len = 128
X_encoded = tokenizer.batch_encode_plus(X, add_special_tokens=True, max_length=max_len, padding='max_length', truncation=True)
X_padded = pad_sequences(X_encoded['input_ids'], maxlen=max_len, dtype='long', truncating='post', padding='post')

# Split data into training and testing sets
train_size = int(0.8 * len(X))
X_train, X_test = X_padded[:train_size], X_padded[train_size:]
y_train, y_test = y_encoded[:train_size], y_encoded[train_size:]

# Define a custom learning rate schedule
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.001,
    decay_steps=10000,
    decay_rate=0.96)

# Build and train model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.vocab), output_dim=100, input_length=max_len))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(32, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule), metrics=['accuracy'])
model.summary()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 128, 100)          3052200   
                                                                 
 bidirectional (Bidirection  (None, 128, 128)          84480     
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 128, 128)          0         
                                                                 
 bidirectional_1 (Bidirecti  (None, 128)               98816     
 onal)                                                           
                                                                 
 dense (Dense)               (None, 32)                4128      
                                                                 
 dense_1 (Dense)             (None, 38)                1

In [2]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=30, batch_size=16)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7981dc555a80>

In [3]:
model.evaluate(X_test,y_test)



[2.2508199214935303, 0.6732348203659058]