In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

import warnings
import logging
import pickle
import inspect
import os
import gc



# disabling unnecceseray warnings
warnings.simplefilter("ignore")
logging.disable(logging.ERROR)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer
from tensorflow import keras
from keras import layers
import datetime
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from tensorflow.keras.callbacks import ReduceLROnPlateau
from sklearn.model_selection import train_test_split

keras.mixed_precision.set_global_policy("mixed_float16")

# Limit the GPU memory growth using TensorFlow
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], True)
# tf.config.experimental.set_memory_growth(physical_devices[1], True)

import random

# Set random seeds
random_seed = 42
np.random.seed(random_seed)
tf.random.set_seed(random_seed)
random.seed(random_seed)
keras.utils.set_random_seed(random_seed)

!pip install autocorrect
from autocorrect import Speller
spell = Speller(lang='en', fast=True)
spell('helo')

In [None]:
data_path = '/kaggle/input/commonlit-evaluate-student-summaries/'


def _load_split(split):
    """Read one split of the competition data and join prompts with summaries.

    Args:
        split: Suffix of the CSV files to read, ``'train'`` or ``'test'``.

    Returns:
        Tuple ``(prompts, summaries, merged)`` where ``merged`` is the inner
        join of both frames on ``prompt_id``.
    """
    prompts = pd.read_csv(data_path + f'prompts_{split}.csv')
    summaries = pd.read_csv(data_path + f'summaries_{split}.csv')
    return prompts, summaries, prompts.merge(summaries, on="prompt_id")


# Same global names as before; the intermediate `.head(1)` expressions were
# dropped because only the last expression of a cell is displayed.
train_pro, train_sum, train = _load_split('train')
test_pro, test_sum, test = _load_split('test')

test.head()

In [None]:
# Hugging Face checkpoint shared by the tokenizer and the encoder
model_name = "microsoft/deberta-v3-large"

# Instantiate the tokenizer first, then the TensorFlow encoder, from that checkpoint
tokenizer, pre_trained_model = (
    AutoTokenizer.from_pretrained(model_name),
    TFAutoModel.from_pretrained(model_name),
)

In [None]:
# Spell-correct every summary, then assemble the text fed to the tokenizer:
#   <title> SEP <prefix1><question> SEP <prefix2><summary>
sep = f" {tokenizer.sep_token} "

prefix1 = "Think through this step by step: "
prefix2 = "Pay attention to the content and wording: "

# `spell` is callable, so it can be handed to apply directly
train["text"] = train["text"].apply(spell)

question_part = prefix1 + train['prompt_question']
summary_part = prefix2 + train['text']
train['input'] = train['prompt_title'] + sep + question_part + sep + summary_part

# Show the first assembled input as a sanity check
train['input'].loc[0]

In [None]:
# Preprocessing features and labels

# Sequence length (in tokens) every example is padded / truncated to.
# NOTE(review): the prefix lengths added here are character counts, not token
# counts; the resulting value has to match the `input_shape` the full model is
# built with - confirm before changing either prefix.
MAX_SUMMARY_LENGTH = 1500 + len(prefix1) + len(prefix2)

# Tokenize the assembled inputs into fixed-length TensorFlow tensors
X_train = tokenizer.batch_encode_plus(
    train['input'].tolist(),
    add_special_tokens=True,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    max_length=MAX_SUMMARY_LENGTH,
    return_attention_mask=True,
)
# The model only consumes input_ids and attention_mask. Not every tokenizer
# returns token_type_ids, so drop the key only when it is present instead of
# failing with a KeyError.
X_train.pop('token_type_ids', None)

# Regression targets: one (content, wording) pair per summary
Y_train = tf.constant(train[['content', 'wording']].values, dtype=tf.float32)

In [None]:
# Create head mask
#
# For every sequence the mask is 0.0 up to the first separator, and from the
# second separator on it flips at each separator (the separator position takes
# the new value). Position j is therefore 1.0 exactly when the number of
# separators seen in positions 0..j is even and non-zero.
# This is the vectorised equivalent of walking each sequence token by token.
sep_seen = np.cumsum(X_train['input_ids'].numpy() == tokenizer.sep_token_id, axis=1)
head_mask = tf.constant(((sep_seen > 0) & (sep_seen % 2 == 0)).astype(np.float64))
head_mask

In [None]:
# def get_embeddings(input_ids, attention_mask, model_name):
    
#     # Forward pass through pre trained model
#     outputs = pre_trained_model(input_ids=input_ids, attention_mask=attention_mask)
    
#     if model_name == 'roberta-large':
#         return outputs['pooler_output']
#     else:
#         return outputs[0]

# # Save roberta/deberta embeddings in the training set

# batch_size = 10 # ten is the biggest batch possible (can try maybe 11)
# num_samples = len(X_train['input_ids'])
# num_batches = (num_samples + batch_size - 1) // batch_size
# averaged_embeddings = []

# for i in range(num_batches):
#     start_idx = i * batch_size
#     end_idx = min((i + 1) * batch_size, num_samples)
#     inputs = X_train['input_ids'][start_idx: end_idx]
#     masks = X_train['attention_mask'][start_idx: end_idx]
    
#     embeddings = get_embeddings(input_ids=inputs, attention_mask=masks, model_name=model_name)
#     h_mask = tf.expand_dims(tf.cast(head_mask[start_idx: end_idx], dtype=tf.float32), axis=-1)
#     masked_outputs = tf.multiply(embeddings, h_mask)
#     pooled = (tf.reduce_mean(masked_outputs, axis=1)).numpy()
#     averaged_embeddings.append(pooled)
    
#     if i % int(num_batches * 0.1) == 0:
#         print(f"Batch {i}/{num_batches}")
        
#     del embeddings
#     del masked_outputs
#     del pooled
#     del h_mask
#     gc.collect()
#     tf.keras.backend.clear_session()
# # Write to a file    
# concatenated_embeddings = np.concatenate(averaged_embeddings, axis=0)
# with open('masked_pooled_deberta_embeddings.pkl', 'wb') as f:
#     pickle.dump(concatenated_embeddings, f)

In [None]:
# Load the pre-computed DeBERTa embeddings from the attached input datasets

# CSV holding one comma-separated embedding string per row
file_path = '/kaggle/input/pooled-deberta-embeddings/pooled_deberta_embeddings.csv'# from input folder

# Masked, pooled embeddings stored as a pickled array.
# NOTE(review): pickle.load can execute arbitrary code - only load files you trust.
with open('/kaggle/input/masked-pooled-deberta-embeddings/mask_pooled_deberta_embeddings.pkl', 'rb') as f:
    loaded_array = pickle.load(f)

X_train_preprocessed = pd.read_csv(file_path)

# Parse each "v1,v2,..." string into a list of floats
X_train_preprocessed['embeddings'] = X_train_preprocessed['embeddings'].apply(
    lambda row: [float(value) for value in row.split(',')]
)
# Attach the masked embeddings, one list per row
X_train_preprocessed['masked_embeddings'] = loaded_array.tolist()
# Save a csv file
# df_to_save = df_with_embeddings['pooled_roberta_embedding'].apply(lambda x: ','.join(map(str, x)))
# df_to_save.to_csv(file_path, index=False)

In [None]:
# The loss function
def mcrmse(y_true, y_pred):
    """Mean column-wise root mean squared error.

    The squared error is averaged over axis 0 (the batch) for each target
    column, the square root is taken per column, and the per-column RMSE values
    are averaged over the last axis.

    Both tensors are cast to float32 first. The notebook enables the global
    ``mixed_float16`` policy, so predictions may arrive in float16; squaring and
    averaging in half precision loses accuracy, and a dtype mismatch between
    the two arguments would make the subtraction fail when the function is
    called directly.

    Args:
        y_true: Ground-truth targets, one column per target.
        y_pred: Predictions with the same shape as ``y_true``.

    Returns:
        The float32 MCRMSE over the batch.
    """
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(y_pred, tf.float32)
    columnwise_mse = tf.reduce_mean(tf.square(y_true - y_pred), axis=0)
    return tf.reduce_mean(tf.sqrt(columnwise_mse), axis=-1)

# log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [None]:
# # NN with embeddings preprocessed
# def create_model_preprocessed():
#     input_shape = len(X_train_preprocessed['masked_embeddings'][0])

#     input_layer = keras.Input(shape=(input_shape, ), dtype='float32')
    
#     layer_norm = layers.LayerNormalization(name='layer_norm1')(input_layer)
    
#     reshape_input_layer = layers.Reshape((1,input_shape), name='reshape_layer')(layer_norm)
    
#     LSTM_layer = layers.LSTM(512, return_sequences=True, name='LSTM_layer1', activation='linear')(reshape_input_layer)
    
#     layer_norm = layers.LayerNormalization(name='layer_norm2')(LSTM_layer)
    
#     act = layers.Activation(keras.activations.tanh, name='tanh1')(layer_norm)
    
#     LSTM_layer = layers.LSTM(32, return_sequences=False, name='LSTM_layer2', activation='linear',)(act)
    
#     layer_norm = layers.LayerNormalization(name='layer_norm3')(LSTM_layer)
    
#     act = layers.Activation(keras.activations.tanh, name='tanh2')(layer_norm)
    
#     hidden_layer = layers.Dense(16, activation='linear', name='dense_layer')(act)
    
#     dropout = layers.Dropout(0.3, name='dropout_layer')(hidden_layer)
    
#     # batch_norm = layers.BatchNormalization(name='batch_norm')(dropout)

#     output_layer = layers.Dense(2, activation='linear', name='output_layer')(dropout)
    
#     model = keras.Model(inputs=input_layer, outputs=output_layer)

#     for layer in model.layers:
#         layer.trainable = True
        
    
#     opt = keras.optimizers.Adam(learning_rate=lr_schedule)
#     model.compile(loss=mcrmse, optimizer=opt)
    
#     return model

# model = create_model_preprocessed()
# model.summary()

In [None]:
# Train preprocessed model (head only) without validation
#
# NOTE(review): this cell reads `model`, `X_train_input` and `Y_train_np`, none
# of which is assigned by a cell above it (`X_train_input` and `Y_train_np` are
# first assigned in the K-fold cell below). On a fresh kernel it fails with a
# NameError; it only works with state left over from out-of-order execution.

# Checkpoint callback: writes weights only, keeping the best value of the
# monitored training loss (no validation data is passed to fit below)
ckptcb = keras.callbacks.ModelCheckpoint(
    "best_model" + ".weights.h5",
    monitor="loss",
    save_best_only=True,
    save_weights_only=True,
    mode="min",
)    

# Fit on the full training set; `history` keeps the per-epoch metrics
history = model.fit(x=X_train_input,
                    y=Y_train_np,
                    epochs=25,
                    batch_size=4,
                    callbacks=[ckptcb],
                    verbose=2)

In [None]:
# Train preprocessed model (head only) with K folds
#
# NOTE(review): `create_model_preprocessed` (and the `lr_schedule` it uses) are
# defined in a later cell of this notebook; that cell has to be executed before
# this one.

# Features are the cached masked embeddings, targets the (content, wording) scores
X_train_input = np.array(X_train_preprocessed['masked_embeddings'].tolist())
Y_train = tf.constant(train[['content', 'wording']].values, dtype=tf.float32)
Y_train_np = Y_train.numpy()

# Initialize the KFold object
kf = KFold(n_splits=5, shuffle=True, random_state=random_seed)

# Best validation loss and full training history of every fold
val_losses = []
histories = []

# Iterate over each fold
for fold_number, (train_index, val_index) in enumerate(kf.split(X_train_input, Y_train_np), start=1):

    print(f"Fold {fold_number}")

    # Split data into training and validation sets
    X_train_fold, X_val_fold = X_train_input[train_index], X_train_input[val_index]
    Y_train_fold, Y_val_fold = Y_train_np[train_index], Y_train_np[val_index]

    # Fresh head-only model per fold. `create_model()` builds the full
    # DeBERTa model, expects token inputs and returns a tuple, so it cannot be
    # fitted on the cached embeddings.
    model = create_model_preprocessed()

    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    # Train the model. Epoch count and batch size mirror the head-only
    # training cell above; early stopping usually ends the run sooner.
    history = model.fit(x=X_train_fold,
                        y=Y_train_fold,
                        validation_data=(X_val_fold, Y_val_fold),
                        epochs=25,
                        batch_size=4,
                        callbacks=[early_stopping],
                        verbose=2)

    # Keep the best (lowest) validation loss reached in this fold
    val_loss = min(history.history['val_loss'])
    val_losses.append(val_loss)
    histories.append(history)
    print()

# Calculate the mean validation loss
mean_val_loss = np.mean(val_losses)
print("Mean Validation Loss:", mean_val_loss)

In [None]:
# Plot training and val losses across folds
#
# Loop-local names are prefixed with `fold_` so this cell no longer overwrites
# the notebook-level `val_losses` list and `history` object.
for fold_idx, fold_history in enumerate(histories):
    fold_train_losses = fold_history.history['loss']
    fold_val_losses = fold_history.history['val_loss']
    epoch_axis = range(1, len(fold_train_losses) + 1)

    # One figure per fold; the curves sit in the left slot of a 1x2 grid,
    # matching the original layout
    fig = plt.figure(figsize=(12, 5))
    ax = fig.add_subplot(1, 2, 1)
    ax.plot(epoch_axis, fold_train_losses, 'b', label='Training loss')
    ax.plot(epoch_axis, fold_val_losses, 'r', label='Validation loss')
    ax.set_title(f'Training and Validation Loss Fold {fold_idx + 1}')
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Loss')
    ax.legend()

    fig.tight_layout()
    plt.show()
    # Release the figure so many folds do not accumulate in memory
    plt.close(fig)

In [None]:
# build our NN on top of Deberta

class PreTrainedModel(keras.Model):
    """Keras model that wraps a pre-trained transformer so it can be used as a layer.

    Args:
        pre_trained_model: The transformer to wrap.
        trainable: Value assigned to this wrapper's and the wrapped model's
            ``trainable`` attribute.
        num_layers_to_freeze: Only used when ``trainable`` is true. The encoder
            blocks selected by the slice ``[:num_layers_to_freeze]`` are set
            to non-trainable, so ``0`` freezes none and ``-1`` freezes all
            but the last block.
        name: Optional Keras name of the wrapper.
        **kwargs: Forwarded to ``keras.Model``.
    """

    def __init__(self, pre_trained_model, trainable=False, num_layers_to_freeze=0, name=None, **kwargs):
        super().__init__(name=name, **kwargs)
        self.pre_trained_model = pre_trained_model
        self.trainable = trainable
        self.num_layers_to_freeze = num_layers_to_freeze
        self.pre_trained_model.trainable = self.trainable

        # Nothing to selectively freeze when the whole backbone is frozen
        if not self.trainable:
            return

        # Freeze the leading encoder blocks, leaving the remaining ones trainable
        encoder_blocks = self.pre_trained_model.layers[0].encoder.layer
        for block in encoder_blocks[:self.num_layers_to_freeze]:
            block.trainable = False

    def call(self, input_ids, attention_mask):
        """Run the wrapped model and return the first element of its output."""
        model_outputs = self.pre_trained_model(input_ids=input_ids, attention_mask=attention_mask)
        return model_outputs[0]

In [None]:
# Cosine-decayed learning rate shared by the optimizers created below
lr_schedule = tf.keras.optimizers.schedules.CosineDecay(0.00015, decay_steps=10000)

# NN with embeddings preprocessed
def create_model_preprocessed(embeddings_len=1024):
    """Build and compile the regression head that runs on pre-computed embeddings.

    Layer names and creation order are kept stable on purpose: the notebook
    loads stored weights into this model and copies them by layer name into
    the full model.

    Args:
        embeddings_len: Length of one input embedding vector. Defaults to 1024,
            the size previously hard-coded in this function.

    Returns:
        A compiled ``keras.Model`` mapping ``(batch, embeddings_len)`` inputs
        to two linear outputs, trained with the ``mcrmse`` loss.
    """
    input_layer = keras.Input(shape=(embeddings_len, ), dtype='float32')

    x = layers.LayerNormalization(name='layer_norm1')(input_layer)

    # Treat each embedding as a sequence of length one for the LSTM layers
    x = layers.Reshape((1, embeddings_len), name='reshape_layer')(x)

    x = layers.LSTM(512, return_sequences=True, name='LSTM_layer1', activation='linear')(x)
    x = layers.LayerNormalization(name='layer_norm2')(x)
    x = layers.Activation(keras.activations.tanh, name='tanh1')(x)

    x = layers.LSTM(32, return_sequences=False, name='LSTM_layer2', activation='linear',)(x)
    x = layers.LayerNormalization(name='layer_norm3')(x)
    x = layers.Activation(keras.activations.tanh, name='tanh2')(x)

    x = layers.Dense(16, activation='linear', name='dense_layer')(x)
    x = layers.Dropout(0.3, name='dropout_layer')(x)

    output_layer = layers.Dense(2, activation='linear', name='output_layer')(x)

    # All layers are freshly created and therefore trainable by default
    model = keras.Model(inputs=input_layer, outputs=output_layer)

    opt = keras.optimizers.Adam(learning_rate=lr_schedule)
    model.compile(loss=mcrmse, optimizer=opt)

    return model

# Rebuild the head-only model and restore its stored weights from the input dataset
HEAD_WEIGHTS_PATH = "/kaggle/input/model-weights-1/best_model.weights.h5"

model_preprocessed = create_model_preprocessed()
model_preprocessed.load_weights(HEAD_WEIGHTS_PATH)
model_preprocessed.summary()

In [None]:
# Cosine-decayed learning rate used by the optimizer created below
lr_schedule = tf.keras.optimizers.schedules.CosineDecay(0.00015, decay_steps=10000)

# Full model: wrapped transformer + masked average pooling + regression head
def create_model(input_shape=(1575,), embeddings_len=1024):
    """Build and compile the end-to-end model on top of the pre-trained encoder.

    The encoder output is multiplied by the head mask, averaged over the
    sequence axis and passed through the same head layers (same names) as the
    head-only model, so the head weights can be copied over by name.

    Args:
        input_shape: Shape of one tokenised example, without the batch axis.
        embeddings_len: Size of the encoder's hidden vectors.

    Returns:
        Tuple ``(model, pre_trained_model_instance)``: the compiled Keras model
        and the ``PreTrainedModel`` wrapper used inside it.
    """
    pre_trained_model_instance = PreTrainedModel(pre_trained_model, name="deberta_layer")

    # Input layers (the local name differs from the notebook-level `head_mask`
    # tensor to avoid shadowing it; the Keras input name is unchanged)
    input_ids = keras.Input(shape=input_shape, dtype='int32', name='input_ids')
    attention_mask = keras.Input(shape=input_shape, dtype='int32', name='attention_mask')
    head_mask_in = keras.Input(shape=input_shape, dtype='float32', name='head_mask')

    # Create embeddings and mask pool them
    deberta = pre_trained_model_instance(input_ids, attention_mask)
    h_mask = layers.Lambda(lambda x: tf.expand_dims(tf.cast(x, dtype=tf.float32), axis=-1), name='expand_dims')(head_mask_in)
    # The declared output shape follows the function arguments instead of the
    # previously hard-coded (1575, 1024), so non-default sizes stay consistent
    masked_shape = tuple(input_shape) + (embeddings_len,)
    masked_outputs = layers.Lambda(lambda x: tf.multiply(x[0], x[1]), output_shape=masked_shape, name='masked_embeddings')([deberta, h_mask])
    # Averages over every sequence position, including the zeroed-out ones
    avg_pooling = layers.GlobalAveragePooling1D()(masked_outputs)

    # Head of NN
    x = layers.LayerNormalization(name='layer_norm1')(avg_pooling)
    x = layers.Reshape((1, embeddings_len), name='reshape_layer')(x)

    x = layers.LSTM(512, return_sequences=True, name='LSTM_layer1', activation='linear')(x)
    x = layers.LayerNormalization(name='layer_norm2')(x)
    x = layers.Activation(keras.activations.tanh, name='tanh1')(x)

    x = layers.LSTM(32, return_sequences=False, name='LSTM_layer2', activation='linear',)(x)
    x = layers.LayerNormalization(name='layer_norm3')(x)
    x = layers.Activation(keras.activations.tanh, name='tanh2')(x)

    x = layers.Dense(16, activation='linear', name='dense_layer')(x)
    x = layers.Dropout(0.3, name='dropout_layer')(x)

    output_layer = layers.Dense(2, activation='linear', name='output_layer')(x)

    model = keras.Model(inputs=[input_ids, attention_mask, head_mask_in], outputs=output_layer)

    # NOTE(review): this also sets trainable=True on the "deberta_layer"
    # wrapper, which was created with its default trainable=False - presumably
    # unfreezing the encoder for full fine-tuning; confirm this is intended.
    for layer in model.layers:
        layer.trainable = True

    opt = keras.optimizers.Adam(learning_rate=lr_schedule)
    model.compile(loss=mcrmse, optimizer=opt)

    return model, pre_trained_model_instance

# Build the full model. create_model() returns the compiled model together with
# the PreTrainedModel wrapper it uses; this rebinds the notebook-level `model`.
model, deberta_model = create_model()
model.summary()
# deberta_model.summary()

In [None]:
# Transferring weights
#
# Every layer of the full model gets a "_full" suffix appended to its name; a
# layer whose original name also exists in the head-only model additionally
# receives that model's weights and is marked trainable.
# NOTE(review): the cell is not idempotent - running it twice appends the
# suffix again, after which no name matches any more.
head_layer_names = {head_layer.name for head_layer in model_preprocessed.layers}

for layer in model.layers:
    origin_name = layer.name
    layer.name = f"{origin_name}_full"    # Change the internal name attribute

    if origin_name not in head_layer_names:
        continue

    source_layer = model_preprocessed.get_layer(name=origin_name)
    layer.set_weights(source_layer.get_weights())
    layer.trainable = True
#model.summary() 
#model_preprocessed.summary()


In [None]:
# Targets for the full model: (content, wording) scores as a NumPy array
Y_train = tf.constant(train[['content', 'wording']].values, dtype=tf.float32)
Y_train_np = Y_train.numpy()

# Checkpoint callback: weights only, kept for the lowest training loss seen
ckptcb = keras.callbacks.ModelCheckpoint(
    "best_model" + ".weights.h5",
    monitor="loss",
    save_best_only=True,
    save_weights_only=True,
    mode="min",
)

# Inputs in the order the model declares them: ids, attention mask, head mask
full_model_inputs = [X_train['input_ids'], X_train['attention_mask'], head_mask]
fit_options = dict(epochs=6, batch_size=4, callbacks=[ckptcb], verbose=1)

history = model.fit(x=full_model_inputs, y=Y_train_np, **fit_options)


In [None]:
# Destination of the trained weights inside the working directory
file_path = "/kaggle/working/no_augmentation_model.weights.h5"

# Remove a copy left over from an earlier run before writing the new one
if not os.path.exists(file_path):
    print("File not found.")
else:
    os.remove(file_path)
    print("File deleted successfully.")

# Save the model's weights using the HDF5 format directly
model.save_weights(file_path)
print("Model saved successfully.")

In [None]:
# Apply the same spell-correction and input formatting as for the training set
test["text"] = test["text"].apply(spell)
test['input'] = test['prompt_title'] + sep + prefix1 + test['prompt_question'] + sep + prefix2 + test['text']

# Tokenize the test inputs to the same fixed length as the training inputs
X_test = tokenizer.batch_encode_plus(
    test['input'].tolist(),
    add_special_tokens=True,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    max_length=MAX_SUMMARY_LENGTH,
    return_attention_mask=True,
)
# Not every tokenizer returns token_type_ids; drop the key only when present
# instead of failing with a KeyError.
X_test.pop('token_type_ids', None)

# Create head mask
#
# Same rule as for the training data: 0.0 up to the first separator, and from
# the second separator on the value flips at each separator (the separator
# position takes the new value). Position j is 1.0 exactly when the number of
# separators seen in positions 0..j is even and non-zero.
sep_seen_test = np.cumsum(X_test['input_ids'].numpy() == tokenizer.sep_token_id, axis=1)
head_mask_test = tf.constant(((sep_seen_test > 0) & (sep_seen_test % 2 == 0)).astype(np.float64))

# Everything needed for inference: the three model inputs plus the row identifiers
test_data = dict(
    input_ids=X_test['input_ids'],
    attention_mask=X_test['attention_mask'],
    head_mask=head_mask_test,
    student_id=test['student_id'],
)
def generate_predictions(model, test_data):
    """Run the model on the test inputs and split the output per target.

    Args:
        model: Object with a Keras-style ``predict(x=..., batch_size=...)``
            method returning one ``(content, wording)`` row per example.
        test_data: Mapping with the keys ``'input_ids'``, ``'attention_mask'``,
            ``'head_mask'`` and ``'student_id'``. ``'student_id'`` may be a
            pandas Series or any positionally indexable sequence.

    Returns:
        Tuple ``(ids, contents, wordings)`` of equally long lists, aligned by
        position with the prediction rows.
    """
    predictions = model.predict(x=[test_data['input_ids'], test_data['attention_mask'], test_data['head_mask']],
                                batch_size=4)

    # Prediction rows are positional. `series[idx]` is a label lookup and breaks
    # (or mis-aligns) when the Series does not carry a default 0..n-1 index, so
    # use `.iloc` when available and plain indexing otherwise.
    student_ids = test_data['student_id']
    by_position = getattr(student_ids, 'iloc', student_ids)

    ids = [by_position[position] for position in range(len(predictions))]
    # First output column is the content score, second the wording score
    contents = [row[0] for row in predictions]
    wordings = [row[1] for row in predictions]
    return ids, contents, wordings

# Predict on the test set and write the competition submission file
ids, contents, wordings = generate_predictions(model, test_data)

submission_columns = dict(student_id=ids, content=contents, wording=wordings)
submission_df = pd.DataFrame(submission_columns)

submission_df.to_csv("submission.csv", index=False)
submission_df.head()