# Imports

In [1]:
# General imports
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import warnings
import datetime
import logging
import pickle
import random
import shutil
import os
import gc
import math
import spacy
import re
import string

# Neural network imports
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras.callbacks import EarlyStopping
from transformers import TFAutoModel, AutoTokenizer
from sklearn.model_selection import GroupKFold
!pip install /kaggle/input/autocorrect/autocorrect-2.6.1.tar
from autocorrect import Speller

# Lgbm imports
import lightgbm as lgb
from lightgbm import LGBMRegressor, log_evaluation, early_stopping
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import mean_squared_error
from typing import List
from tqdm import tqdm
import json
import nltk
from nltk.corpus import stopwords
from collections import Counter
# !pip install /kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl
# from spellchecker import SpellChecker
# spellchecker = SpellChecker()
# Register tqdm's pandas integration (adds `progress_apply` to pandas objects)
tqdm.pandas()

# Disable unnecessary warnings and log output (everything up to ERROR level),
# and turn off tokenizer parallelism through its environment variable
warnings.simplefilter("ignore")
logging.disable(logging.ERROR)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Global Keras dtype policy: mixed float16 precision
keras.mixed_precision.set_global_policy("mixed_float16")
# Spell checker instance; `preprocess` applies it to every student summary.
# (Random seeds are set separately by `seed_everything`.)
spell = Speller(lang='en', fast=True)

2024-07-01 12:30:50.087868: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-01 12:30:50.088168: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-01 12:30:50.239134: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  pid, fd = os.forkpty()


Processing /kaggle/input/autocorrect/autocorrect-2.6.1.tar
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: autocorrect
  Building wheel for autocorrect (setup.py) ... [?25ldone
[?25h  Created wheel for autocorrect: filename=autocorrect-2.6.1-py3-none-any.whl size=622364 sha256=57e3eed19da61a834fedbae8c52a7f52fd863fe76c61dfcaad26bfb3371ed3f4
  Stored in directory: /root/.cache/pip/wheels/db/69/42/0fb0421d2fe70d195a04665edc760cfe5fd341d7bb8d8e0aaa
Successfully built autocorrect
Installing collected packages: autocorrect
Successfully installed autocorrect-2.6.1


In [16]:
class CFG:
    """Notebook-wide configuration: paths, optimiser settings and sizes."""

    # --- Paths ---
    # Folder holding the "model" and "tokenizer" sub-folders of the encoder
    pre_trained_model_name = "/kaggle/input/deberta-v3-large/deberta_v3_large/"
    # File name of the trained full model (appended to input/working folders)
    final_model_path = '/full_model_scaled-6.keras'

    # --- Optimisation ---
    learning_rate = 1.5e-4
    weight_decay = 1e-4
    warmup_steps = 100
    epochs = 6
    batch_size = 4

    # --- Dropout values requested for the pre-trained encoder ---
    hidden_dropout_prob = 0.0
    attention_probs_dropout_prob = 0.0

    # --- Cross-validation and reproducibility ---
    n_splits = 4
    random_seed = 42

    # --- Sequence and embedding sizes ---
    max_length = 1575
    embeddings_len = 1024

In [4]:
def seed_everything(random_seed):
    """Seed every random number generator used by the notebook.

    Args:
        random_seed: Seed value. It is written (as a string) to the
            PYTHONHASHSEED environment variable and passed to the NumPy,
            TensorFlow, Python `random` and Keras seeding functions.
    """
    os.environ['PYTHONHASHSEED'] = str(random_seed)

    # Same seeders, same order as before; only the call style differs.
    seeders = (
        np.random.seed,
        tf.random.set_seed,
        random.seed,
        keras.utils.set_random_seed,
    )
    for apply_seed in seeders:
        apply_seed(random_seed)


seed_everything(random_seed=CFG.random_seed)

In [5]:
def move_to_working_folder(source_path, destination_path):
    """Copy a file to `destination_path`.

    Despite the name, the source file is copied (via `shutil.copy`), not moved.

    Args:
        source_path: Path of the file to copy.
        destination_path: Target file path (or directory) for the copy.
    """
    shutil.copy(src=source_path, dst=destination_path)

# Load Data

In [6]:
data_path = '/kaggle/input/commonlit-evaluate-student-summaries/'

# Training split: prompts and summaries, joined on the shared prompt id
train_pro = pd.read_csv(f"{data_path}prompts_train.csv")
train_sum = pd.read_csv(f"{data_path}summaries_train.csv")
train = pd.merge(train_pro, train_sum, on="prompt_id")

# Test split: same files and same join as the training split
test_pro = pd.read_csv(f"{data_path}prompts_test.csv")
test_sum = pd.read_csv(f"{data_path}summaries_test.csv")
test = pd.merge(test_pro, test_sum, on="prompt_id")
test.head()

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,student_id,text
0,abc123,Summarize...,Example Title 1,Heading\nText...,000000ffffff,Example text 1
1,abc123,Summarize...,Example Title 1,Heading\nText...,222222cccccc,Example text 3
2,def789,Summarize...,Example Title 2,Heading\nText...,111111eeeeee,Example text 2
3,def789,Summarize...,Example Title 2,Heading\nText...,333333dddddd,Example text 4


# Data Exploration

In [7]:
# TODO

# train['content'].hist(bins=20)
# train.boxplot('content')
# train['wording'].hist(bins=20)
# train.boxplot('wording')

# Preprocessing (Full Model)

In [8]:
prefix1 = "Think through this step by step: "
prefix2 = "Pay attention to the content and wording: "


def preprocess(summary, prompt_question, prompt_text, tokenizer):
    """Build the model inputs: input ids, attention mask and head mask.

    Each example is assembled as
    ``prefix1 + question + SEP + prefix2 + spell-corrected summary + SEP + prompt text``
    and tokenized to exactly `CFG.max_length` tokens (truncated or padded).

    Args:
        summary: pandas Series of student summaries (spell-corrected via `spell`).
        prompt_question: pandas Series of prompt questions.
        prompt_text: pandas Series of prompt texts.
        tokenizer: Tokenizer exposing `sep_token`, `sep_token_id`,
            `pad_token_id` and `batch_encode_plus`.

    Returns:
        A list ``[input_ids, attention_mask, head_mask]`` of numpy arrays.
        `head_mask` is 1.0 for tokens from the first separator up to (but not
        including) the second separator, and 0.0 everywhere else.
    """
    separator = f" {tokenizer.sep_token} "
    corrected_summary = summary.apply(spell)
    full_text = (
        prefix1 + prompt_question + separator + prefix2 + corrected_summary + separator + prompt_text
    )

    encoded = tokenizer.batch_encode_plus(
        full_text.tolist(),
        add_special_tokens=False,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
        max_length=CFG.max_length,
        return_attention_mask=True,
    )
    ids = encoded['input_ids'].numpy()
    attention = encoded['attention_mask'].numpy()

    # Head mask, computed row-wise without a Python loop:
    #  * every separator flips the "inside" state, and the separator itself
    #    takes the state *after* the flip (inclusive running count, odd = inside);
    #  * everything from the first padding token onwards stays 0.
    is_separator = ids == tokenizer.sep_token_id
    is_padding = (ids == tokenizer.pad_token_id) & ~is_separator
    inside_span = (np.cumsum(is_separator, axis=1) % 2) == 1
    before_padding = np.cumsum(is_padding, axis=1) == 0
    head_mask = (inside_span & before_padding).astype(np.float64)

    return [ids, attention, head_mask]

In [9]:
# # Check head mask of first prompt (for debug)
# # Run PreTrainedModel first

# model = PreTrainedModel('/kaggle/input/deberta-v3-large/deberta_v3_large/')
# sep = f" {model.tokenizer.sep_token} " 
# ids, mask, head = tokenize(train['text'], train['prompt_question'], train['prompt_text'], model.tokenizer)
# train['input'] = prefix1 + train['prompt_question'] + sep + prefix2 + train['text'] # + sep + train['prompt_text']
# first = model.tokenizer.tokenize(train['input'][500],                                               
#           add_special_tokens=False,
#           truncation=True,
#           padding='max_length',
#           return_tensors='tf',
#           max_length=MAX_SUMMARY_LENGTH,
#           return_attention_mask=False)

# def find_indexes(array):
#     return [index for index, value in enumerate(array) if value == 1]
# np.array(first)[find_indexes(head[500])]

# Define Model

In [10]:
# Creates a model that wraps the pre trained model
@keras.utils.register_keras_serializable()
class PreTrainedModel(keras.Model):
    """Keras wrapper around a pre-trained transformer and its tokenizer.

    Args:
        model_path: Folder containing a "model" and a "tokenizer" sub-folder;
            both are loaded with `from_pretrained`.
        trainable: Whether the wrapped transformer is trainable.
        num_layers_to_freeze: When `trainable` is true, the first
            `num_layers_to_freeze` encoder layers are frozen.
        name: Optional Keras model name.
        **kwargs: Forwarded to `keras.Model.__init__`.
    """

    def __init__(self, model_path, trainable=False, num_layers_to_freeze=0, name=None, **kwargs):
        super().__init__(name=name, **kwargs)
        self.model_path = model_path
        self.trainable = trainable
        self.num_layers_to_freeze = num_layers_to_freeze

        # Load model and tokenizer.
        # The dropout probabilities are passed to `from_pretrained` so they
        # override the configuration *before* the layers are built. Assigning
        # them to `model.config` after loading (as was done previously) does
        # not change dropout layers that already exist.
        self.model = TFAutoModel.from_pretrained(
            model_path + "model",
            hidden_dropout_prob=CFG.hidden_dropout_prob,
            attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_path + "tokenizer")

        # Propagate the trainable flag to the wrapped transformer
        self.model.trainable = self.trainable

        # When fine-tuning, keep the first `num_layers_to_freeze` encoder layers frozen
        if self.trainable:
            for layer in self.model.layers[0].encoder.layer[:self.num_layers_to_freeze]:
                layer.trainable = False

    def call(self, input_ids, attention_mask):
        """Run the transformer and return all of its hidden states."""
        output = self.model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
        return output.hidden_states

    def get_config(self):
        """Return the base config extended with this wrapper's constructor arguments."""
        config = super().get_config().copy()
        config.update({
            'model_path': self.model_path,
            'trainable': self.trainable,
            'num_layers_to_freeze': self.num_layers_to_freeze
        })
        return config

    @classmethod
    def from_config(cls, config):
        """Re-create the wrapper (re-loading model and tokenizer) from `get_config` output."""
        return cls(**config)

In [11]:
# Define layers for head mask step

@keras.utils.register_keras_serializable()
class ExpandDimsLayer(layers.Layer):
    """Casts its input to float32 and appends a trailing axis."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, inputs):
        as_float32 = tf.cast(inputs, dtype=tf.float32)
        return tf.expand_dims(as_float32, axis=-1)

@keras.utils.register_keras_serializable()
class MaskedEmbeddingsLayer(layers.Layer):
    """Multiplies a hidden state element-wise by a mask.

    `call` expects a pair ``[hidden_state, h_mask]``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, inputs):
        embeddings, mask = inputs
        return tf.math.multiply(embeddings, mask)

## Define loss function

In [12]:
# The loss function
@keras.utils.register_keras_serializable()
def mcrmse(y_true, y_pred):
    """Mean column-wise root mean squared error.

    The squared error is averaged over axis 0 for each target column, the
    square root is taken per column, and the column RMSEs are averaged.

    Both inputs are cast to float32 before any arithmetic: under the
    mixed_float16 policy the reductions would otherwise run in float16, which
    loses precision and can overflow or underflow when squaring.

    Args:
        y_true: Ground-truth targets.
        y_pred: Model predictions, same shape as `y_true`.

    Returns:
        The float32 loss value.
    """
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(y_pred, tf.float32)
    columnwise_mse = tf.reduce_mean(tf.square(y_true - y_pred), axis=0)
    return tf.reduce_mean(tf.sqrt(columnwise_mse), axis=-1)

### (Optional) Add image of model design diagram

In [13]:
def build_deberta():
    """Create the `PreTrainedModel` wrapper for the encoder configured in `CFG`.

    Returns:
        A `PreTrainedModel` named "deberta_layer", loaded from
        `CFG.pre_trained_model_name`.
    """
    deberta_layer = PreTrainedModel(model_path=CFG.pre_trained_model_name, name="deberta_layer")
    return deberta_layer

In [14]:
def create_model(decay_steps=10000):
    """Build and compile the full regression model.

    Architecture, as wired below:
      1. The pre-trained encoder returns all of its hidden states.
      2. Each hidden state is multiplied by the head mask and average-pooled
         over the sequence axis.
      3. The pooled vectors are stacked (one step per hidden state) and fed
         through an LSTM.
      4. Multi-sample dropout: five dropout rates share one dense layer and
         their outputs are averaged.
      5. Dense(512) -> LayerNorm -> Dropout -> GELU -> Dense(2) output.

    Args:
        decay_steps: Number of steps for the cosine learning-rate decay.

    Returns:
        A tuple ``(model, pre_trained_model_instance)``: the compiled Keras
        model and the `PreTrainedModel` wrapper (which holds the tokenizer).
    """

    # Instances
    pre_trained_model_instance = build_deberta()
    expand_dims_instance = ExpandDimsLayer(name='expand_dims')
    mask_instance = MaskedEmbeddingsLayer(name='masked_embeddings')
    avg_pooling_instance = layers.GlobalAveragePooling1D()
    reshape_instance1 = layers.Reshape((1, -1), name='reshape_layer1')
    reshape_instance2 = layers.Reshape((1, -1), name='reshape_layer2')
    # Fixed: `embeddings_len` was referenced as a bare (undefined) name.
    dense_instance = layers.Dense(CFG.embeddings_len, activation='gelu')

    # The NN starts from here

    # Input layers
    input_ids = keras.Input(shape=(CFG.max_length,), dtype='int32', name='input_ids')
    attention_mask = keras.Input(shape=(CFG.max_length,), dtype='int32', name='attention_mask')
    head_mask = keras.Input(shape=(CFG.max_length,), dtype='float32', name='head_mask')

    # Create embeddings and get all hidden states
    hidden_states = pre_trained_model_instance(input_ids, attention_mask)

    # Mask pooling all hidden states of pre-trained model.
    # Note: the average is taken over all `max_length` positions, so masked
    # positions contribute zeros to the mean rather than being excluded.
    pooled_hidden_states = []
    for hidden_state in hidden_states:
        h_mask = expand_dims_instance(head_mask)
        masked_outputs = mask_instance([hidden_state, h_mask])
        avg_pooling_layer = avg_pooling_instance(masked_outputs)
        reshape_layer = reshape_instance1(avg_pooling_layer)
        pooled_hidden_states.append(reshape_layer)

    # Concatenate all the hidden states and forward pass through LSTM
    x = layers.Concatenate(axis=1)(pooled_hidden_states)
    x = layers.LSTM(CFG.embeddings_len, return_sequences=False)(x)

    # Multi-sample Dropout: rates 0.1 .. 0.5 share `dense_instance`
    x = layers.Dropout(0.1)(x)
    dropoutList = [reshape_instance2(dense_instance(layers.Dropout((i + 1) * 0.1)(x))) for i in range(5)]
    x = layers.Concatenate(axis=1)(dropoutList)
    x = layers.GlobalAveragePooling1D()(x)

    # Final dense layer
    x = layers.Dense(512, activation='linear')(x)
    x = layers.LayerNormalization()(x)
    x = layers.Dropout(0.2)(x)
    x = layers.Activation(keras.activations.gelu, name='gelu')(x)

    # Two regression outputs (trained against the two target columns)
    output_layer = layers.Dense(2, activation='linear')(x)

    # Compile model
    model = keras.Model(inputs=[input_ids, attention_mask, head_mask], outputs=output_layer)
    # NOTE(review): `warmup_target` equals `initial_learning_rate`, so the
    # warmup phase appears to hold the learning rate constant instead of
    # ramping it up - confirm this is intended.
    lr_schedule = tf.keras.optimizers.schedules.CosineDecay(initial_learning_rate=CFG.learning_rate,
                                                            decay_steps=decay_steps,
                                                            warmup_target=CFG.learning_rate,
                                                            warmup_steps=CFG.warmup_steps,
                                                           )
    opt = keras.optimizers.AdamW(learning_rate=lr_schedule, weight_decay=CFG.weight_decay, use_ema=True)
    model.compile(loss=mcrmse, optimizer=opt)
    return model, pre_trained_model_instance

# Train Full Model

In [20]:
# Choose Training type
TRAIN_WITH_FOLDS = False
move_to_working_folder('/kaggle/input/models/' + CFG.final_model_path, '/kaggle/working/' + CFG.final_model_path)

In [None]:
X = train[['text', 'prompt_question', 'prompt_text']]

# Targets are trained in log-space: log(score + 3).
# `generate_predictions` applies the inverse transform, exp(pred) - 3.
train['content_transformed'] = (train['content'] + 3).apply(np.log)
train['wording_transformed'] = (train['wording'] + 3).apply(np.log)
y = train[['content_transformed', 'wording_transformed']].astype('float16')

if TRAIN_WITH_FOLDS:
    # Train full model with GroupKFolds (grouped by prompt, so a prompt never
    # appears in both the training and the validation part of a fold)
    gkf = GroupKFold(n_splits=CFG.n_splits)
    folds = gkf.split(X, y, groups=train['prompt_id'])

    val_losses = []
    histories = []

    for i, (train_index, val_index) in enumerate(folds):
        print(f"Fold {i}")

        X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
        y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

        # Total number of optimizer steps for this fold, used by the LR schedule
        decay_steps = math.ceil((len(X_train_fold) / CFG.batch_size) * CFG.epochs)
        model, deberta = create_model(decay_steps=decay_steps)

        X_train_fold = preprocess(X_train_fold['text'], X_train_fold['prompt_question'], X_train_fold['prompt_text'], deberta.tokenizer)
        X_val_fold = preprocess(X_val_fold['text'], X_val_fold['prompt_question'], X_val_fold['prompt_text'], deberta.tokenizer)

        # Callbacks
        # (named `early_stopping_cb` so it does not shadow lightgbm's
        # `early_stopping`, which is imported at the top of the notebook)
        early_stopping_cb = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
        ema = keras.callbacks.SwapEMAWeights(swap_on_epoch=True)
        ckptcb = keras.callbacks.ModelCheckpoint(
            f"full_model_fold_{i}" + ".weights.h5",
            monitor="val_loss",
            save_best_only=True,
            save_weights_only=True,
            mode="min",
        )

        # Fixed: `epochs` / `batch_size` were bare, undefined names.
        history = model.fit(x=X_train_fold,
                            y=y_train_fold.values,
                            validation_data=(X_val_fold, y_val_fold.values),
                            epochs=CFG.epochs,
                            batch_size=CFG.batch_size,
                            callbacks=[early_stopping_cb, ema, ckptcb],
                            verbose=1)

        # Keep the best (lowest) validation loss reached in this fold
        val_loss = min(history.history['val_loss'])
        val_losses.append(val_loss)
        histories.append(history)
        print()

    # Calculate the mean validation loss
    mean_val_loss = np.mean(val_losses)
    print("Mean Validation Loss:", mean_val_loss)

else:
    # Train full model no folds all data.
    # The saved model is loaded as-is, so no new learning-rate schedule is
    # built here (hence no `decay_steps` computation in this branch).
    path = '/kaggle/working/' + CFG.final_model_path
    model = keras.models.load_model(path)

    # Only the tokenizer is needed for preprocessing; load it directly rather
    # than instantiating a second full copy of the pre-trained model.
    tokenizer = AutoTokenizer.from_pretrained(CFG.pre_trained_model_name + "tokenizer")

    X = preprocess(X['text'], X['prompt_question'], X['prompt_text'], tokenizer)

    # Callbacks
    ema = keras.callbacks.SwapEMAWeights(swap_on_epoch=True)
    ckptcb = keras.callbacks.ModelCheckpoint(
        "full_model_scaled" + ".keras",
        monitor="loss",
        save_best_only=True,
        mode="min",
    )

    history = model.fit(x=X,
                        y=y.values,
                        epochs=CFG.epochs,
                        batch_size=CFG.batch_size,
                        callbacks=[ema, ckptcb],
                        verbose=1)

print('done')

Epoch 1/6
[1m   1/1792[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m116:04:54[0m 233s/step - loss: 0.1655

# Inference 

In [22]:
def generate_predictions(model, data):
    """Predict content and wording scores for already-tokenized inputs.

    Args:
        model: Object with a Keras-style `predict(x=..., batch_size=...)`
            method returning one row per example with two columns
            (content, wording).
        data: Mapping with the keys 'input_ids', 'attention_mask',
            'head_mask' and 'student_id'. 'student_id' may be any ordered
            iterable (list, array, pandas Series with any index).

    Returns:
        A tuple ``(ids, contents, wordings)``: the list of student ids and two
        numpy arrays holding the scores after the inverse target transform
        ``exp(prediction) - 3``.
    """
    predictions = np.asarray(
        model.predict(x=[data['input_ids'], data['attention_mask'], data['head_mask']],
                      batch_size=CFG.batch_size)
    )

    # Take ids by position. Indexing a pandas Series with `[idx]` is
    # label-based and fails (or picks the wrong row) unless the index happens
    # to be 0..n-1.
    ids = list(data['student_id'])[:len(predictions)]

    # Undo the log(score + 3) transform applied to the training targets
    contents = np.exp(predictions[:, 0]) - 3
    wordings = np.exp(predictions[:, 1]) - 3

    return ids, contents, wordings

## Inference Baseline Model

In [11]:
# content_scores = np.random.uniform(-1.73, 3.9, len(test))
# wording_scores = np.random.uniform(-1.96, 4.31, len(test))

# submission_df = pd.DataFrame({'student_id': test['student_id'],
#                               'content': content_scores,
#                               'wording': wording_scores})

# submission_df.to_csv("submission.csv", index=False)
# submission_df.head()

## Inference Final Model

In [23]:
model_to_submit_path = '/kaggle/working/' + CFG.final_model_path
move_to_working_folder('/kaggle/input/models/' + CFG.final_model_path, model_to_submit_path)

# Sort by prompt and text lengths
test['length'] = test['text'].apply(len) + test['prompt_text'].apply(len)
test = test.sort_values('length', ascending=True).reset_index(drop=True)

# Only the tokenizer is needed for preprocessing; load it directly rather than
# instantiating another full copy of the pre-trained model.
tokenizer = AutoTokenizer.from_pretrained(CFG.pre_trained_model_name + "tokenizer")

X = preprocess(test['text'], test['prompt_question'], test['prompt_text'], tokenizer)

test_data = {
    'input_ids': X[0],
    'attention_mask': X[1],
    'head_mask': X[2],
    'student_id': test['student_id'],
}

# Load the trained model once (it was previously loaded twice in this cell)
model = keras.models.load_model(model_to_submit_path)
ids, contents, wordings = generate_predictions(model, test_data)

submission_df = pd.DataFrame({'student_id': ids,
                              'content': contents,
                              'wording': wordings})

submission_df.to_csv("submission.csv", index=False)
submission_df.head()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 143s/step


Unnamed: 0,student_id,content,wording
0,000000ffffff,-1.932617,-1.810547
1,222222cccccc,-1.935547,-1.8125
2,111111eeeeee,-1.9375,-1.821289
3,333333dddddd,-1.938477,-1.819336


# Leftovers

In [None]:
# **Results**

# REAL RESULTS

# baseline with grouped KFolds: 0.5310284495353699
# From now on everything will be with grouped KFolds
# EMA: 0.5303183048963547
# EMA + lr_decay_steps=100000 + LSTM1=128 + LSTM2=64 + Dropout_after_LSTM2=0.6: 0.5258100032806396
# EMA + lr_decay_steps=100000 + LSTM1=128 + LSTM2=32 + Dropout_after_LSTM2=0.4: 0.5223950520157814

### Augmentation Manipulation

In [None]:
# import nltk
# from nltk.corpus import wordnet, stopwords
# from googletrans import Translator
# from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# nltk.download('wordnet')
# nltk.download('stopwords')

# # Initialize the paraphrasing model
# def synonym_replacement(words, n):
#     new_words = words.copy() 
#     random_word_list = list(set([word for word in words if word not in stopwords.words('english')]))
#     random.shuffle(random_word_list)
#     num_replaced = 0
#     for random_word in random_word_list:
#         synonyms = get_synonyms(random_word)
#         if len(synonyms) >= 1:
#             synonym = random.choice(synonyms)
#             new_words = [synonym if word == random_word else word for word in new_words]
#             num_replaced += 1
#         if num_replaced >= n: 
#             break
#     return new_words

# def get_synonyms(word):
#     synonyms = set()
#     for syn in wordnet.synsets(word):
#         for lemma in syn.lemmas():
#             synonyms.add(lemma.name())
#     if word in synonyms:
#         synonyms.remove(word)
#     return list(synonyms)

# def back_translation(text, src_lang='en', tgt_lang='fr'):
#     try:
#         translator = Translator()
#         translated = translator.translate(text, src=src_lang, dest=tgt_lang).text
#         back_translated = translator.translate(translated, src=tgt_lang, dest=src_lang).text
#     except:
#         print(f"Error during back translation: {e}")
#         return text  # Return the original text in case of any error
        
#     return back_translated

# def augment_text(text):
#     augmentation_choice = random.choice(['synonym_replacement', 'back_translation'])
#     words = text.split()
    
#     if augmentation_choice == 'synonym_replacement':
#         augmented_text = ' '.join(synonym_replacement(words, n=250))
#     elif augmentation_choice == 'back_translation':
#         augmented_text = back_translation(text)
#     elif augmentation_choice == 'paraphrase':
#         augmented_text = paraphrase(text)
    
#     return augmented_text

### Meta Pseudo Labels Training Loop

In [None]:
# # loading augmented data
# augmented_data = pd.read_excel('/kaggle/input/llm-generate-test/LLM_Generate_Test.xlsx')
# augmented_data.columns = ['student_id', 'prompt_text', 'prompt_question', 'text']

# def generate_predictions(model, data):
#     contents = []
#     wordings = []
#     ids = []
#     predictions = model.predict(x=[data['input_ids'], data['attention_mask'], data['head_mask']],
#                                 batch_size=4)

#     for idx, output in enumerate(predictions):
#         contents.append(output[0])
#         wordings.append(output[1])
#         ids.append(data['student_id'][idx])
#     return ids, contents, wordings

# ROUNDS = 1

# SAVED_DATASETS_INDEXES = []
# SAVED_WEIGHTS_INDEXES = [0]

# # The initial model
# model, deberta = create_model()

# if SAVED_WEIGHTS_INDEXES[-1] == 0:
#     # Load weights of initial teacher
#     model.load_weights('/kaggle/input/no-aug-full-model-4epochs/no_augment_full_model_4epochs..weights.h5')
# else:
#     # Load weights of last meta round
#     print(f"Loading weights: meta_model_{SAVED_WEIGHTS_INDEXES[-1]}")
#     model.load_weights(f'/kaggle/input/meta-model-weights/meta_model_{SAVED_WEIGHTS_INDEXES[-1]}.weights.h5')


# # Tokenizing train data
# X = tokenize(train['text'], train['prompt_question'], train['prompt_text'], deberta.tokenizer)
# y = train[['content', 'wording']].astype('float16')

# # Tokenizing augmented data
# X_aug = tokenize(augmented_data['text'], augmented_data['prompt_quesetion'], augmented_data['prompt_text'], deberta.tokenizer)

# aug_input = {
#     'input_ids': X_aug[0],
#     'attention_mask': X_aug[1],
#     'head_mask': X_aug[2],
#     'student_id': augmented_data['student_id']
# }

# for i in range(ROUNDS):
            
#     print(f"Round {i}/{ROUNDS}")
    
#     # Skip rounds we already done
#     if i < SAVED_WEIGHTS_INDEXES[-1]:
#         continue
    
#     # Predict meta psuedo labels
    
#     # Check if labeled augmentations ar already created
#     if i in SAVED_DATASETS_INDEXES:
#         # predictions already generated
#         augmented_labeled_data = pd.read_csv(f'/kaggle/input/augmented-labeled-data/augmented_labeled_data_round_{i}.csv')
#     else:
#         print()
#         print(f"Generateing predictions...")
#         ids, contents, wordings = generate_predictions(model, aug_input)
#         augmented_data['content'] = contents
#         augmented_data['wording'] = wordings
#         augmented_data.to_csv(f"augmented_labeled_data_round_{i}.csv")
#         augmented_labeled_data = augmented_data
        
#     # update the labels    
#     y_aug = augmented_labeled_data[['content', 'wording']].astype('float16')

#     # checkpoint callback
#     ckptcb = keras.callbacks.ModelCheckpoint(
#         f"meta_model_{i}" + ".weights.h5",
#         monitor="loss",
#         save_best_only=True,
#         save_weights_only=True,
#         mode="min",
#     )
    
#     print()
#     print(f"Training on unlabeled data...")
#     model.fit(x=X_aug,
#               y=y_aug.values,
#               epochs=2,
#               batch_size=batch_size,
#               validation_data=(X, y.values),
#               verbose=1)
    
#     # Fine tune the pre-trained model only on the labeled data
#     print()
#     print(f"Training on labeled data...")
#     model.fit(x=X,
#               y=y.values,
#               epochs=2,
#               batch_size=batch_size,
#               callbacks=[ckptcb],
#               verbose=1)
#     print()

### LGBM + Feature Engineering (+6000)

In [None]:
# DATA_DIR = "/kaggle/input/commonlit-evaluate-student-summaries/"

# prompts_train = pd.read_csv(DATA_DIR + "prompts_train.csv")
# prompts_test = pd.read_csv(DATA_DIR + "prompts_test.csv")
# summaries_train = pd.read_csv(DATA_DIR + "summaries_train.csv")
# summaries_test = pd.read_csv(DATA_DIR + "summaries_test.csv")
# sample_submission = pd.read_csv(DATA_DIR + "sample_submission.csv")


# class Preprocessor:
#     def __init__(self, model_name: str,) -> None:
#         self.STOP_WORDS = set(stopwords.words('english'))

#         self.tokenizer = AutoTokenizer.from_pretrained(model_name + "tokenizer")
#         self.spacy_ner_model = spacy.load('en_core_web_sm',)
#         self.speller = SpellChecker() #Speller(lang='en')
        
#     def count_text_length(self, df: pd.DataFrame, col:str) -> pd.Series:
#         """ text length """
#         tokenizer=self.tokenizer
#         return df[col].progress_apply(lambda x: len(tokenizer.encode(x)))

#     def word_overlap_count(self, row):
#         """ intersection(prompt_text, text) """        
#         def check_is_stop_word(word):
#             return word in self.STOP_WORDS
        
#         prompt_words = row['prompt_tokens']
#         summary_words = row['summary_tokens']
#         if self.STOP_WORDS:
#             prompt_words = list(filter(check_is_stop_word, prompt_words))
#             summary_words = list(filter(check_is_stop_word, summary_words))
#         return len(set(prompt_words).intersection(set(summary_words)))
            
#     def ngrams(self, token, n):
#         # Use the zip function to help us generate n-grams
#         # Concatentate the tokens into ngrams and return
#         ngrams = zip(*[token[i:] for i in range(n)])
#         return [" ".join(ngram) for ngram in ngrams]

#     def ngram_co_occurrence(self, row, n: int):
#         # Tokenize the original text and summary into words
#         original_tokens = row['prompt_tokens']
#         summary_tokens = row['summary_tokens']

#         # Generate n-grams for the original text and summary
#         original_ngrams = set(self.ngrams(original_tokens, n))
#         summary_ngrams = set(self.ngrams(summary_tokens, n))

#         # Calculate the number of common n-grams
#         common_ngrams = original_ngrams.intersection(summary_ngrams)

#         # # Optionally, you can get the frequency of common n-grams for a more nuanced analysis
#         # original_ngram_freq = Counter(ngrams(original_words, n))
#         # summary_ngram_freq = Counter(ngrams(summary_words, n))
#         # common_ngram_freq = {ngram: min(original_ngram_freq[ngram], summary_ngram_freq[ngram]) for ngram in common_ngrams}

#         return len(common_ngrams)
    
#     def ner_overlap_count(self, row, mode:str):
#         model = self.spacy_ner_model
#         def clean_ners(ner_list):
#             return set([(ner[0].lower(), ner[1]) for ner in ner_list])
#         prompt = model(row['prompt_text'])
#         summary = model(row['text'])

#         if "spacy" in str(model):
#             prompt_ner = set([(token.text, token.label_) for token in prompt.ents])
#             summary_ner = set([(token.text, token.label_) for token in summary.ents])
#         elif "stanza" in str(model):
#             prompt_ner = set([(token.text, token.type) for token in prompt.ents])
#             summary_ner = set([(token.text, token.type) for token in summary.ents])
#         else:
#             raise Exception("Model not supported")

#         prompt_ner = clean_ners(prompt_ner)
#         summary_ner = clean_ners(summary_ner)

#         intersecting_ners = prompt_ner.intersection(summary_ner)
        
#         ner_dict = dict(Counter([ner[1] for ner in intersecting_ners]))
        
#         if mode == "train":
#             return ner_dict
#         elif mode == "test":
#             return {key: ner_dict.get(key) for key in self.ner_keys}

    
#     def quotes_count(self, row):
#         summary = row['text']
#         text = row['prompt_text']
#         quotes_from_summary = re.findall(r'"([^"]*)"', summary)
#         if len(quotes_from_summary)>0:
#             return [quote in text for quote in quotes_from_summary].count(True)
#         else:
#             return 0

#     def spelling(self, text):
        
#         wordlist=text.split()
#         amount_miss = len(list(self.speller.unknown(wordlist)))

#         return amount_miss
    
#     def run(self, 
#             prompts: pd.DataFrame,
#             summaries:pd.DataFrame,
#             mode:str
#         ) -> pd.DataFrame:
        
#         # before merge preprocess
#         prompts["prompt_length"] = prompts["prompt_text"].apply(
#             lambda x: len(self.tokenizer.encode(x))
#         )
#         prompts["prompt_tokens"] = prompts["prompt_text"].apply(
#             lambda x: self.tokenizer.convert_ids_to_tokens(
#                 self.tokenizer.encode(x), 
#                 skip_special_tokens=True
#             )
#         )

#         summaries["summary_length"] = summaries["text"].apply(
#             lambda x: len(self.tokenizer.encode(x))
#         )
#         summaries["summary_tokens"] = summaries["text"].apply(
#             lambda x: self.tokenizer.convert_ids_to_tokens(
#                 self.tokenizer.encode(x), 
#                 skip_special_tokens=True
#             )

#         )
#         summaries["splling_err_num"] = summaries["text"].progress_apply(self.spelling)

#         # merge prompts and summaries
#         input_df = summaries.merge(prompts, how="left", on="prompt_id")

#         # after merge preprocess
#         input_df['length_ratio'] = input_df['summary_length'] / input_df['prompt_length']
        
#         input_df['word_overlap_count'] = input_df.progress_apply(self.word_overlap_count, axis=1)
#         input_df['bigram_overlap_count'] = input_df.progress_apply(
#             self.ngram_co_occurrence,args=(2,), axis=1 
#         )
#         input_df['trigram_overlap_count'] = input_df.progress_apply(
#             self.ngram_co_occurrence, args=(3,), axis=1
#         )
        
#         input_df['quotes_count'] = input_df.progress_apply(self.quotes_count, axis=1)
        
#         return input_df.drop(columns=["summary_tokens", "prompt_tokens"])


# model_path = '/kaggle/input/deberta-v3-large/deberta_v3_large/'
# preprocessor = Preprocessor(model_name=model_path)

# features_train = preprocessor.run(prompts_train, summaries_train, mode="train")
# features_test = preprocessor.run(prompts_test, summaries_test, mode="test")
# features_test['length'] = features_test['summary_length'] + features_test['prompt_length']
# features_test = features_test.sort_values('length', ascending=True).reset_index(drop=True)
# features_train.head()


# data_path = '/kaggle/input/commonlit-evaluate-student-summaries/'
# # prompts train
# train_pro_polars = pl.read_csv(data_path + 'prompts_train.csv')
# train_pro_polars.head(1)

# # summaries train
# train_sum_polars = pl.read_csv(data_path + 'summaries_train.csv')
# train_sum_polars.head(1)

# train_polars = train_pro_polars.join(train_sum_polars , on="prompt_id", how='inner')
# train_polars.head(1)

# # prompts train
# test_pro_polars = pl.read_csv(data_path + 'prompts_test.csv')
# test_pro_polars.head(1)

# # summaries train
# test_sum_polars = pl.read_csv(data_path + 'summaries_test.csv')
# test_sum_polars.head(1)

# test_polars = test_pro_polars.join(test_sum_polars , on="prompt_id", how='inner')
# test_polars.head(1)

# columns = [  
#     (
#         pl.col("text").str.split(by="\n\n").alias("paragraph")
#     ),
# ]
# # For both the training and testing sets, split "text" on "\n\n" into a list and store it in a new "paragraph" column
# train_polars = train_polars.with_columns(columns)
# test_polars = test_polars.with_columns(columns)
# # train_polars.head(1)

# nlp = spacy.load("en_core_web_sm")
# with open('/kaggle/input/english-word-hx/words.txt', 'r') as file:
#     english_vocab = set(word.strip().lower() for word in file)



# # Feature Engineering

# paragraph_fea = ['paragraph_len','paragraph_sentence_cnt','paragraph_word_cnt']
# paragraph_fea2 = ['paragraph_error_num'] + paragraph_fea
# sentence_fea = ['sentence_len','sentence_word_cnt']

# def count_spelling_errors(text):
#     doc = nlp(text)
#     lemmatized_tokens = [token.lemma_.lower() for token in doc]
#     spelling_errors = sum(1 for token in lemmatized_tokens if token not in english_vocab)
#     return spelling_errors

# def removeHTML(x):
#     html=re.compile(r'<.*?>')
#     return html.sub(r'',x)

# def dataPreprocessing(x):
#     # Convert words to lowercase
#     x = x.lower()
#     # Remove HTML
#     x = removeHTML(x)
#     # Delete strings starting with @
#     x = re.sub("@\w+", '',x)
#     # Delete Numbers
#     x = re.sub("'\d+", '',x)
#     x = re.sub("\d+", '',x)
#     # Delete "http" followed by word characters (e.g. the "https" of a URL); the rest of the URL is not removed
#     x = re.sub("http\w+", '',x)
#     # Replace consecutive empty spaces with a single space character
#     x = re.sub(r"\s+", " ", x)
#     # Replace consecutive commas and periods with one comma and period character
#     x = re.sub(r"\.+", ".", x)
#     x = re.sub(r"\,+", ",", x)
#     # Remove empty characters at the beginning and end
#     x = x.strip()
#     return x


# # paragraph features
# def remove_punctuation(text):
#     """
#     Remove all punctuation from the input text.
    
#     Args:
#     - text (str): The input text.
    
#     Returns:
#     - str: The text with punctuation removed.
#     """
#     # string.punctuation
#     translator = str.maketrans('', '', string.punctuation)
#     return text.translate(translator)


# def Paragraph_Preprocess(tmp):
#     # Expand the paragraph list into several lines of data
#     tmp = tmp.explode('paragraph')
#     # Paragraph preprocessing
#     tmp = tmp.with_columns(pl.col('paragraph').map_elements(dataPreprocessing))
#     tmp = tmp.with_columns(pl.col('paragraph').map_elements(remove_punctuation).alias('paragraph_no_pinctuation'))
#     tmp = tmp.with_columns(pl.col('paragraph_no_pinctuation').map_elements(count_spelling_errors).alias("paragraph_error_num"))
#     # Calculate the length of each paragraph
#     tmp = tmp.with_columns(pl.col('paragraph').map_elements(lambda x: len(x)).alias("paragraph_len"))
#     # Calculate the number of sentences and words in each paragraph
#     tmp = tmp.with_columns(pl.col('paragraph').map_elements(lambda x: len(x.split('.'))).alias("paragraph_sentence_cnt"),
#                     pl.col('paragraph').map_elements(lambda x: len(x.split(' '))).alias("paragraph_word_cnt"),)
#     return tmp

# def Paragraph_Eng(train_tmp):
#     num_list = [0, 50,75,100,125,150,175,200,250,300,350,400,500,600]
#     num_list2 = [0, 50,75,100,125,150,175,200,250,300,350,400,500,600,700]
#     aggs = [
#         # Count the paragraphs whose length is >= i (first list) or <= i (second list)
#         *[pl.col('paragraph').filter(pl.col('paragraph_len') >= i).count().alias(f"paragraph_{i}_cnt") for i in [0, 50,75,100,125,150,175,200,250,300,350,400,500,600,700] ], 
#         *[pl.col('paragraph').filter(pl.col('paragraph_len') <= i).count().alias(f"paragraph_{i}_cnt") for i in [25,49]], 
#         # other
#         *[pl.col(fea).max().alias(f"{fea}_max") for fea in paragraph_fea2],
#         *[pl.col(fea).mean().alias(f"{fea}_mean") for fea in paragraph_fea2],
#         *[pl.col(fea).min().alias(f"{fea}_min") for fea in paragraph_fea2],
#         *[pl.col(fea).sum().alias(f"{fea}_sum") for fea in paragraph_fea2],
#         *[pl.col(fea).first().alias(f"{fea}_first") for fea in paragraph_fea2],
#         *[pl.col(fea).last().alias(f"{fea}_last") for fea in paragraph_fea2],
#         *[pl.col(fea).kurtosis().alias(f"{fea}_kurtosis") for fea in paragraph_fea2],
#         *[pl.col(fea).quantile(0.25).alias(f"{fea}_q1") for fea in paragraph_fea2],  
#         *[pl.col(fea).quantile(0.75).alias(f"{fea}_q3") for fea in paragraph_fea2],  
#         ]
    
#     df = train_tmp.group_by(['student_id'], maintain_order=True).agg(aggs).sort("student_id")
#     df = df.to_pandas()
#     return df

# # sentence feature
# def Sentence_Preprocess(tmp):
#     # Preprocess text and use periods to segment sentences in the text
#     tmp = tmp.with_columns(pl.col('text').map_elements(dataPreprocessing).str.split(by=".").alias("sentence"))
#     tmp = tmp.explode('sentence')
#     # Calculate the length of a sentence
#     tmp = tmp.with_columns(pl.col('sentence').map_elements(lambda x: len(x)).alias("sentence_len"))
#     # Keep only sentences with a length of at least 15 characters
#     tmp = tmp.filter(pl.col('sentence_len')>=15)
#     # Count the number of words in each sentence
#     tmp = tmp.with_columns(pl.col('sentence').map_elements(lambda x: len(x.split(' '))).alias("sentence_word_cnt"))
    
#     return tmp


# # feature_eng
# def Sentence_Eng(train_tmp):
#     aggs = [
#         # Count the sentences whose length is >= i (first list) or <= i (second list)
#         *[pl.col('sentence').filter(pl.col('sentence_len') >= i).count().alias(f"sentence_{i}_cnt") for i in [0,15,50,100,150,200,250,300] ], 
#         *[pl.col('sentence').filter(pl.col('sentence_len') <= i).count().alias(f"sentence_<{i}_cnt") for i in [15,50] ], 
#         # other
#         *[pl.col(fea).max().alias(f"{fea}_max") for fea in sentence_fea],
#         *[pl.col(fea).mean().alias(f"{fea}_mean") for fea in sentence_fea],
#         *[pl.col(fea).min().alias(f"{fea}_min") for fea in sentence_fea],
#         *[pl.col(fea).sum().alias(f"{fea}_sum") for fea in sentence_fea],
#         *[pl.col(fea).first().alias(f"{fea}_first") for fea in sentence_fea],
#         *[pl.col(fea).last().alias(f"{fea}_last") for fea in sentence_fea],
#         *[pl.col(fea).kurtosis().alias(f"{fea}_kurtosis") for fea in sentence_fea],
#         *[pl.col(fea).quantile(0.25).alias(f"{fea}_q1") for fea in sentence_fea], 
#         *[pl.col(fea).quantile(0.75).alias(f"{fea}_q3") for fea in sentence_fea], 
#         ]
#     df = train_tmp.group_by(['student_id'], maintain_order=True).agg(aggs).sort("student_id")
#     df = df.to_pandas()
#     return df

# # word feature
# def Word_Preprocess(tmp):
#     # Preprocess text and use spaces to separate words from the text
#     tmp = tmp.with_columns(pl.col('text').map_elements(dataPreprocessing).str.split(by=" ").alias("word"))
#     tmp = tmp.explode('word')
#     # Calculate the length of each word
#     tmp = tmp.with_columns(pl.col('word').map_elements(lambda x: len(x)).alias("word_len"))
#     # Delete data with a word length of 0
#     tmp = tmp.filter(pl.col('word_len')!=0)
    
#     return tmp


# # feature_eng
# def Word_Eng(train_tmp):
#     aggs = [
#         # Count the number of words with a length of at least i+1
#         *[pl.col('word').filter(pl.col('word_len') >= i+1).count().alias(f"word_{i+1}_cnt") for i in range(15) ], 
#         # other
#         pl.col('word_len').max().alias(f"word_len_max"),
#         pl.col('word_len').mean().alias(f"word_len_mean"),
#         pl.col('word_len').std().alias(f"word_len_std"),
#         pl.col('word_len').quantile(0.25).alias(f"word_len_q1"),
#         pl.col('word_len').quantile(0.50).alias(f"word_len_q2"),
#         pl.col('word_len').quantile(0.75).alias(f"word_len_q3"),
#         ]
#     df = train_tmp.group_by(['student_id'], maintain_order=True).agg(aggs).sort("student_id")
#     df = df.to_pandas()
#     return df


# # The preprocessing and feature engineering main function

# def preprocess_data(data, fold_indexes=None, vectorizer=None, vectorizer_cnt=None):
    
#     tmp = Paragraph_Preprocess(data)
#     train_feats = Paragraph_Eng(tmp)
    
#     # Obtain feature names
#     feature_names = list(filter(lambda x: x not in ['student_id','content', 'wording'], train_feats.columns))
#     print('Features Number: ',len(feature_names))
    
#     # Sentence features
#     tmp = Sentence_Preprocess(data)
#     # Merge the newly generated feature data with the previously generated feature data
#     train_feats = train_feats.merge(Sentence_Eng(tmp), on='student_id', how='left')

#     feature_names = list(filter(lambda x: x not in ['student_id','content', 'wording'], train_feats.columns))
#     print('Features Number: ',len(feature_names))
    
#     # Word features
#     tmp = Word_Preprocess(data)
#     # Merge the newly generated feature data with the previously generated feature data
#     train_feats = train_feats.merge(Word_Eng(tmp), on='student_id', how='left')

#     feature_names = list(filter(lambda x: x not in ['student_id', 'content', 'wording'], train_feats.columns))
#     print('Features Number: ',len(feature_names))
    
#     # TfidfVectorizer parameter
#     if vectorizer == None:
#         vectorizer = TfidfVectorizer(
#                     tokenizer=lambda x: x,
#                     preprocessor=lambda x: x,
#                     token_pattern=None,
#                     strip_accents='unicode',
#                     analyzer = 'word',
#                     ngram_range=(3,6),
#                     min_df=0.05,
#                     max_df=0.95,
#                     sublinear_tf=True,
#         )

#         # Fit the TfidfVectorizer on every row of `data`; if validation rows are included, this may cause leakage and overly optimistic CV scores
#         train_tfid = vectorizer.fit_transform([i for i in data['text']])
#     else:
#         train_tfid = vectorizer.transform([i for i in data['text']])
        
#     # Convert to array
#     dense_matrix = train_tfid.toarray()
#     # Convert to dataframe
#     df = pd.DataFrame(dense_matrix)
#     # rename features
#     tfid_columns = [ f'tfid_{i}' for i in range(len(df.columns))]
#     df.columns = tfid_columns
#     df['student_id'] = train_feats['student_id']
#     # Merge the newly generated feature data with the previously generated feature data
#     train_feats = train_feats.merge(df, on='student_id', how='left')

#     feature_names = list(filter(lambda x: x not in ['student_id','content', 'wording'], train_feats.columns))
#     print('Features Number: ',len(feature_names))
    
#     if vectorizer_cnt == None:
        
#         vectorizer_cnt = CountVectorizer(
#                 tokenizer=lambda x: x,
#                 preprocessor=lambda x: x,
#                 token_pattern=None,
#                 strip_accents='unicode',
#                 analyzer = 'word',
#                 ngram_range=(2,3),
#                 min_df=0.10,
#                 max_df=0.85,
#         )

#         train_tfid = vectorizer_cnt.fit_transform([i for i in data['text']])
#     else:
#         train_tfid = vectorizer_cnt.transform([i for i in data['text']])
        
#     dense_matrix = train_tfid.toarray()
#     df = pd.DataFrame(dense_matrix)
#     tfid_columns = [ f'tfid_cnt_{i}' for i in range(len(df.columns))]
#     df.columns = tfid_columns
#     df['student_id'] = train_feats['student_id']
#     train_feats = train_feats.merge(df, on='student_id', how='left')

#     feature_names = list(filter(lambda x: x not in ['student_id','content', 'wording'], train_feats.columns))
#     print('Features Number: ',len(feature_names))
    
#     # DeBERTa model predictions as features
#     # TODO
#     # deberta_oof = joblib.load('/kaggle/input/model_predictions.pkl')
#     # print(deberta_oof.shape, train_feats.shape)

#     # train_feats['deberta_oof'] = deberta_oof

#     # feature_names = list(filter(lambda x: x not in ['student_id','content', 'wording'], train_feats.columns))
#     # print('Features Number: ', len(feature_names))   

#     # merge features 
#     if fold_indexes is not None:
#         ft = features_train.iloc[fold_indexes].drop(columns=train.columns)
#         train_feats_pandas = pd.DataFrame(train_feats.to_numpy(), columns=train_feats.columns, index=fold_indexes)
#     else:
#         ft = features_train.drop(columns=train.columns)
#         train_feats_pandas = pd.DataFrame(train_feats.to_numpy(), columns=train_feats.columns)
#     lgbm_input = pd.concat([train_feats_pandas, ft], axis=1)
    
#     feature_names = list(filter(lambda x: x not in ['student_id','content', 'wording'], lgbm_input.columns))
#     print('Features Number: ',len(feature_names))
    
#     return lgbm_input[feature_names].astype(float), feature_names, vectorizer, vectorizer_cnt


# # The evaluation metric: mean column-wise RMSE (MCRMSE)
# def mcrmse(y_true, y_pred):
#     assert y_true.shape == y_pred.shape
#     scores = []
#     for i in range(y_true.shape[1]):
#         scores.append(mean_squared_error(y_true[:, i], y_pred[:, i], squared=False))
#     return np.mean(scores), scores



# # Randomized search
# from sklearn.model_selection import RandomizedSearchCV

# X = train_polars.drop(columns=['content', 'wording'])
# y = train[['content', 'wording']].astype('float16')
# X, _, _, _ = preprocess_data(X)

# # Define the parameter grid for randomized search
# param_distributions = {
#     'learning_rate': [0.5, 0.01, 0.05, 0.1, 0.2, 0.3, 0.02, 0.03],
#     'max_depth': [3, 4, 5, 6],
#     'num_leaves': [5, 10, 15, 20, 25, 30, 35, 40],
#     'colsample_bytree': [0.1, 0.3, 0.5, 0.7, 0.8],
#     'reg_alpha': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0],
#     'reg_lambda': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0],
#     'n_estimators': [100, 300, 500, 700, 1000, 1500, 2000]
# }

# best_params = []
# for target in ['content', 'wording']:
    
#     gkf = GroupKFold(n_splits=4)
#     folds = gkf.split(X, y, groups=train['prompt_id'])

#     # Initialize the LightGBM regressor
#     model = lgb.LGBMRegressor(
#             objective = 'regression',
#             metrics = 'rmse',
#             learning_rate = param_distributions['learning_rate'],
#             max_depth =param_distributions['max_depth'],
#             num_leaves =param_distributions['num_leaves'],
#             colsample_bytree=param_distributions['colsample_bytree'],
#             reg_alpha =param_distributions['reg_alpha'],
#             reg_lambda =param_distributions['reg_lambda'],
#             n_estimators=param_distributions['n_estimators'],
#             random_state= random_seed,
#             extra_trees=True,
#             verbosity = -1)

#     # Initialize RandomizedSearchCV
#     random_search = RandomizedSearchCV(
#         estimator=model,
#         param_distributions=param_distributions,
#         n_iter=70,
#         scoring='neg_root_mean_squared_error',
#         cv=folds,
#         verbose=2,
#         random_state=random_seed,
#         n_jobs=-1
#     )
    
#     # Fit the randomized search
#     random_search.fit(X, y[target])
#     best_params.append((random_search.best_score_, random_search.best_params_))
#     print()
    
#     # Output the best parameters and best score for the current target
#     print(f"Best parameters found for {target}: ", random_search.best_params_)
#     print(f"Best RMSE score for {target}: ", -random_search.best_score_)
#     print()
    
#     # Break after the first target so hyperparameters are searched only for the content LGBM
#     break
    
# print("Content best params and scores:", best_params[0])
# # print("Wording best params and scores:", best_params[1])


# def feature_select_wrapper():
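#     """
#     Rank features by LightGBM feature importance, summed over a 4-fold GroupKFold CV grouped by prompt_id.
#     Takes no arguments; reads the globals `train_polars` and `train`.
#     :return: (top_features, feature_select_content, feature_select_wording, feature_select_aggregate)
#     """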
    
#     X = train_polars.drop(columns=['content', 'wording'])
#     y = train[['content', 'wording']].astype('float16')
#     gkf = GroupKFold(n_splits=4)
#     folds = gkf.split(X, y, groups=train['prompt_id'])
#     callbacks = [log_evaluation(period=25), early_stopping(stopping_rounds=700, first_metric_only=True)]

#     # Part 1.
#     print('feature_select_wrapper...')
    
#     models = []
#     predictions = []
#     fold_scores = []
#     fse_content = {}
#     fse_wording = {}
#     mcrmse_scores = np.array([])
    
#     for fold_idx, (train_index, test_index) in enumerate(folds):

#         print('fold', fold_idx)
#         X_train_fold, X_test_fold = X[train_index], X[test_index]
#         y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]
        
#         X_train_fold, feature_names, vectorizer, vectorizer_cnt = preprocess_data(X_train_fold, fold_indexes=train_index)
#         X_test_fold, _, _, _ = preprocess_data(X_test_fold, fold_indexes=test_index, vectorizer=vectorizer, vectorizer_cnt=vectorizer_cnt)
        
#         features = feature_names 
        
#         fold_predictions = []
#         fold_models = []
#         for target in ['content', 'wording']:
#             y_train_target = y_train_fold[target]
#             y_test_target = y_test_fold[target]
            
#             model = lgb.LGBMRegressor(
#                     objective = 'regression',
#                     metrics = 'rmse',
#                     learning_rate = 0.01,
#                     max_depth = 3,
#                     num_leaves = 10,
#                     colsample_bytree=0.7,
#                     reg_alpha = 0.1,
#                     reg_lambda = 1.0,
#                     n_estimators=100,
#                     random_state= random_seed,
#                     extra_trees=True,
#                     verbosity = -1)

#             predictor = model.fit(X_train_fold,
#                                   y_train_target,
#                                   eval_names=['train', 'valid'],
#                                   eval_set=[(X_train_fold, y_train_target), (X_test_fold, y_test_target)],
#                                   eval_metric='rmse',
#                                   callbacks=callbacks)
            
#             fold_models.append(predictor)
#             fold_predictions.append(predictor.predict(X_test_fold))
            
#             # Aggregate feature importances
#             importances = pd.Series(predictor.feature_importances_, index=features)
#             print()
#             if target == 'content':
#                 for feature, importance in importances.items():
#                     if feature in fse_content:
#                         fse_content[feature] += importance
#                     else:
#                         fse_content[feature] = importance
#             else:
#                 for feature, importance in importances.items():
#                     if feature in fse_wording:
#                         fse_wording[feature] += importance
#                     else:
#                         fse_wording[feature] = importance
                
#         models.append(fold_models)
#         predictions.append(fold_predictions)
        
#         # Calculate mcrmse 
#         y_true_fold = np.array(y_test_fold)
#         y_pred_fold = np.array(fold_predictions).T.reshape(-1, 2)
#         fold_mcrmse, _ = mcrmse(y_true_fold, y_pred_fold)    
#         mcrmse_scores = np.append(mcrmse_scores, fold_mcrmse)
#         print(f"Fold {fold_idx} MCRMSE: {fold_mcrmse}")
#         print()
    
#     print(f"MCRMSE across all folds: {np.mean(mcrmse_scores)}")
    
#     # Convert dictionaries to Series
#     fse_content = pd.Series(fse_content)
#     fse_wording = pd.Series(fse_wording)
#     fse_aggregate = (fse_content + fse_wording) / 2
    
#     # Part 4.
#     feature_select_content = fse_content.sort_values(ascending=False).index.tolist()
#     feature_select_wording = fse_wording.sort_values(ascending=False).index.tolist()
#     feature_select_aggregate = fse_aggregate.sort_values(ascending=False).index.tolist()

#     # Combine feature importance scores into a DataFrame
#     feature_importance_df = pd.DataFrame({
#         'Feature':  fse_aggregate.index,
#         'Content_Importance': fse_content,
#         'Wording_Importance': fse_wording,
#         'Aggregate_Importance': fse_aggregate
#     }).fillna(0)
    
#     # Sort by aggregate importance and select top features
#     feature_importance_df = feature_importance_df.sort_values(by='Aggregate_Importance', ascending=False).reset_index(drop=True)
#     top_features = feature_importance_df
    
#     print('done')
#     return top_features, feature_select_content, feature_select_wording, feature_select_aggregate

# # Run feature selection
# top_features, fs_content, fs_wording, fs_aggregate = feature_select_wrapper()


# # select num of features based on the plots
# # Plot the data
# plt.figure(figsize=(12, 10))
# top_idxs = 30

# # plot options: Content_Importance, Wording_Importance, Aggregate_Importance
# importance_to_plot = 'Aggregate_Importance'
# plt.barh(top_features['Feature'][:top_idxs], top_features.sort_values(by=importance_to_plot, ascending=False).reset_index(drop=True)[importance_to_plot][:top_idxs], color='skyblue')
# plt.xlabel('Feature importance')
# plt.ylabel('Features')
# plt.title('Feature Importance')
# plt.gca().invert_yaxis()  # Invert y-axis to have the highest importance on top
# plt.show()

# # fs_content[:top_idxs]
# # fs_wording[:top_idxs]
# # fs_aggregate[:top_idxs]
# top_features.sort_values(by=importance_to_plot, ascending=False).reset_index(drop=True)[:top_idxs]


# LOAD = True  # NOTE: True re-trains the models; False loads saved boosters from disk
# # Define the number of splits for cross-validation
# n_splits = 4
# models = []
# if not LOAD:
#     for i in range(n_splits):
#         # TODO: Change path !!!!!!!
#         models.append(lgb.Booster(model_file=f'/kaggle/input/aes-15fold/fold_{i+1}.txt'))
# else:
#     X = train_polars.drop(columns=['content', 'wording'])
#     y = train[['content', 'wording']].astype('float16')
#     gkf = GroupKFold(n_splits=4)
#     folds = gkf.split(X, y, groups=train['prompt_id'])
#     callbacks = [log_evaluation(period=25), early_stopping(stopping_rounds=700, first_metric_only=True)]
#     predictions = []
#     mcrmse_scores = np.array([])
    
#     top_features_list = top_features['Feature'][:top_idxs]
#     # Loop through each fold of the cross-validation
#     for fold_idx, (train_index, test_index) in enumerate(folds):
        
#         print('fold', fold_idx)
#         X_train_fold, X_test_fold = X[train_index], X[test_index]
#         y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]
        
#         X_train_fold, feature_names, vectorizer, vectorizer_cnt = preprocess_data(X_train_fold, fold_indexes=train_index)
#         X_test_fold, _, _, _ = preprocess_data(X_test_fold, fold_indexes=test_index, vectorizer=vectorizer, vectorizer_cnt=vectorizer_cnt)
        
#         # Intersect the list of top features with the columns in X_train_fold
#         existing_top_features = [feature for feature in top_features_list if feature in X_train_fold.columns]
        
#         # Using the top features only
#         X_train_fold = X_train_fold[existing_top_features]
#         X_test_fold = X_test_fold[existing_top_features]
        
#         print(X_train_fold.shape)
        
#         fold_predictions = []
#         fold_models = []
        
#         for target in ['content', 'wording']:
#             y_train_target = y_train_fold[target]
#             y_test_target = y_test_fold[target]
            
#             model = lgb.LGBMRegressor(
#                     objective = 'regression',
#                     metrics = 'rmse',
#                     learning_rate = 0.01,
#                     max_depth = 3,
#                     num_leaves = 10,
#                     colsample_bytree=0.7,
#                     reg_alpha = 0.1,
#                     reg_lambda = 1.0,
#                     n_estimators=100,
#                     random_state= random_seed,
#                     extra_trees=True,
#                     verbosity = -1)

#             predictor = model.fit(X_train_fold,
#                                   y_train_target,
#                                   eval_names=['train', 'valid'],
#                                   eval_set=[(X_train_fold, y_train_target), (X_test_fold, y_test_target)],
#                                   eval_metric='rmse',
#                                   callbacks=callbacks)
            
#             fold_models.append(predictor)
#             fold_predictions.append(predictor.predict(X_test_fold))
#             # predictor.booster_.save_model(f'fold_{fold_idx}_{target}.txt')
            
#         models.append(fold_models)
#         predictions.append(fold_predictions)
        
#         # Calculate mcrmse 
#         y_true_fold = np.array(y_test_fold)
#         y_pred_fold = np.array(fold_predictions).T.reshape(-1, 2)
#         fold_mcrmse, _ = mcrmse(y_true_fold, y_pred_fold)    
#         mcrmse_scores = np.append(mcrmse_scores, fold_mcrmse)
#         print(f"Fold {fold_idx} MCRMSE: {fold_mcrmse}")
#         print()
        
#     print(f"MCRMSE across all folds: {np.mean(mcrmse_scores)}")

### Ensemble of Seeds

In [None]:

#  preds = []
#  for i in range(NUM_OF_FOLDS):
#      if i == 0:
#          model.load_weights(f'/kaggle/input/no-aug-full-model-4epochs/no_augment_full_model_4epochs..weights.h5')
#      else:
#          model, _ = create_model(decay_steps=decay_steps)
#          model.load_weights(f'/kaggle/input/no-aug-full-model-4epochs/no_augment_full_model_seed_0.weights.h5')  
#      ids, contents, wordings = generate_predictions(model, test_data)
#      preds.append([ids, contents, wordings])
#      del model
#      gc.collect()

#  # Calculate mean predictions
#  contents = np.stack([pred[1] for pred in preds]).mean(axis=0)
#  wordings = np.stack([pred[2] for pred in preds]).mean(axis=0)