# Setup

In [1]:
import gc; gc.enable()
from IPython.display import clear_output
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np

summaries = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv")
prompts = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv")
students = pd.merge(summaries, prompts, on='prompt_id', how='inner')
students.drop(["student_id", "prompt_id"], axis=1, inplace=True)

In [3]:
text = students.drop(["content", "wording"], axis=1)
score = students[["content", "wording"]]

In [4]:
text.info(), score.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7165 entries, 0 to 7164
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   text             7165 non-null   object
 1   prompt_question  7165 non-null   object
 2   prompt_title     7165 non-null   object
 3   prompt_text      7165 non-null   object
dtypes: object(4)
memory usage: 279.9+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7165 entries, 0 to 7164
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   content  7165 non-null   float64
 1   wording  7165 non-null   float64
dtypes: float64(2)
memory usage: 167.9 KB


(None, None)

In [5]:
# del config, model, tokenizer, features
# gc.collect();

# Tokenization and Feature Extraction

In [6]:
import torch
import torch.nn as nn
import transformers
from transformers import (
    AutoModel, AutoConfig, 
    AutoTokenizer, logging
)

logging.set_verbosity_error()
logging.set_verbosity_warning()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

max_seq_length = 256
_pretrained_model = 'roberta-base'

config = AutoConfig.from_pretrained(_pretrained_model)
reberta_model = AutoModel.from_pretrained(_pretrained_model, config=config).to(device)
tokenizer = AutoTokenizer.from_pretrained(_pretrained_model)

clear_output()

In [7]:
text.full_text = text.prompt_question + " " + tokenizer.sep_token + " " + text.text

In [8]:
batch_size = len(text.full_text)
sub_batch_size = min(batch_size, 500)

In [9]:
def mean_max_pooling(text, batch_size, sub_batch_size):
    all_pooled_features = []
    
    for batch_start in range(0, batch_size, sub_batch_size):
        batch_end = min(batch_start + sub_batch_size, batch_size)
        sub_batch_text = text.to_list()[batch_start:batch_end]

        features = tokenizer.batch_encode_plus(
            sub_batch_text,
            add_special_tokens=True,
            padding='max_length',
            max_length=max_seq_length,
            truncation=True,
            return_tensors='pt',
            return_attention_mask=True
        )

        with torch.no_grad():
            inputs = {
                'input_ids': features['input_ids'].to(device),
                'attention_mask': features['attention_mask'].to(device)
            }
            outputs = reberta_model(**inputs)
        last_hidden_state = outputs[0]

        mean_pooling_embeddings = torch.mean(last_hidden_state, 1)
        _, max_pooling_embeddings = torch.max(last_hidden_state, 1)
        mean_max_embeddings = torch.cat((mean_pooling_embeddings, max_pooling_embeddings), 1)
        all_pooled_features.append(mean_max_embeddings.cpu())


    # Combine the pooled features from all sub-batches
    pooled_features = torch.cat(all_pooled_features)
    
    return pooled_features

X = mean_max_pooling(text.full_text, batch_size, sub_batch_size)

# Model training

In [10]:
import lightgbm as lgb
from sklearn.model_selection import cross_val_score, KFold

model_dict = {}

for s in score.columns:
    y = score[s]

    # Set up cross-validation
    num_folds = 3  # Number of folds
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    
    # Set model hyperparameters
    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate': 0.05,
        'device': 'gpu',
    }
    
    models = []
    eval_results = []
    
    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
    
        dtrain = lgb.Dataset(X_train, label=y_train)
        dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)

        # Train the LightGBM model
        model = lgb.train(
            params,
            train_set=dtrain,
            valid_sets=[dtrain, dval],
            num_boost_round=1000,
            early_stopping_rounds=50,
            verbose_eval=100
        )

        # Make predictions on the validation set
        y_pred = model.predict(X_val)

        # Calculate evaluation metric (e.g., RMSE)
        rmse = np.sqrt(np.mean((y_val - y_pred)**2))

        # Append evaluation result for this fold
        eval_results.append(rmse)
        
        models.append(model)
    
    model_dict[s] = models
    
    # Print RMSE scores for each fold
    print(f"{s} {num_folds} folds average RMSE score: {np.mean(eval_results)}")

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 339523
[LightGBM] [Info] Number of data points in the train set: 4776, number of used features: 1535
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 1534 dense feature groups (7.00 MB) transferred to GPU in 0.015448 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score -0.015704
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 0.265252	valid_1's rmse: 0.442545
[200]	training's rmse: 0.165432	valid_1's rmse: 0.437261
[300]	training's rmse: 0.108735	valid_1's rmse: 0.436271
Early stopping, best iteration is:
[327]	training's rmse: 0.0973476	valid_1's rmse: 0.436055
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 338765
[LightGBM] [

# Inference

In [11]:
test_summaries = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv")
test_prompts = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv")

test_students = pd.merge(test_summaries, test_prompts, on='prompt_id', how='inner')
test_students.drop(["student_id", "prompt_id"], axis=1, inplace=True)

In [12]:
test_students.full_text = test_students.prompt_question \
                          + " " + tokenizer.sep_token + " " \
                          + test_students.text

test_batch_size = len(test_students.full_text)
test_sub_batch_size = min(batch_size, 500)

X_test = mean_max_pooling(test_students.full_text, test_batch_size, test_sub_batch_size)

In [13]:
prediction = {}

for s in score.columns:
    pred_1 = model_dict[s][0].predict(X_test)
    pred_2 = model_dict[s][1].predict(X_test)
    pred_3 = model_dict[s][2].predict(X_test)
    
    stacked_predictions = np.column_stack((pred_1, pred_2, pred_3))
    average_prediction = np.mean(stacked_predictions, axis=1)
    
    prediction[s] = average_prediction

In [14]:
submission = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv")

submission["content"] = prediction["content"]
submission["wording"] = prediction["wording"]

submission.to_csv("submission.csv", index=False)

submission

Unnamed: 0,student_id,content,wording
0,000000ffffff,-1.142366,-0.863693
1,111111eeeeee,-1.092005,-0.847094
2,222222cccccc,-1.218312,-1.0339
3,333333dddddd,-1.192363,-0.974076
