In [1]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier

  from .autonotebook import tqdm as notebook_tqdm


## Data Load & Split

In [2]:
class Dataset:
    """Load train/test frames and assemble dense feature matrices for a classifier.

    Typical flow: construct -> ``load_train_pickle()`` (or ``load_train_csv()``) ->
    ``set_list()`` -> ``split_data()`` -> ``df_load_*_text_emb()`` ->
    ``concat_*_feature()``.
    """

    def __init__(self, train_file_name, test_file_name):
        """Remember the two file paths; nothing is read from disk yet."""
        self.train_file_name = train_file_name
        self.test_file_name = test_file_name
        # Column groups are filled in by set_list(). emb_list is initialised here as well
        # so that methods falling back to it never hit an AttributeError.
        self.emb_list, self.text_list, self.feature_list = None, None, None
        self.train, self.test = None, None

        print(f"[Init] Dataset initialized with train: {train_file_name}, test: {test_file_name}")

    def get_cls_embedding_batch(self, texts, max_length=256, batch_size=32):
        """Encode ``texts`` and return the position-0 hidden state of every sequence.

        Args:
            texts: Sequence of strings (must support ``len`` and slicing).
            max_length: Token length every sequence is truncated/padded to.
            batch_size: Number of texts sent through the model per forward pass.

        Returns:
            ``np.ndarray`` with one row per input text.
        """
        print(f"[Embedding] Start embedding {len(texts)} texts (batch size: {batch_size})")
        MODEL_NAME = "monologg/koelectra-base-v3-discriminator"
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModel.from_pretrained(MODEL_NAME)
        model.eval()
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model.to(device)
        embeddings = []
        # Inference only: no_grad avoids building autograd graphs for each batch.
        with torch.no_grad():
            for i in tqdm(range(0, len(texts), batch_size), desc="[Embedding batches]"):
                batch_texts = texts[i:i+batch_size]
                inputs = tokenizer(
                    batch_texts,
                    return_tensors='pt',
                    truncation=True,
                    max_length=max_length,
                    padding='max_length'
                )
                inputs = {k: v.to(device) for k, v in inputs.items()}
                outputs = model(**inputs)
                # Keep only the first token's vector and move it off the device right away.
                cls_emb = outputs.last_hidden_state[:, 0, :].cpu().numpy()
                embeddings.append(cls_emb)
        embeddings = np.vstack(embeddings)
        print(f"[Embedding] Completed. Shape: {embeddings.shape}")
        return embeddings

    def load_train_csv(self):
        """Read both files as CSV, keep them on the instance and return ``(train, test)``."""
        print(f"[Load] Loading train CSV: {self.train_file_name}")
        train = pd.read_csv(self.train_file_name)
        print(f"[Load] Loading test CSV: {self.test_file_name}")
        test = pd.read_csv(self.test_file_name)
        # Mirror load_train_pickle(): get_text_emb() and concat_test_feature() read
        # self.train / self.test, so they must be populated by the CSV path as well.
        self.train, self.test = train, test
        print(f"[Load] Loaded train shape: {train.shape}, test shape: {test.shape}")
        return train, test

    def load_train_pickle(self):
        """Unpickle both files, keep them on the instance and return ``(train, test)``.

        SECURITY: ``pickle.load`` can execute arbitrary code embedded in the file.
        Only point this at pickles produced by a trusted pipeline.
        """
        print(f"[Load] Loading train pickle: {self.train_file_name}")
        with open(self.train_file_name, "rb") as f:
            train = pickle.load(f)
        print(f"[Load] Loading test pickle: {self.test_file_name}")
        with open(self.test_file_name, "rb") as f:
            test = pickle.load(f)
        self.train, self.test = train, test
        print(f"[Load] train_col_list: {list(train.columns)}")
        print(f"[Load] test_col_list: {list(test.columns)}")
        return train, test

    def set_list(self, emb_list, feature_list, text_list=None):
        """Record which columns hold stored embeddings, plain features and raw text."""
        self.emb_list = emb_list
        self.feature_list = feature_list
        self.text_list = text_list
        print(f"[Set] emb_list: {emb_list}, feature_list: {feature_list}, text_list: {text_list}")

    def split_data(self, train):
        """Split ``train`` 80/20, stratified on the ``generated`` column.

        ``set_list()`` must have been called first. ``text_list`` may be ``None``
        (its default in ``set_list``) and is then treated as "no text columns".

        Returns:
            ``(X_train, X_val, y_train, y_val)``.
        """
        print("[Split] Splitting data with stratify on 'generated'")
        col_list = self.emb_list + self.feature_list + list(self.text_list or [])
        X = train[col_list]
        y = train['generated']
        X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
        print(f"[Split] X_train: {X_train.shape}, X_val: {X_val.shape}")
        return X_train, X_val, y_train, y_val

    def get_text_emb(self, text_list=None):
        """Embed every column in ``text_list`` from ``self.train`` and join them column-wise.

        Args:
            text_list: Column names to embed; ``None`` falls back to ``self.text_list``.

        Raises:
            ValueError: If there is no column to embed.
        """
        if text_list is None:
            text_list = self.text_list
        if not text_list:
            # np.hstack([]) would fail with an unhelpful message; say what is missing.
            raise ValueError("get_text_emb needs at least one text column to embed")
        text_set = []
        print(f"[Text Embedding] Extracting embeddings for: {text_list}")
        for text in text_list:
            print(f"[Text Embedding] Embedding column: {text}")
            tmp = self.get_cls_embedding_batch(self.train[text].tolist())
            text_set.append(tmp)
        print(f"[Text Embedding] All embeddings extracted. Shape: {[arr.shape for arr in text_set]}")
        return np.hstack(text_set)

    @staticmethod
    def _stack_columns(frame, columns, label):
        """Turn each per-row vector column into a 2-D block and join the blocks side by side."""
        blocks = []
        for col in columns:
            print(f"{label}: {col}")
            blocks.append(np.vstack(frame[col].tolist()))
        return np.hstack(blocks)

    def df_load_train_text_emb(self, X_train, X_val, emb_list=None):
        """Build the train and validation embedding matrices from stored vector columns.

        Args:
            X_train, X_val: Frames whose ``emb_list`` columns hold one vector per row.
            emb_list: Column names to use; ``None`` falls back to ``self.emb_list``.

        Returns:
            ``(train_text_matrix, val_text_matrix)``.
        """
        if emb_list is None:
            emb_list = self.emb_list
        print(f"[DF Load] Loading train/val text embeddings for: {emb_list}")
        train_text_matrix = self._stack_columns(X_train, emb_list, "[DF Load] Processing train column")
        val_text_matrix = self._stack_columns(X_val, emb_list, "[DF Load] Processing val column")
        print(f"[DF Load] train_text_matrix shape: {train_text_matrix.shape}, val_text_matrix shape: {val_text_matrix.shape}")
        return train_text_matrix, val_text_matrix

    def df_load_test_text_emb(self, X_test, emb_list=None):
        """Build the test embedding matrix; ``emb_list=None`` falls back to ``self.emb_list``."""
        if emb_list is None:
            emb_list = self.emb_list
        print(f"[DF Load Test] Loading test text embeddings for: {emb_list}")
        test_text_matrix = self._stack_columns(X_test, emb_list, "[DF Load Test] Processing test column")
        print(f"[DF Load Test] test_text_matrix shape: {test_text_matrix.shape}")
        return test_text_matrix

    def concat_train_feature(self, X_train, X_val, train_text_matrix,
                                                    val_text_matrix,
                                                    feature_list):
        """Append the ``feature_list`` columns of each split to its embedding matrix.

        Returns:
            ``(train_full_matrix, val_full_matrix)``.
        """
        print(f"[Concat] Concatenating features: {feature_list}")
        train_feature_matrix = X_train[feature_list].to_numpy()
        train_full_matrix = np.hstack([train_text_matrix, train_feature_matrix])
        val_feature_matrix = X_val[feature_list].to_numpy()
        val_full_matrix = np.hstack([val_text_matrix, val_feature_matrix])
        print(f"[Concat] train_full_matrix shape: {train_full_matrix.shape}, val_full_matrix shape: {val_full_matrix.shape}")
        return train_full_matrix, val_full_matrix

    def concat_test_feature(self, test_text_matrix, feature_list):
        """Append the ``feature_list`` columns of ``self.test`` to ``test_text_matrix``.

        Reads the test frame stored on the instance by one of the ``load_train_*`` methods.
        """
        print(f"[Concat Test] Concatenating test features: {feature_list}")
        test_feature_matrix = self.test[feature_list].to_numpy()
        test_full_matrix = np.hstack([test_text_matrix, test_feature_matrix])
        print(f"[Concat Test] test_full_matrix shape: {test_full_matrix.shape}")
        return test_full_matrix

In [3]:
# Point the loader at the pickled frames, then read both of them into memory.
dataset = Dataset(
    train_file_name="../data/train_emb_change.pkl",
    test_file_name="../data/test_emb_change.pkl",
)
train, test = dataset.load_train_pickle()

[Init] Dataset initialized with train: ../data/train_emb_change.pkl, test: ../data/test_emb_change.pkl
[Load] Loading train pickle: ../data/train_emb_change.pkl
[Load] Loading test pickle: ../data/test_emb_change.pkl
[Load] train_col_list: ['paragraph_index', 'generated', 'paragraph_text_emb', 'full_text_emb', 'adj_emb_change', 'title_emb']
[Load] test_col_list: ['ID', 'title', 'paragraph_index', 'paragraph_text', 'paragraph_text_emb', 'title_emb', 'adj_emb_change']


In [4]:
# Column roles for this run: two stored embedding columns, one extra feature column,
# and no raw-text columns to embed on the fly.
emb_list = ["full_text_emb", "title_emb"]
feature_list = ["adj_emb_change"]
text_list = []
dataset.set_list(emb_list=emb_list, feature_list=feature_list, text_list=text_list)

[Set] emb_list: ['full_text_emb', 'title_emb'], feature_list: ['adj_emb_change'], text_list: []


In [5]:
# Hold-out split of the training frame, stratified on 'generated' (see Dataset.split_data).
X_train, X_val, y_train, y_val = dataset.split_data(train=train)

[Split] Splitting data with stratify on 'generated'
[Split] X_train: (981091, 3), X_val: (245273, 3)


In [6]:
# Stack the per-row embedding vectors of each split into dense 2-D matrices.
train_text_matrix, val_text_matrix = dataset.df_load_train_text_emb(
    X_train=X_train,
    X_val=X_val,
    emb_list=emb_list,
)

[DF Load] Loading train/val text embeddings for: ['full_text_emb', 'title_emb']
[DF Load] Processing train column: full_text_emb
[DF Load] Processing train column: title_emb
[DF Load] Processing val column: full_text_emb
[DF Load] Processing val column: title_emb
[DF Load] train_text_matrix shape: (981091, 1536), val_text_matrix shape: (245273, 1536)


In [7]:
# Append the plain feature columns to the right of the embedding matrices.
train_full_matrix, val_full_matrix = dataset.concat_train_feature(
    X_train=X_train,
    X_val=X_val,
    train_text_matrix=train_text_matrix,
    val_text_matrix=val_text_matrix,
    feature_list=feature_list,
)

[Concat] Concatenating features: ['adj_emb_change']
[Concat] train_full_matrix shape: (981091, 1537), val_full_matrix shape: (245273, 1537)


## Test Features & Train

In [8]:
# `emb_list` names the text embedding `full_text_emb` (the training column), while the
# test frame stores it as `paragraph_text_emb`. Expose it under the training name.
# A non-inplace rename avoids silently mutating a frame that other names also point to.
test = test.rename(columns={"paragraph_text_emb": "full_text_emb"})
# `dataset.test` was the same object as `test`; keep the two in sync after the rebind.
dataset.test = test

In [9]:
# Build the test matrix with the same column layout as training: embeddings, then features.
test_text_matrix = dataset.df_load_test_text_emb(X_test=test, emb_list=emb_list)
test_full_matrix = dataset.concat_test_feature(
    test_text_matrix=test_text_matrix,
    feature_list=feature_list,
)

[DF Load Test] Loading test text embeddings for: ['full_text_emb', 'title_emb']
[DF Load Test] Processing test column: full_text_emb
[DF Load Test] Processing test column: title_emb
[DF Load Test] test_text_matrix shape: (1962, 1536)
[Concat Test] Concatenating test features: ['adj_emb_change']
[Concat Test] test_full_matrix shape: (1962, 1537)


In [None]:
# Gradient-boosted classifier with library-default hyperparameters; only the seed is pinned.
xgb = XGBClassifier(random_state=42)
xgb.fit(X=train_full_matrix, y=y_train)

In [11]:
# predict_proba returns one column per class; column index 1 is used as the score.
val_proba = xgb.predict_proba(val_full_matrix)
val_probs = val_proba[:, 1]
auc = roc_auc_score(y_true=y_val, y_score=val_probs)
print(f"Validation AUC: {auc:.4f}")

Validation AUC: 0.9918


## Inference

In [12]:
# Score the test matrix and keep the same predict_proba column used for validation.
test_proba = xgb.predict_proba(test_full_matrix)
probs = test_proba[:, 1]

## Submission

In [None]:
sample_submission = pd.read_csv('../data/sample_submission.csv', encoding='utf-8-sig')
# NOTE(review): assumes the template rows are in the same order as `test` — confirm.
sample_submission['generated'] = probs

# Create the output folder up front so a finished run is not lost to a missing directory.
submission_path = Path('../output/baseline_submission_change.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
sample_submission.to_csv(submission_path, index=False)

: 