In [1]:
import os
import pickle
import re
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from transformers import AutoTokenizer, AutoModel

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import TruncatedSVD

from xgboost import XGBClassifier
from collections import Counter
from nltk import ngrams
# Register tqdm's pandas integration (adds `progress_apply` and friends to
# pandas objects). NOTE(review): no `progress_*` call appears in the cells
# visible here — confirm a later cell relies on it.
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the training CSV into a DataFrame. The preview that follows shows the
# columns title, paragraph_index, paragraph_text and generated, plus an
# "Unnamed: 0" column — presumably the file was written with its index;
# TODO confirm and consider `index_col=0`.
split_df = pd.read_csv(filepath_or_buffer="../data/train_paragraph.csv")

In [3]:
# Preview the first five rows (the default for head) as a sanity check of the load.
split_df.head(n=5)

Unnamed: 0,title,paragraph_index,paragraph_text,generated
0,카호올라웨섬,0,카호올라웨섬은 하와이 제도를 구성하는 8개의 화산섬 가운데 하나로 면적은 115.5...,0
1,카호올라웨섬,1,마우이섬에서 남서쪽으로 약 11km 정도 떨어진 곳에 위치하며 라나이섬의 남동쪽에 ...,0
2,카호올라웨섬,2,1000년경부터 사람이 거주했으며 해안 지대에는 소규모 임시 어촌이 형성되었다. 섬...,0
3,카호올라웨섬,3,1830년대에는 하와이 왕국의 카메하메하 3세 국왕에 의해 남자 죄수들의 유형지로 ...,0
4,카호올라웨섬,4,1910년부터 1918년까지 하와이 준주가 섬의 원래 모습을 복원하기 위해 이 섬을...,0


In [4]:
# Hub identifier of the pretrained checkpoint used as the text encoder
# (it loads as an ElectraModel, per this cell's output).
MODEL_NAME = "monologg/koelectra-base-v3-discriminator"

# Use the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# eval() returns the module itself, so loading and switching to inference
# mode (dropout disabled) can be chained.
model = AutoModel.from_pretrained(MODEL_NAME).eval()

# Moves the weights to `device`; kept as the cell's last expression so the
# notebook still echoes the module summary.
model.to(device)



ElectraModel(
  (embeddings): ElectraEmbeddings(
    (word_embeddings): Embedding(35000, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): ElectraEncoder(
    (layer): ModuleList(
      (0-11): 12 x ElectraLayer(
        (attention): ElectraAttention(
          (self): ElectraSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): ElectraSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0

In [5]:
def get_cls_embedding(texts, tokenizer, model, device, max_length=256):
    """Embed each text with the last-layer hidden state of its first token.

    Texts are tokenized and encoded one at a time; the vector at sequence
    position 0 (the [CLS] position for BERT/ELECTRA-style tokenizers) is
    taken as the text embedding. Gradients are disabled for the whole loop.
    The function does not change the model's train/eval mode, so the caller
    should have put the model in eval mode beforehand.

    Args:
        texts: Iterable of strings. A single bare string is treated as one
            text rather than being iterated character by character.
        tokenizer: Callable returning a mapping of tensors when invoked with
            ``return_tensors='pt'`` (e.g. a Hugging Face tokenizer).
        model: Encoder whose output exposes ``last_hidden_state`` with shape
            (batch, seq_len, hidden_dim).
        device: torch device the tokenized inputs are moved to; the model is
            expected to already live there.
        max_length: Truncation / padding length in tokens.

    Returns:
        np.ndarray of shape (n_texts, hidden_dim). For empty input an empty
        float32 array of shape (0, hidden_dim) is returned, where hidden_dim
        is read from ``model.config.hidden_size`` when available, else 0.
    """
    if isinstance(texts, str):
        texts = [texts]

    embeddings = []
    with torch.no_grad():
        for text in tqdm(texts, desc="Embedding"):
            # padding='max_length' is kept so results stay comparable with
            # embeddings produced by earlier runs of this notebook.
            inputs = tokenizer(
                text, return_tensors='pt', truncation=True,
                max_length=max_length, padding='max_length'
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}
            outputs = model(**inputs)
            # Keep the (1, hidden_dim) batch axis; np.vstack stacks the rows,
            # so the former squeeze() was redundant.
            cls_emb = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.append(cls_emb)

    if not embeddings:
        # np.vstack([]) raises ValueError; return a well-formed empty matrix.
        hidden_dim = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_dim), dtype=np.float32)
    return np.vstack(embeddings)

In [14]:
def get_meanpool_embedding_batch(texts, tokenizer, model, device, max_length=256, batch_size=32):
    """Embed texts by attention-masked mean pooling of the last hidden states.

    Texts are tokenized and encoded in batches of ``batch_size``. For every
    text, the last-layer token vectors are averaged over the positions where
    the attention mask is 1, so padding does not contribute. Gradients are
    disabled. The function does not change the model's train/eval mode; the
    caller should have called ``model.eval()`` so dropout is inactive.

    Args:
        texts: Iterable of strings (list, Series, ndarray, generator, ...).
            A single bare string is treated as one text.
        tokenizer: Callable returning a mapping of tensors that includes
            ``'attention_mask'`` when invoked with ``return_tensors='pt'``.
        model: Encoder whose output exposes ``last_hidden_state`` with shape
            (batch, seq_len, hidden_dim).
        device: torch device the tokenized inputs are moved to; the model is
            expected to already live there.
        max_length: Truncation / padding length in tokens.
        batch_size: Number of texts encoded per forward pass (>= 1).

    Returns:
        np.ndarray of shape (n_texts, hidden_dim). For empty input an empty
        float32 array of shape (0, hidden_dim) is returned, where hidden_dim
        is read from ``model.config.hidden_size`` when available, else 0.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Materialise the input so len() and slicing work for any iterable and
    # the tokenizer always receives a plain list of strings.
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        # np.vstack([]) raises ValueError; return a well-formed empty matrix.
        hidden_dim = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_dim), dtype=np.float32)

    embeddings = []
    with torch.no_grad():
        for start in tqdm(range(0, len(texts), batch_size), desc="Embedding (batch)"):
            batch_texts = texts[start:start + batch_size]
            # padding='max_length' is kept so results stay comparable with
            # embeddings produced by earlier runs of this notebook.
            inputs = tokenizer(
                batch_texts,
                return_tensors='pt',
                truncation=True,
                max_length=max_length,
                padding='max_length'
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}
            last_hidden = model(**inputs).last_hidden_state  # (batch, seq_len, hidden_dim)
            # (batch, seq_len, 1); broadcasting over hidden_dim replaces the explicit expand().
            mask = inputs['attention_mask'].unsqueeze(-1).float()
            summed = (last_hidden * mask).sum(dim=1)
            # Clamp so a row with no attended tokens yields zeros instead of 0/0 = NaN.
            counts = mask.sum(dim=1).clamp(min=1e-9)
            embeddings.append((summed / counts).cpu().numpy())  # (batch, hidden_dim)
    return np.vstack(embeddings)


In [None]:
# Embed every paragraph with the mean-pooling helper. list() splits the
# returned 2-D array into one row vector per DataFrame row so it can be
# stored as an object column.
split_df['paragraph_text_emb'] = list(
    get_meanpool_embedding_batch(
        texts=split_df['paragraph_text'].tolist(),
        tokenizer=tokenizer,
        model=model,
        device=device,
    )
)

In [21]:
# Cache the frame (with the 'paragraph_text_emb' column added by the
# embedding cell) so the expensive embedding pass need not be repeated.
split_df.to_pickle(path="../data/train_paragraph_emb.pkl")

In [22]:
# Reload the cached frame written by the to_pickle cell.
# NOTE: unpickling can execute arbitrary code embedded in the file — only
# load pickles produced by this notebook or another trusted source.
with open("../data/train_paragraph_emb.pkl", mode="rb") as f:
    split_df = pickle.Unpickler(f).load()