In [1]:
# Necessary library installation
!pip install transformers torch
!pip install --upgrade gluonnlp pandas tqdm
!pip install accelerate -U

#Library Import
import pandas as pd
import csv
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler, SequentialSampler, random_split
import torch.nn as nn
import torch.optim as optim
from transformers import AutoModel, AutoTokenizer, Trainer, TrainingArguments
from transformers import ElectraModel, ElectraTokenizer
import numpy as np
from tqdm.auto import tqdm
from google.colab import drive
import glob
import os
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, mean_squared_error

# Google Drive Mount
drive.mount('/content/drive')

Collecting gluonnlp
  Downloading gluonnlp-0.10.0.tar.gz (344 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m344.5/344.5 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pandas
  Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m117.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: gluonnlp
  Building wheel for gluonnlp (setup.py) ... [?25l[?25hdone
  Created wheel for gluonnlp: filename=gluonnlp-0.10.0-cp310-cp310-linux_x86_64.whl size=661659 sha256=caee3433dfa9acbdbd35bccf31563c08be3b4ad09605256e73f410da00f9879f
  Stored in directory: /root/.cache/pip/wheels/1a/1e/0d/99f55911d90f2b95b9f7c176d5813ef3622894a4b30fde6bd3
Successfully built gluonn

#Training and evaluation

In [None]:
import os
import torch
import pandas as pd
from transformers import ElectraTokenizer, ElectraModel, AdamW
from torch.utils.data import DataLoader, TensorDataset, random_split, RandomSampler, SequentialSampler
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
import torch.nn as nn

# KoelectRa Talk Niser and Model
tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-discriminator")

# Definition of model for regression
class ElectraForCompanyRegression(nn.Module):
    def __init__(self, model_name):
        super(ElectraForCompanyRegression, self).__init__()
        self.electra = ElectraModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        self.regressor = nn.Linear(self.electra.config.hidden_size + 768, 1)

    def forward(self, input_ids, attention_mask, company_embeddings):
        outputs = self.electra(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]  # [batch_size, hidden_size]
        combined_output = torch.cat((pooled_output, company_embeddings), dim=1)
        combined_output = self.dropout(combined_output)
        logits = self.regressor(combined_output)
        return logits

# Creation of corporate name embedding
def get_company_embedding(company_name, tokenizer, model):
    inputs = tokenizer(company_name, return_tensors="pt").to('cuda')  # Ensure input is on GPU
    with torch.no_grad():
        outputs = model.electra(**inputs)
    company_embedding = outputs.last_hidden_state[:, 0, :]
    return company_embedding

def preprocess_with_company(data, tokenizer, model, max_len=256):
    texts = data["full_text"].astype(str).tolist()
    labels = data["evaluation_1"].fillna(0).astype(float).tolist()
    company_names = data["기업명"].astype(str).tolist()

    inputs = tokenizer(
        texts,
        max_length=max_len,
        padding=True,
        truncation=True,
        return_tensors="pt"
    ).to('cuda')  # Ensure inputs are on GPU

    labels = torch.tensor(labels, dtype=torch.float).to('cuda')  # Move labels to GPU

    # Creation of corporate name embedding 및 병합
    company_embeddings = []
    for company_name in tqdm(company_names, desc="Generating company embeddings"):
        embedding = get_company_embedding(company_name, tokenizer, model).to('cuda')
        company_embeddings.append(embedding)
    company_embeddings = torch.cat(company_embeddings, dim=0)

    return inputs, labels, company_embeddings

# Dataset Road
file_path = '/content/drive/MyDrive/Kwargs/적합성/anonymized_data.csv'
data = pd.read_csv(file_path)

#Model initialization
model_name = "monologg/koelectra-base-discriminator"
model = ElectraForCompanyRegression(model_name).to('cuda')  # Ensure model is on GPU

# Data pretreatment
print("Preprocessing data...")
inputs, labels, company_embeddings = preprocess_with_company(data, tokenizer, model)

# Tensordataset creation
dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], company_embeddings, labels)

# Dataset split
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create data loader
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=16)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=16)

optimizer = AdamW(model.parameters(), lr=5e-5)

# Move all tensors to GPU
def to_device(batch, device):
    return [x.to(device) for x in batch]

# Training function
def train(model, train_dataloader, optimizer, device):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc="Training"):
        batch = to_device(batch, device)
        input_ids, attention_mask, company_embeddings, labels = batch
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask, company_embeddings).squeeze()
        loss = nn.MSELoss()(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_dataloader)

# Evaluation function
def evaluate(model, val_dataloader, device):
    model.eval()
    total_loss = 0
    all_labels = []
    all_preds = []
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Evaluating"):
            batch = to_device(batch, device)
            input_ids, attention_mask, company_embeddings, labels = batch
            logits = model(input_ids, attention_mask, company_embeddings).squeeze()
            loss = nn.MSELoss()(logits, labels)
            total_loss += loss.item()
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(logits.cpu().numpy())
    mse = mean_squared_error(all_labels, all_preds)
    return total_loss / len(val_dataloader), mse

# Training and evaluation loop
num_epochs = 3
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_loss = train(model, train_dataloader, optimizer, 'cuda')
    val_loss, val_mse = evaluate(model, val_dataloader, 'cuda')
    print(f"Training Loss: {train_loss:.4f}")
    print(f"Validation Loss: {val_loss:.4f}")
    print(f"Validation MSE: {val_mse:.4f}")

#Save model
    epoch_output_dir = f"/content/drive/MyDrive/Kwargs/적합성/모델_epoch_{epoch + 1}"
    if not os.path.exists(epoch_output_dir):
        os.makedirs(epoch_output_dir)

#Save model weight
    torch.save(model.state_dict(), os.path.join(epoch_output_dir, "model.pt"))

#Save talk nisor
    tokenizer.save_pretrained(epoch_output_dir)
    print(f"Model and tokenizer saved to {epoch_output_dir} at Epoch {epoch + 1}")


Preprocessing data...


Generating company embeddings: 100%|██████████| 85771/85771 [14:00<00:00, 102.00it/s]


Epoch 1/3


Training: 100%|██████████| 4289/4289 [12:02<00:00,  5.94it/s]
Evaluating: 100%|██████████| 1073/1073 [00:57<00:00, 18.56it/s]


Training Loss: 0.0868
Validation Loss: 0.0785
Validation MSE: 0.0785
Model and tokenizer saved to /content/drive/MyDrive/Kwargs/적합성/모델_epoch_1 at Epoch 1
Epoch 2/3


Training: 100%|██████████| 4289/4289 [12:02<00:00,  5.94it/s]
Evaluating: 100%|██████████| 1073/1073 [00:57<00:00, 18.55it/s]


Training Loss: 0.1168
Validation Loss: 0.1231
Validation MSE: 0.1231
Model and tokenizer saved to /content/drive/MyDrive/Kwargs/적합성/모델_epoch_2 at Epoch 2
Epoch 3/3


Training: 100%|██████████| 4289/4289 [12:02<00:00,  5.94it/s]
Evaluating:  94%|█████████▍| 1006/1073 [00:54<00:03, 18.51it/s]

# Evaluation of low data with the completed model

In [2]:
import torch
import pandas as pd
from transformers import ElectraTokenizer
from torch.utils.data import DataLoader, TensorDataset, SequentialSampler
from tqdm.auto import tqdm

# Import the model definition class (assuming to be defined above)
class ElectraForCompanyRegression(nn.Module):
    def __init__(self, model_name):
        super(ElectraForCompanyRegression, self).__init__()
        self.electra = ElectraModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        self.regressor = nn.Linear(self.electra.config.hidden_size + 768, 1)

    def forward(self, input_ids, attention_mask, company_embeddings):
        outputs = self.electra(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]  # [batch_size, hidden_size]
        combined_output = torch.cat((pooled_output, company_embeddings), dim=1)
        combined_output = self.dropout(combined_output)
        logits = self.regressor(combined_output)
        return logits

# Model and Talk Niser Road
model_path = '/content/drive/MyDrive/Kwargs/적합성/모델_epoch_1'
model_name = "monologg/koelectra-base-discriminator"
model = ElectraForCompanyRegression(model_name)
model.load_state_dict(torch.load(f"{model_path}/model.pt"))
model.to('cuda')
model.eval()

tokenizer = ElectraTokenizer.from_pretrained(model_path)

# Creation of corporate name embedding
def get_company_embedding(company_name, tokenizer, model):
    inputs = tokenizer(company_name, return_tensors="pt").to('cuda')  # Ensure input is on GPU
    with torch.no_grad():
        outputs = model.electra(**inputs)
    company_embedding = outputs.last_hidden_state[:, 0, :]
    return company_embedding

# New data pretreatment
def preprocess_with_company(data, tokenizer, model, max_len=256):
    texts = data["full_text"].astype(str).tolist()
    company_names = data["기업명"].astype(str).tolist()

    inputs = tokenizer(
        texts,
        max_length=max_len,
        padding=True,
        truncation=True,
        return_tensors="pt"
    ).to('cuda')  # Ensure inputs are on GPU

    # Creation of corporate name embedding 및 병합
    company_embeddings = []
    for company_name in tqdm(company_names, desc="Generating company embeddings"):
        embedding = get_company_embedding(company_name, tokenizer, model).to('cuda')
        company_embeddings.append(embedding)
    company_embeddings = torch.cat(company_embeddings, dim=0)

    return inputs, company_embeddings

# New CSV Data Road
file_path = '/content/drive/MyDrive/Kwargs/뉴스 크롤링/csv/LG에너지솔루션_news_data.csv'  # 예측할 CSV 파일 경로
data = pd.read_csv(file_path)

# Data pretreatment
print("Preprocessing data...")
inputs, company_embeddings = preprocess_with_company(data, tokenizer, model)

# Tensordataset creation
dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], company_embeddings)

# Create data loader
dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=16)

# Prediction function
def predict(model, dataloader, device):
    model.eval()
    all_preds = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Predicting"):
            batch = [x.to(device) for x in batch]
            input_ids, attention_mask, company_embeddings = batch
            logits = model(input_ids, attention_mask, company_embeddings).squeeze()
            all_preds.extend(logits.cpu().numpy())
    return all_preds

# Prediction using a model
print("Predicting labels for new data...")
predictions = predict(model, dataloader, 'cuda')

#Save the prediction result with the original data
data["evaluation_1"] = predictions
output_path = '/content/drive/MyDrive/Kwargs/적합성/company_related_LG에너지솔루션.csv'
data.to_csv(output_path, index=False)

print(f"Predicted results saved to {output_path}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

  model.load_state_dict(torch.load(f"{model_path}/model.pt"))


Preprocessing data...


KeyError: 'full_text'