In [1]:
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2

In [2]:
# Mount Google Drive at /content/drive; the dataset and model paths used in the
# cells below all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import torch
from transformers import ElectraTokenizer, ElectraForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from tqdm import tqdm
import torch.nn.functional as F
import os

Mounted at /content/drive


In [None]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import ElectraTokenizer, ElectraForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from tqdm import tqdm
import numpy as np

# Location of the training CSV on the mounted Google Drive
file_path = '/content/drive/MyDrive/Kwargs/050. 기관 점수 예측 모델 (모델 B1)/final_dataset_test.csv'

# Load the full table into memory
data = pd.read_csv(file_path)

# The regression target is the MSCI_Score column ...
y = data['MSCI_Score']

# ... and every remaining column is treated as model input
X = data.drop(columns=['MSCI_Score'])

# Tokenizer that matches the KoELECTRA checkpoint fine-tuned below
tokenizer = ElectraTokenizer.from_pretrained(
    'monologg/koelectra-base-v3-discriminator'
)

# Dataset definition
class FeatureDataset(Dataset):
    """Yield tokenized text, numeric features and the label for one sample.

    Args:
        features: Either a ``pandas.DataFrame`` with a ``full_text`` column, or
            an indexable sequence of dict-like records with a ``full_text`` key.
            Every entry other than ``full_text``/``Company``/``date``/``Year``
            is converted to ``float`` and used as a numeric feature.
        labels: A ``pandas.Series``/``DataFrame`` or any indexable sequence of
            label values aligned position-by-position with ``features``.
        tokenizer: Callable tokenizer that accepts ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` and returns a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        max_len: Length every text is padded/truncated to.
    """

    # Entries that are text or metadata rather than numeric model inputs.
    NON_NUMERIC_COLUMNS = ('full_text', 'Company', 'date', 'Year')

    def __init__(self, features, labels, tokenizer, max_len):
        # pandas inputs get a fresh 0..n-1 index; other containers are kept as given.
        if isinstance(features, pd.DataFrame):
            self.features = features.reset_index(drop=True)
        else:
            self.features = features

        if isinstance(labels, (pd.Series, pd.DataFrame)):
            self.labels = labels.reset_index(drop=True)
        else:
            self.labels = labels

        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask``,
        ``numeric_features`` and ``labels`` tensors for sample ``idx``."""
        if isinstance(self.features, pd.DataFrame):
            # Materialise the row once instead of once per extracted field.
            row = self.features.iloc[idx]
            text = row['full_text']
            # errors='ignore' makes a missing metadata column harmless, matching
            # the dict branch below, which also just skips absent keys.
            numeric_features = (
                row.drop(labels=list(self.NON_NUMERIC_COLUMNS), errors='ignore')
                .astype(float)
                .to_numpy()
            )
        else:
            # Sequence of dict-like records.
            record = self.features[idx]
            text = record['full_text']
            numeric_features = np.array(
                [
                    float(value)
                    for key, value in record.items()
                    if key not in self.NON_NUMERIC_COLUMNS
                ],
                dtype=float,
            )

        if isinstance(self.labels, (pd.Series, pd.DataFrame)):
            label = self.labels.iloc[idx]
        else:
            label = self.labels[idx]

        # Calling the tokenizer directly replaces the deprecated encode_plus().
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            # The tokenizer returns a leading batch axis of size 1; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'numeric_features': torch.tensor(numeric_features, dtype=torch.float),
            'labels': torch.tensor(label, dtype=torch.float)
        }

# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable
train_features, val_features, train_labels, val_labels = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Values tried by the hyperparameter search, plus the maximum epochs per configuration
num_epochs = 50
learning_rate_values = [2e-5, 3e-5, 5e-5]
batch_size_values = [16, 32]
max_len_values = [128, 256]

# Prefer the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Helper that rewrites every parameter tensor into contiguous memory
def make_contiguous(model):
    """Replace each parameter's ``.data`` with its ``.contiguous()`` version.

    Args:
        model: Object exposing ``parameters()`` (e.g. a ``torch.nn.Module``).
            It is modified in place; nothing is returned.
    """
    for weight in model.parameters():
        dense = weight.data.contiguous()
        weight.data = dense

# Training routine
def train(model, train_dataloader, optimizer, device):
    """Run one training epoch and return the mean per-batch MSE loss.

    The prediction for a sample is the model's single logit plus the sum of the
    sample's numeric features; that sum is regressed onto the label with MSE.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose result has a ``.logits`` tensor with one value per sample
            (the notebook loads the model with ``num_labels=1``).
        train_dataloader: Sized iterable of batches; each batch is a dict with
            ``input_ids``, ``attention_mask``, ``numeric_features`` and
            ``labels`` tensors.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        numeric_features = batch['numeric_features'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        # Text branch: one score per sample from the KoELECTRA head.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Force a (batch, 1) column. A bare squeeze() would turn a final batch
        # of size 1 into a 0-d tensor and make the concatenation below fail.
        text_score = outputs.logits.reshape(-1, 1)

        # Put the text score next to the numeric features, column-wise.
        combined_output = torch.cat((text_score, numeric_features), dim=1)

        # MSE between the row-wise sum and the label.
        loss = F.mse_loss(combined_output.sum(dim=1), labels)

        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_dataloader)

# Evaluation routine
def evaluate(model, val_dataloader, device):
    """Score the model on a validation loader without updating weights.

    Predictions are formed as in training: the model's single logit plus the
    sum of the sample's numeric features.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose result has a ``.logits`` tensor with one value per sample
            (the notebook loads the model with ``num_labels=1``).
        val_dataloader: Sized iterable of batches; each batch is a dict with
            ``input_ids``, ``attention_mask``, ``numeric_features`` and
            ``labels`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean_loss, all_labels, all_preds)``: the mean per-batch MSE,
        and flat lists of the labels and predictions in loader order.
    """
    model.eval()
    total_loss = 0
    all_labels = []
    all_preds = []
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            numeric_features = batch['numeric_features'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Force a (batch, 1) column. A bare squeeze() would turn a final
            # batch of size 1 into a 0-d tensor and break the concatenation.
            text_score = outputs.logits.reshape(-1, 1)

            combined_output = torch.cat((text_score, numeric_features), dim=1)
            # Compute the prediction once and reuse it for loss and bookkeeping.
            preds = combined_output.sum(dim=1)
            loss = F.mse_loss(preds, labels)

            total_loss += loss.item()
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

    return total_loss / len(val_dataloader), all_labels, all_preds

# Hyperparameter search loop: fine-tune one model per (max_len, batch_size,
# learning_rate) combination, with early stopping on the validation loss.
for max_len in max_len_values:
    for batch_size in batch_size_values:
        for learning_rate in learning_rate_values:
            print(f"Training with max_len={max_len}, batch_size={batch_size}, learning_rate={learning_rate}")

            # Drop the previous configuration's model and optimizer before
            # building new ones, so two models are never held on the GPU at once.
            model = None
            optimizer = None
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            # Re-seed per configuration so the freshly initialised classifier
            # head and the shuffle order are repeatable (same seed as the split).
            torch.manual_seed(42)

            # Datasets tokenized at this configuration's max_len
            train_dataset = FeatureDataset(train_features, train_labels, tokenizer, max_len)
            val_dataset = FeatureDataset(val_features, val_labels, tokenizer, max_len)

            # Only the training loader is shuffled
            train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
            val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

            # Fresh KoELECTRA model with a single-output (regression) head
            model = ElectraForSequenceClassification.from_pretrained('monologg/koelectra-base-v3-discriminator', num_labels=1)
            model.to(device)
            # torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
            # weight_decay are set to that class's defaults to keep its settings.
            optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)

            # Early-stopping state
            best_val_loss = float('inf')
            patience = 5
            patience_counter = 0

            # The checkpoint folder depends only on the configuration, so build it once.
            best_model_dir = f"/content/drive/MyDrive/Kwargs/050. 기관 점수 예측 모델 (모델 B1)/best_model_len{max_len}_batch{batch_size}_lr{learning_rate}"

            # Train / validate until the epoch budget or the patience runs out
            for epoch in range(num_epochs):
                print(f"Epoch {epoch + 1}/{num_epochs}")
                train_loss = train(model, train_dataloader, optimizer, device)
                val_loss, val_labels_out, val_preds = evaluate(model, val_dataloader, device)
                print(f"Training Loss: {train_loss:.4f}")
                print(f"Validation Loss: {val_loss:.4f}")

                if val_loss < best_val_loss:
                    print(f"Validation loss improved from {best_val_loss:.4f} to {val_loss:.4f}. Saving model...")
                    best_val_loss = val_loss
                    patience_counter = 0

                    # Overwrite this configuration's checkpoint with the new best weights
                    os.makedirs(best_model_dir, exist_ok=True)
                    make_contiguous(model)
                    model.save_pretrained(best_model_dir)
                    tokenizer.save_pretrained(best_model_dir)
                    print(f"Best model and tokenizer saved to {best_model_dir}")
                else:
                    patience_counter += 1
                    print(f"Validation loss did not improve. Patience counter: {patience_counter}/{patience}")

                if patience_counter >= patience:
                    print("Early stopping triggered. Stopping training...")
                    break




Training with max_len=128, batch_size=16, learning_rate=2e-05


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/50


Training: 100%|██████████| 303/303 [00:57<00:00,  5.25it/s]
Evaluating: 100%|██████████| 76/76 [00:10<00:00,  7.36it/s]


Training Loss: 5.8328
Validation Loss: 2.8752
Validation loss improved from inf to 2.8752. Saving model...
Best model and tokenizer saved to /content/drive/MyDrive/Kwargs/050. 기관 점수 예측 모델 (모델 B1)/best_model_len128_batch16_lr2e-05
Epoch 2/50


Training: 100%|██████████| 303/303 [00:58<00:00,  5.14it/s]
Evaluating: 100%|██████████| 76/76 [00:10<00:00,  7.22it/s]


Training Loss: 2.4812
Validation Loss: 2.3825
Validation loss improved from 2.8752 to 2.3825. Saving model...
Best model and tokenizer saved to /content/drive/MyDrive/Kwargs/050. 기관 점수 예측 모델 (모델 B1)/best_model_len128_batch16_lr2e-05
Epoch 3/50


Training: 100%|██████████| 303/303 [00:58<00:00,  5.15it/s]
Evaluating: 100%|██████████| 76/76 [00:10<00:00,  7.28it/s]


Training Loss: 1.9903
Validation Loss: 2.3327
Validation loss improved from 2.3825 to 2.3327. Saving model...
Best model and tokenizer saved to /content/drive/MyDrive/Kwargs/050. 기관 점수 예측 모델 (모델 B1)/best_model_len128_batch16_lr2e-05
Epoch 4/50


Training: 100%|██████████| 303/303 [00:58<00:00,  5.16it/s]
Evaluating: 100%|██████████| 76/76 [00:10<00:00,  7.21it/s]


Training Loss: 1.6659
Validation Loss: 2.5637
Validation loss did not improve. Patience counter: 1/5
Epoch 5/50


Training: 100%|██████████| 303/303 [00:58<00:00,  5.15it/s]
Evaluating: 100%|██████████| 76/76 [00:10<00:00,  7.31it/s]


Training Loss: 1.3392
Validation Loss: 2.4069
Validation loss did not improve. Patience counter: 2/5
Epoch 6/50


Training: 100%|██████████| 303/303 [00:58<00:00,  5.15it/s]
Evaluating: 100%|██████████| 76/76 [00:10<00:00,  7.26it/s]


Training Loss: 1.1364
Validation Loss: 2.3098
Validation loss improved from 2.3327 to 2.3098. Saving model...
Best model and tokenizer saved to /content/drive/MyDrive/Kwargs/050. 기관 점수 예측 모델 (모델 B1)/best_model_len128_batch16_lr2e-05
Epoch 7/50


Training:  86%|████████▋ | 262/303 [00:51<00:07,  5.27it/s]

In [None]:
# Run on the GPU when available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Folder the fine-tuned model and tokenizer are loaded from.
# NOTE(review): the training loop saves checkpoints into per-configuration
# "best_model_len…_batch…_lr…" subfolders of this folder, not into the folder
# itself — confirm the chosen checkpoint has been placed here.
output_dir = "/content/drive/MyDrive/Kwargs/050. 기관 점수 예측 모델 (모델 B1)/"

# Load the model, move it to the device and switch it to evaluation mode
model = ElectraForSequenceClassification.from_pretrained(output_dir)
model.to(device)
model.eval()

# Load the tokenizer saved alongside the model
tokenizer = ElectraTokenizer.from_pretrained(output_dir)

# Read the CSV that is to be scored
new_file_path = '/content/drive/MyDrive/Kwargs/050. 기관 점수 예측 모델 (모델 B1)/sample_dataset_test.csv'
new_data = pd.read_csv(new_file_path)

# Inference-time dataset. It mirrors the feature extraction of the training
# dataset but takes no labels, so its constructor signature differs; defining
# it under the same name replaces the training class in the kernel.
class FeatureDataset(Dataset):
    """Yield tokenized text and numeric features for one unlabeled row.

    Args:
        features: ``pandas.DataFrame`` with a ``full_text`` column. Every column
            other than ``full_text``/``Company``/``date``/``Year`` is converted
            to ``float`` and used as a numeric feature.
            NOTE(review): any extra column in the frame (e.g. a target or a
            previously written prediction) is therefore treated as a feature —
            confirm the frame holds only the columns used in training.
        tokenizer: Callable tokenizer that accepts ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` and returns a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        max_len: Length every text is padded/truncated to.
    """

    # Columns that are text or metadata rather than numeric model inputs.
    NON_NUMERIC_COLUMNS = ('full_text', 'Company', 'date', 'Year')

    def __init__(self, features, tokenizer, max_len):
        self.features = features
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask`` and
        ``numeric_features`` tensors for the row at position ``idx``."""
        # Materialise the row once instead of once per extracted field.
        row = self.features.iloc[idx]
        text = row['full_text']

        # Everything that is not text/metadata is a numeric feature; a missing
        # metadata column is tolerated rather than raising KeyError.
        numeric_features = (
            row.drop(labels=list(self.NON_NUMERIC_COLUMNS), errors='ignore')
            .astype(float)
            .to_numpy()
        )

        # Calling the tokenizer directly replaces the deprecated encode_plus().
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            # The tokenizer returns a leading batch axis of size 1; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'numeric_features': torch.tensor(numeric_features, dtype=torch.float),
        }

# Inference settings.
# NOTE(review): max_len presumably has to match the value the loaded checkpoint
# was trained with — confirm against the chosen configuration.
batch_size = 16
max_len = 128

# Wrap the new rows in the inference dataset
predict_dataset = FeatureDataset(features=new_data, tokenizer=tokenizer, max_len=max_len)

# Keep the row order (no shuffling) so predictions line up with new_data
predict_dataloader = DataLoader(
    dataset=predict_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# Prediction routine
def predict(model, dataloader, device):
    """Return one prediction per sample, in loader order.

    Each prediction is the model's single logit plus the sum of the sample's
    numeric features.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose result has a ``.logits`` tensor with one value per sample.
        dataloader: Iterable of batches; each batch is a dict with
            ``input_ids``, ``attention_mask`` and ``numeric_features`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        Flat list of NumPy scalar predictions.
    """
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Predicting"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            numeric_features = batch['numeric_features'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Force a (batch, 1) column. A bare squeeze() would turn a final
            # batch of size 1 into a 0-d tensor and break the concatenation.
            text_score = outputs.logits.reshape(-1, 1)

            combined_output = torch.cat((text_score, numeric_features), dim=1)
            # Final score: row-wise sum of the text score and numeric features.
            final_output = combined_output.sum(dim=1)

            predictions.extend(final_output.cpu().numpy())

    return predictions

# Score every row of the new dataset
predictions = predict(model, predict_dataloader, device)

# Bucket the predicted values into 10 equal-width bins and show the counts
predicted_label_distribution = pd.Series(predictions).value_counts(bins=10)
print("Predicted label distribution:", predicted_label_distribution, sep="\n")


Predicting: 100%|██████████| 441/441 [01:25<00:00,  5.13it/s]

Predicted label distribution:
(3.645, 4.828]      1533
(1.279, 2.462]      1502
(4.828, 6.011]      1348
(2.462, 3.645]       955
(0.0965, 1.279]      873
(6.011, 7.194]       509
(-1.086, 0.0965]     155
(7.194, 8.377]       148
(8.377, 9.56]         15
(-2.282, -1.086]      12
Name: count, dtype: int64





In [None]:
# Wrap the predictions in a Series so the pandas null checks can be used
predictions_series = pd.Series(predictions)

# Count the entries that are NaN/None
missing_values = predictions_series.isna().sum()
print(f"Number of missing values in predictions: {missing_values}")

Number of missing values in predictions: 0


In [None]:
# Attach the predictions as a new column on a copy and leave new_data untouched.
# predict_dataset reads its numeric features straight from new_data, so adding
# the column in place would make a re-run of predict() feed the previous
# predictions back in as an extra feature.
predictions_df = new_data.assign(predicted_label=predictions)

# Save the rows together with their predictions (utf-8-sig for the Korean text)
output_file_path = '/content/drive/MyDrive/Kwargs/050. 기관 점수 예측 모델 (모델 B1)/결과/predictions.csv'
# to_csv does not create missing folders, so make sure the target folder exists.
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
predictions_df.to_csv(output_file_path, index=False, encoding='utf-8-sig')

print(f"Predictions saved to {output_file_path}")


Predictions saved to /content/drive/MyDrive/Kwargs/모델B1/결과/predictions.csv


In [None]:
# Path of the per-row predictions written earlier
file_path = '/content/drive/MyDrive/Kwargs/050. 기관 점수 예측 모델 (모델 B1)/결과/predictions.csv'

# Load the file and keep only the columns needed for the aggregation
data = pd.read_csv(file_path).loc[:, ['Year', 'Company', 'predicted_label']]

# Mean predicted label per (Company, Year) pair
grouped_data = (
    data.groupby(['Company', 'Year'])['predicted_label']
    .mean()
    .rename('average_label')
    .reset_index()
)
print(grouped_data.to_string())

     Company  Year  average_label
0       KB금융  2019       5.016282
1       KB금융  2020       5.109671
2       KB금융  2021       5.996064
3       KB금융  2022       6.966534
4       KB금융  2023       7.012989
5      NAVER  2019       4.815658
6      NAVER  2020       4.797409
7      NAVER  2021       6.733145
8      NAVER  2022       6.642238
9      NAVER  2023       5.643522
10    SK하이닉스  2019       2.097431
11    SK하이닉스  2020       3.649572
12    SK하이닉스  2021       3.710508
13    SK하이닉스  2022       4.903346
14    SK하이닉스  2023       4.920477
15        기아  2019       1.796560
16        기아  2020       1.301239
17        기아  2021       2.058400
18        기아  2022       1.960288
19        기아  2023       2.774900
20     삼성SDI  2019       4.350137
21     삼성SDI  2020       4.419508
22     삼성SDI  2021       4.213435
23     삼성SDI  2022       4.739213
24     삼성SDI  2023       4.776360
25      삼성물산  2019       3.249006
26      삼성물산  2020       4.275316
27      삼성물산  2021       4.703798
28      삼성물산  

In [None]:
# Write the per-company, per-year averages to their own CSV file.
output_file_path = '/content/drive/MyDrive/Kwargs/050. 기관 점수 예측 모델 (모델 B1)/결과/average_predictions.csv'
# Use the same utf-8-sig encoding as the predictions.csv export so both files
# carry the Korean company names in the same encoding.
grouped_data.to_csv(output_file_path, index=False, encoding='utf-8-sig')

print(f"결과가 {output_file_path}에 저장되었습니다.")

결과가 /content/drive/MyDrive/Kwargs/모델B1/결과/average_predictions.csv에 저장되었습니다.
