In [None]:
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import torch
from transformers import ElectraTokenizer, ElectraForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import torch.nn.functional as F
import os

# Read the CSV file
file_path = '/content/drive/MyDrive/Kwargs/combined_labeled_data_balanced_0719.csv'
data = pd.read_csv(file_path)

# Increase the label value by 10 and convert it to a value between 0 and 20
data['label'] = ((data['label'] + 1) * 10).astype(int)

# Text and label extraction
texts = data['text'].tolist()
labels = data['label'].tolist()

#KoelectRA Talk Niser Road
tokenizer = ElectraTokenizer.from_pretrained('monologg/koelectra-base-v3-discriminator')

# Dataset class definition
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Separation of learning set and verification set
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Parameter setting
max_len = 128
batch_size = 32  # 배치 크기 증가
learning_rate = 3e-5  # 학습률 감소

# Creation of dataset objects
train_dataset = TextDataset(train_texts, train_labels, tokenizer, max_len)
val_dataset = TextDataset(val_texts, val_labels, tokenizer, max_len)

#Dataloader
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

# Koelectra Model Road
model = ElectraForSequenceClassification.from_pretrained('monologg/koelectra-base-v3-discriminator', num_labels=21)

# setting
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

def label_smoothing(targets, num_classes, smoothing=0.1):
    confidence = 1.0 - smoothing
    smooth_value = smoothing / num_classes
    with torch.no_grad():
        one_hot = torch.zeros(size=(targets.size(0), num_classes), device=targets.device)
        one_hot.fill_(smooth_value)
        targets = one_hot.scatter_(1, targets.unsqueeze(1).long(), confidence)
    return targets

# Training function correction
def train(model, train_dataloader, optimizer, device, num_classes, smoothing=0.1):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        smoothed_labels = label_smoothing(labels, num_classes, smoothing)
        loss = F.kl_div(F.log_softmax(logits, dim=-1), smoothed_labels, reduction='batchmean')

        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_dataloader)

# Evaluation function correction
def evaluate(model, val_dataloader, device):
    model.eval()
    total_loss = 0
    all_labels = []
    all_preds = []
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = F.cross_entropy(logits, labels)
            total_loss += loss.item()
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds)
    accuracy = accuracy_score(all_labels, all_preds)
    return total_loss / len(val_dataloader), accuracy

# Training and evaluation
num_epochs = 3
num_classes = 21  # 클래스 수
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_loss = train(model, train_dataloader, optimizer, device, num_classes)
    val_loss, val_accuracy = evaluate(model, val_dataloader, device)
    print(f"Training Loss: {train_loss:.4f}")
    print(f"Validation Loss: {val_loss:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")

#Save model for each Epoch
    epoch_output_dir = f"/content/drive/MyDrive/Kwargs/esg 긍부정도/모델_epoch_{epoch + 1}"

# Create if there is no directory
    if not os.path.exists(epoch_output_dir):
        os.makedirs(epoch_output_dir)

#Save model
    model.save_pretrained(epoch_output_dir)
    tokenizer.save_pretrained(epoch_output_dir)

    print(f"Model and tokenizer saved to {epoch_output_dir} at Epoch {epoch + 1}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/263k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/452M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training: 100%|██████████| 375/375 [05:20<00:00,  1.17it/s]
Evaluating: 100%|██████████| 94/94 [00:39<00:00,  2.36it/s]


Training Loss: 0.1429
Validation Loss: 0.0778
Validation Accuracy: 0.9997
Model and tokenizer saved to /content/drive/MyDrive/Kwargs/esg 긍부정도/모델_epoch_1 at Epoch 1
Epoch 2/3


Training: 100%|██████████| 375/375 [05:29<00:00,  1.14it/s]
Evaluating: 100%|██████████| 94/94 [00:40<00:00,  2.34it/s]


Training Loss: -0.0012
Validation Loss: 0.0777
Validation Accuracy: 1.0000
Model and tokenizer saved to /content/drive/MyDrive/Kwargs/esg 긍부정도/모델_epoch_2 at Epoch 2
Epoch 3/3


Training: 100%|██████████| 375/375 [05:28<00:00,  1.14it/s]
Evaluating: 100%|██████████| 94/94 [00:39<00:00,  2.35it/s]


Training Loss: 0.0045
Validation Loss: 0.0788
Validation Accuracy: 1.0000
Model and tokenizer saved to /content/drive/MyDrive/Kwargs/esg 긍부정도/모델_epoch_3 at Epoch 3


In [None]:
import torch
from transformers import ElectraTokenizer, ElectraForSequenceClassification

#Save model and torque nisor path
output_dir = "/content/drive/MyDrive/Kwargs/esg 긍부정도/모델_epoch_2"

#Model Road
model = ElectraForSequenceClassification.from_pretrained(output_dir)
model.eval()  # 평가 모드로 전환

#Talk Nizor Road
tokenizer = ElectraTokenizer.from_pretrained(output_dir)

# setting
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(35000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): L

In [None]:
import pandas as pd
import torch
from transformers import ElectraTokenizer, ElectraForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import torch.nn.functional as F
import os

# Read a new CSV file
new_file_path = '/content/drive/MyDrive/Kwargs/esg관련도/esg_related_포스코 홀딩스_processed.csv'
new_data = pd.read_csv(new_file_path)

# Text extraction
texts = new_data['content'].astype(str).tolist()

# Definition of predictive function
def predict_label(text, model, tokenizer, device):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)
        predicted_class = torch.argmax(probabilities, dim=-1).item()
        return predicted_class

predictions = []
for text in texts:
    predicted_class = predict_label(text, model, tokenizer, device)
# Convert the prediction result to -1.0, -0.9, ..., 0.9, 1.0
    label = (predicted_class / 10.0) - 1.0
    predictions.append(label)

# Check for predictive value distribution
predicted_label_distribution = pd.Series(predictions).value_counts()
print("Predicted label distribution:")
print(predicted_label_distribution)


Predicted label distribution:
 1.0    317
 0.0     79
-1.0     51
Name: count, dtype: int64


In [None]:
# Add the prediction result to the data frame
new_data['positivity'] = predictions


In [None]:
#Save filtered data
filtered_file_path = '/content/drive/MyDrive/Kwargs/esg 긍부정도/포스코 홀딩스_positivity_filtered.csv'
new_data.to_csv(filtered_file_path, index=False, encoding='utf-8-sig')

print(f"Filtered data saved to {filtered_file_path}")

Filtered data saved to /content/drive/MyDrive/Kwargs/esg 긍부정도/포스코 홀딩스_positivity_filtered.csv


In [2]:
import pandas as pd
import glob
import os

# Weight calculation function
def apply_weight_to_positivity(df):
# Text length calculation
    df['text_length'] = df['full_text'].apply(len)

# Max and maximum value of text length
    min_length = df['text_length'].min()
    max_length = df['text_length'].max()

# Weight calculation: Weigh the shortest article 0.6, the longest article to 1.0
    df['length_weight'] = df['text_length'].apply(lambda x: 0.6 + 0.4 * (x - min_length) / (max_length - min_length))

# Apply weight to positivity
    df['weighted_positivity'] = df['positivity'] * df['length_weight']

    df.drop(columns=['positivity', 'text_length', 'length_weight'], inplace=True)

    return df

# Set the original file path
input_path = "/content/drive/MyDrive/Kwargs/023. esg 긍부정도 라벨러 (모델A)/"
# Set the final storage folder path
output_path = "/content/drive/MyDrive/Kwargs/024. esg 긍부정도 라벨(가중치 적용)/"

# If there is no storage path folder, it is created
if not os.path.exists(output_path):
    os.makedirs(output_path)

# Find all files that fit the file pattern
files = glob.glob(os.path.join(input_path, "*_positivity_filtered.csv"))

# Apply weight to each file and store it
for file in files:
# CSV file load
    df = pd.read_csv(file)

# Strength application
    df = apply_weight_to_positivity(df)

# Set up a new file name (change folder)
    file_name = os.path.basename(file)
    new_file_path = os.path.join(output_path, file_name)

#Save the results as CSV in the new path
    df.to_csv(new_file_path, index=False)

    print(f"Processed and saved: {new_file_path}")


Processed and saved: /content/drive/MyDrive/Kwargs/024. esg 긍부정도 라벨(가중치 적용)/셀트리온_positivity_filtered.csv
Processed and saved: /content/drive/MyDrive/Kwargs/024. esg 긍부정도 라벨(가중치 적용)/삼성생명_positivity_filtered.csv
Processed and saved: /content/drive/MyDrive/Kwargs/024. esg 긍부정도 라벨(가중치 적용)/현대차_positivity_filtered.csv
Processed and saved: /content/drive/MyDrive/Kwargs/024. esg 긍부정도 라벨(가중치 적용)/신한지주_positivity_filtered.csv
Processed and saved: /content/drive/MyDrive/Kwargs/024. esg 긍부정도 라벨(가중치 적용)/현대모비스_positivity_filtered.csv
Processed and saved: /content/drive/MyDrive/Kwargs/024. esg 긍부정도 라벨(가중치 적용)/카카오_positivity_filtered.csv
Processed and saved: /content/drive/MyDrive/Kwargs/024. esg 긍부정도 라벨(가중치 적용)/KB금융_positivity_filtered.csv
Processed and saved: /content/drive/MyDrive/Kwargs/024. esg 긍부정도 라벨(가중치 적용)/NAVER_positivity_filtered.csv
Processed and saved: /content/drive/MyDrive/Kwargs/024. esg 긍부정도 라벨(가중치 적용)/삼성SDI_positivity_filtered.csv
Processed and