In [1]:
!pip install wandb
!pip install transformers
!pip install konlpy
!pip install mecab-python
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

Collecting wandb
  Downloading wandb-0.16.5-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.42-py3-none-any.whl (195 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.4/195.4 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.43.0-py2.py3-none-any.whl (264 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m264.6/264.6 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->wa

In [None]:
import torch
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import AdamW
from tqdm import tqdm
import torch.nn as nn
from transformers import  ElectraForSequenceClassification, AutoTokenizer
import pandas as pd
import wandb
import MeCab

run = wandb.init(project='roberta')


def remove_single_characters(text):
    # 공백을 기준으로 텍스트를 단어로 분리
    words = text.split()
    # 1글자가 아닌 단어만 선택
    filtered_words = [word for word in words if len(word) > 1]
    # 다시 텍스트로 결합
    return ' '.join(filtered_words)


class ModelTrainer:
    def __init__(self, model, device):
        self.model = model
        self.device = device

    def train(self, train_loader, optimizer):
        self.model.train()
        total_loss = 0
        for batch in tqdm(train_loader):
            batch = tuple(t.to(self.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[2]
            }
            optimizer.zero_grad()
            outputs = self.model(**inputs)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            # 배치 손실을 W&B에 로깅
            wandb.log({"Batch Loss": loss.item()})

        avg_loss = total_loss / len(train_loader)
        # 에폭별 평균 손실을 W&B에 로깅
        wandb.log({"Average Training Loss": avg_loss})
        print(f"Average Training Loss: {avg_loss}")
    def evaluate(self, val_loader):
        self.model.eval()
        predictions, true_vals = [], []
        for batch in val_loader:
            batch = tuple(t.to(self.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1]
            }
            with torch.no_grad():
                outputs = self.model(**inputs)

            logits = outputs.logits
            logits = logits.detach().cpu().numpy()
            label_ids = batch[2].cpu().numpy()
            predictions.append(np.argmax(logits, axis=1).flatten())
            true_vals.append(label_ids.flatten())

        predictions = np.concatenate(predictions)
        true_vals = np.concatenate(true_vals)
        return accuracy_score(true_vals, predictions)

    def run_training_loop(self, train_loader, val_loader, epochs=7):
        optimizer = AdamW(self.model.parameters(), lr=5e-5)

        for epoch in range(epochs):
            print(f"Epoch {epoch+1}/{epochs}")
            self.train(train_loader, optimizer)
            val_acc = self.evaluate(val_loader)
            print(f"Validation Accuracy: {val_acc}")
            wandb.log({"Validation Accuracy": val_acc})




class DataPreprocessor:
    def __init__(self, tokenizer,label_dict:dict = {'정치': 0, '사회': 1, '경제': 2, '국제': 3}):
        self.tokenizer = tokenizer
        self.label_dict = label_dict

    def encode_labels(self, df):
        df['label'] = df['label'].map(self.label_dict)
        return df

    def split_data(self, df):
        return train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

    def preprocess_and_tokenize(self, texts):
        texts_filtered = texts.apply(remove_single_characters)  # 이전에 정의한 함수 사용
        encodings = self.tokenizer(texts_filtered.tolist(), truncation=True, padding=True, max_length=512, return_tensors='pt')
        return encodings

    def create_data_loader(self, df):
        df = self.encode_labels(df)
        X_train, X_val, y_train, y_val = self.split_data(df)

        train_encodings = self.preprocess_and_tokenize(X_train)
        val_encodings = self.preprocess_and_tokenize(X_val)

        train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(y_train.values))
        val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], torch.tensor(y_val.values))

        train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=8)

        return train_loader, val_loader



# MeCab 토크나이저 초기화
mecab = MeCab.Tagger()

# MeCab 토크나이저 함수 정의
def mecab_tokenize(text):
    tokens = mecab.parse(text).split()
    return tokens

# 토크나이저 함수를 ElectraTokenizer에 적용하여 토크나이저 초기화

# 모델 초기화
model = ElectraForSequenceClassification.from_pretrained("klue/roberta-base")
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base", tokenizer_func=mecab_tokenize)

# 모델에 토크나이저 설정
model.tokenizer = tokenizer

# 사전 훈련된 모델 로드
# classifier의 out_proj 레이어의 출력 차원을 4(레이블 수)로 조정

model.classifier.out_proj = nn.Linear(in_features=model.classifier.dense.out_features, out_features=4)

# 모델의 num_labels 속성 업데이트
model.num_labels = 4


# 사전 훈련된 모델 로드

label_dict = {'정치': 0, '사회': 1, '경제': 2, '국제': 3}


# CSV 파일 로드
df = pd.read_csv("/content/data_processed.csv", encoding="utf-8")
# NaN 값이나 None을 포함하는 행 제거
df.dropna(subset=['text', 'label'], inplace=True)



device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
preprocessor = DataPreprocessor(tokenizer,label_dict)
train_loader, val_loader = preprocessor.create_data_loader(df)
trainer = ModelTrainer(model, device)
trainer.run_training_loop(train_loader, val_loader,epochs=1)


# 모델 및 토크나이저 저장 경로
model_path = "roberta_custom_model.pt"
checkpoint_path = "roberta_checkpoint.pt"

# 모델 및 옵티마이저 상태 저장
torch.save(model.state_dict(), model_path)
torch.save(optimizer.state_dict(), checkpoint_path)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113550333333125, max=1.0…

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

You are using a model of type roberta to instantiate a model of type electra. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'embeddings_project.bias', 'embeddings_project.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.

tokenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/752k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]



Epoch 1/1


 40%|███▉      | 76/192 [1:10:55<1:46:37, 55.15s/it]

In [None]:
import torch
from transformers import AdamW
from transformers import ElectraTokenizer, ElectraForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import MeCab

# MeCab 토크나이저 초기화
mecab = MeCab.Tagger()

# MeCab 토크나이저 함수 정의
def mecab_tokenize(text):
    tokens = mecab.parse(text).split()
    return tokens

# 모델 초기화
model = ElectraForSequenceClassification.from_pretrained("klue/roberta-base")
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base", tokenizer_func=mecab_tokenize)

# 모델에 토크나이저 설정
model.tokenizer = tokenizer

# classifier의 out_proj 레이어의 출력 차원을 4(레이블 수)로 조정
model.classifier.out_proj = torch.nn.Linear(in_features=model.classifier.dense.out_features, out_features=4)

# 모델의 num_labels 속성 업데이트
model.num_labels = 4

# 데이터 전처리 클래스 정의
class DataPreprocessor:
    def __init__(self, tokenizer,label_dict:dict = {'정치': 0, '사회': 1, '경제': 2, '국제': 3}):
        self.tokenizer = tokenizer
        self.label_dict = label_dict

    def encode_labels(self, df):
        df['label'] = df['label'].map(self.label_dict)
        return df

    def split_data(self, df):
        return train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

    def preprocess_and_tokenize(self, texts):
        texts_filtered = texts.apply(remove_single_characters)  # 이전에 정의한 함수 사용
        encodings = self.tokenizer(texts_filtered.tolist(), truncation=True, padding=True, max_length=512, return_tensors='pt')
        return encodings

    def create_data_loader(self, df):
        df = self.encode_labels(df)
        X_train, X_val, y_train, y_val = self.split_data(df)

        train_encodings = self.preprocess_and_tokenize(X_train)
        val_encodings = self.preprocess_and_tokenize(X_val)

        train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(y_train.values))
        val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], torch.tensor(y_val.values))

        train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=8)

        return train_loader, val_loader

# 모델 학습 클래스 정의
class ModelTrainer:
    def __init__(self, model, device):
        self.model = model
        self.device = device

    def train(self, train_loader, optimizer):
        self.model.train()
        total_loss = 0
        for batch in tqdm(train_loader):
            batch = tuple(t.to(self.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[2]
            }
            optimizer.zero_grad()
            outputs = self.model(**inputs)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Average Training Loss: {avg_loss}")

    def evaluate(self, val_loader):
        self.model.eval()
        predictions, true_vals = [], []
        for batch in val_loader:
            batch = tuple(t.to(self.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1]
            }
            with torch.no_grad():
                outputs = self.model(**inputs)

            logits = outputs.logits
            logits = logits.detach().cpu().numpy()
            label_ids = batch[2].cpu().numpy()
            predictions.append(np.argmax(logits, axis=1).flatten())
            true_vals.append(label_ids.flatten())

        predictions = np.concatenate(predictions)
        true_vals = np.concatenate(true_vals)
        return accuracy_score(true_vals, predictions)

    def run_training_loop(self, train_loader, val_loader, epochs=7):
        optimizer = AdamW(self.model.parameters(), lr=5e-5)

        for epoch in range(epochs):
            print(f"Epoch {epoch+1}/{epochs}")
            self.train(train_loader, optimizer)
            val_acc = self.evaluate(val_loader)
            print(f"Validation Accuracy: {val_acc}")

# 데이터 전처리 및 학습 준비
label_dict = {'정치': 0, '사회': 1, '경제': 2, '국제': 3}
df = pd.read_csv("/content/concat.csv", encoding="utf-8")
df.dropna(subset=['text', 'label'], inplace=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base", tokenizer_func=mecab_tokenize)
preprocessor = DataPreprocessor(tokenizer, label_dict)
train_loader, val_loader = preprocessor.create_data_loader(df)

# 모델 학습
model_trainer = ModelTrainer(model, device)
model_trainer.run_training_loop(train_loader, val_loader, epochs=5)

# 모델 및 토크나이저 저장
model_path = "roberta_custom_model.pt"
torch.save({
    'model_state_dict': model.state_dict(),
}, model_path)




In [None]:
# 모델 및 토크나이저 로드
checkpoint = torch.load(model_path)
model.load_state_dict(checkpoint['model_state_dict'])
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base", tokenizer_func=mecab_tokenize)
model.tokenizer = tokenizer

# 예측을 위한 함수 정의
def predict(text):
    inputs = tokenizer(text, truncation=True, padding=True, max_length=512, return_tensors='pt')
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class_idx = torch.argmax(logits, dim=1).item()
    for cls, idx in label_dict.items():
        if idx == predicted_class_idx:
            return cls

# 테스트
test_text = "예측할 텍스트를 입력하세요."
predicted_label = predict(test_text)
print("Predicted Label:", predicted_label)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, classification_report

# CSV 파일 로드
df = pd.read_csv("/content/data_processed.csv")

# KoRoberta 토크나이저 로드
tokenizer = RobertaTokenizer.from_pretrained("kykim/bert-kor-base")

# 레이블 매핑
label_map = {"정치": 0, "사회": 1, "국제": 2, "경제": 3}
df['label'] = df['label'].map(label_map)

# 훈련 데이터와 테스트 데이터로 분할
train_texts, test_texts, train_labels, test_labels = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

# KoRoberta 입력 형식에 맞게 데이터를 인코딩
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True)

class NewsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = NewsDataset(train_encodings, train_labels)
test_dataset = NewsDataset(test_encodings, test_labels)

# KoRoberta 분류 모델 설정
model = RobertaForSequenceClassification.from_pretrained("kykim/bert-kor-base", num_labels=len(label_map))

# 훈련 인자 설정
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    logging_dir='./logs',
)

# 트레이너 설정
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# 모델 훈련
trainer.train()

# 모델 저장
model.save_pretrained("news_model_koroberta")

# 평가
predictions = trainer.predict(test_dataset)
predicted_classes = np.argmax(predictions.predictions, axis=1)
accuracy = accuracy_score(test_labels, predicted_classes)
print("Accuracy:", accuracy)
print(classification_report(test_labels, predicted_classes, target_names=label_map.keys()))


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, classification_report
import wandb

# 로그인
wandb.login()

# wandb 초기화
wandb.init(project='news_classification')

# CSV 파일 로드
df = pd.read_csv("/content/data_processed.csv")

# KoRoberta 토크나이저 로드
tokenizer = RobertaTokenizer.from_pretrained("kykim/bert-kor-base")

# 레이블 매핑
label_map = {"정치": 0, "사회": 1, "국제": 2, "경제": 3}
df['label'] = df['label'].map(label_map)

# 훈련 데이터와 테스트 데이터로 분할
train_texts, test_texts, train_labels, test_labels = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

# KoRoberta 입력 형식에 맞게 데이터를 인코딩
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True)

class NewsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = NewsDataset(train_encodings, train_labels)
test_dataset = NewsDataset(test_encodings, test_labels)

# KoRoberta 분류 모델 설정
model = RobertaForSequenceClassification.from_pretrained("kykim/bert-kor-base", num_labels=len(label_map))

# 훈련 인자 설정
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    logging_dir='./logs',
)

# 트레이너 설정
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    callbacks=[wandb.log_metrics],
)

# 모델 훈련
trainer.train()

# 모델 저장
model.save_pretrained("news_model_koroberta")

# 평가
predictions = trainer.predict(test_dataset)
predicted_classes = np.argmax(predictions.predictions, axis=1)
accuracy = accuracy_score(test_labels, predicted_classes)
wandb.log({"accuracy": accuracy})
wandb.log({"classification_report": classification_report(test_labels, predicted_classes, target_names=label_map.keys())})
