# 라이브러리 다운

In [8]:
from transformers import RobertaTokenizer, RobertaModel
from transformers import RobertaForSequenceClassification
import pandas as pd
from datasets import Dataset
import torch
from transformers import DataCollatorWithPadding, Trainer

In [9]:
class MyDataset(Dataset):
    def __init__(self, df, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.text = df['Title'].values
        self.labels = df['Genre'].values
        self.max_length = max_length
        self.label_encoder = LabelEncoder()
        self.labels = self.label_encoder.fit_transform(self.labels)

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        text = self.text[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }



In [3]:
df = pd.read_csv('vector_data.csv',encoding = "ISO-8859-1")
data = df[['Title', 'Genre']]

data.to_csv('vetor_data_1.csv',index = False, encoding = "ISO-8859-1")

In [None]:
import matplotlib.pyplot as plt

plt.hist(data['Genre'], bins = 100)

In [None]:
num = len(data)
train_df = data.iloc[:int(num*0.75), :]
eval_df = data.iloc[int(num*0.75):, :]

train_dataset = MyDataset(
    train_df,
    tokenizer,
    max_length=128
)

eval_dataset = MyDataset(
    eval_df,
    tokenizer,
    max_length=128
)



In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=4,   # batch size per device during training
    per_device_eval_batch_size=4,   # batch size for evaluation
    warmup_steps=1000,               # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=200,               # How often to print logs
    do_train=True,                   # Perform training
    do_eval=True,                    # Perform evaluation
    evaluation_strategy="epoch",     # evalute after eachh epoch
    run_name="ProBert-BFD-MS",       # experiment name
    seed=3      )

In [None]:
# from sklearn.metrics import accuracy_score

# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     return {'accuracy': accuracy_score(labels, predictions)}

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset,
#     compute_metrics=compute_metrics,
# )


In [None]:
# trainer.train()

In [None]:
from torch.utils.data import DataLoader

class MyTrainer(Trainer):
    def get_train_dataloader(self):
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")
        data_loader = DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            drop_last=self.args.dataloader_drop_last,
            num_workers=self.args.dataloader_num_workers,
        )
        return data_loader

    def get_eval_dataloader(self, eval_dataset=None):
        if eval_dataset is None and self.eval_dataset is None:
            raise ValueError("Trainer: evaluation requires an eval_dataset.")
        eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
        data_loader = DataLoader(
            eval_dataset,
            batch_size=self.args.eval_batch_size,
            drop_last=self.args.dataloader_drop_last,
            num_workers=self.args.dataloader_num_workers,
        )
        return data_loader


In [None]:
trainer = MyTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()




ValueError: ignored

In [None]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,Title,Genre
0,0,Doug the Pug 2016 Wall Calendar,3
1,1,"Moleskine 2016 Weekly Notebook, 12M, Large, Bl...",3
2,2,365 Cats Color Page-A-Day Calendar 2016,3
3,3,Sierra Club Engagement Calendar 2016,3
4,4,Sierra Club Wilderness Calendar 2016,3


In [None]:
!pip install datasets
!pip install transformers

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset
from transformers import AutoTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer

# Read the dataset
df = pd.read_csv('/content/vetor_data_1.csv', encoding="ISO-8859-1")

# Genre 데이터의 범주를 정수 인덱스로 변환
label_encoder = LabelEncoder()
df['Genre'] = label_encoder.fit_transform(df['Genre'])

# Convert the DataFrame into a Hugging Face Dataset
data = Dataset.from_pandas(df)

# Tokenizer initialization
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

# Tokenize function
def tokenize_function(examples):
    return tokenizer(examples["Title"], padding="max_length", truncation=True)

# Split dataset into train and test
raw_datasets = data.train_test_split(test_size=0.25)

# Map the tokenization function to the title text (it will also remove the column 'Title' and replace it with the new ones)
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True, remove_columns=["Title"])

# Model initialization
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=32)

# Training arguments
training_args = TrainingArguments(
    "test_trainer",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

# Compute metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred.predictions, eval_pred.label_ids
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='macro')
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Override the compute_loss method of the Trainer
def compute_loss(model, inputs):
    labels = inputs.pop("labels")
    outputs = model(**inputs)
    logits = outputs.logits
    loss_fn = torch.nn.CrossEntropyLoss()
    loss = loss_fn(logits, labels)
    return loss

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,

)
trainer.compute_loss = compute_loss
trainer.train()


Map:   0%|          | 0/155678 [00:00<?, ? examples/s]

Map:   0%|          | 0/51893 [00:00<?, ? examples/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should pr

KeyError: ignored

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
import torch

# csv 파일 읽기
df = pd.read_csv('vetor_data_1.csv', encoding="ISO-8859-1")

# Title을 tokenizing
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
df['Title'] = df['Title'].apply(lambda x: tokenizer.encode(x, truncation=True, padding='max_length', max_length=512))


# 32개의 Genre를 binary encoding
mlb = MultiLabelBinarizer()
df['Genre'] = df['Genre'].apply(lambda x: mlb.fit_transform([x]))


# 데이터셋을 train과 eval로 나누기
train_df, eval_df = train_test_split(df, test_size=0.2)

# PyTorch의 Dataset을 만들기 위한 클래스 정의
class Dataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.encodings = df['Title'].to_list()
        self.labels = df['Genre'].to_list()

    def __len__(self):
        return len(self.encodings)

    def __getitem__(self, idx):
        item = {'input_ids': self.encodings[idx], 'labels': self.labels[idx]}
        return item


# Dataset 만들기
train_dataset = Dataset(train_df)
eval_dataset = Dataset(eval_df)

# 모델 초기화
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=32)

# TrainingArguments와 Trainer 초기화
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

# 학습 시작
trainer.train()

# 임의의 데이터에 대한 예측
def predict_genre(title):
    inputs = tokenizer(title, return_tensors='pt')
    logits = model(**inputs).logits
    result = torch.sigmoid(logits)
    return mlb.inverse_transform(result.detach().numpy())


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

ValueError: all input arrays must have the same shape

# retry

In [1]:
import pandas as pd

In [3]:
df = pd.read_csv('vetor_data_1.csv', encoding="ISO-8859-1")

In [4]:
from sklearn.datasets import load_iris # 샘플 데이터 로딩
from sklearn.model_selection import train_test_split

# load sample

data = df['Title']
target = df['Genre']

# train_test_split
x_train_1, x_valid_1, y_train_1, y_valid_1 = train_test_split(data, target, test_size=0.1, shuffle=True, stratify=target, random_state=34)

# BERT

In [5]:
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import torch
import numpy as np

class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# 정확도를 계산하는 함수
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {'accuracy': accuracy_score(labels, predictions)}

# 데이터 로드 및 토큰화
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# 만약 x_train과 x_valid가 pandas Series 객체라면, 이를 list로 변환
x_train = x_train_1.tolist()
x_valid = x_valid_1.tolist()

# y_train과 y_valid가 텍스트 라벨이라면, 이를 정수형 라벨로 변환
le = LabelEncoder()
y_train = le.fit_transform(y_train_1)
y_valid = le.transform(y_valid_1)

import pickle

# LabelEncoder 객체를 저장합니다.
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)

# 나중에 LabelEncoder 객체를 로드합니다.
with open('label_encoder.pkl', 'rb') as f:
    loaded_le = pickle.load(f)

train_encodings = tokenizer(x_train, truncation=True, padding=True)
valid_encodings = tokenizer(x_valid, truncation=True, padding=True)

# 데이터셋 생성
train_dataset = CustomDataset(train_encodings, y_train)
valid_dataset = CustomDataset(valid_encodings, y_valid)

# 모델 생성
num_labels = len(set(y_train))  # 클래스 수를 정의합니다.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# 학습 및 검증 인자 정의
training_args = TrainingArguments(
    output_dir='./results',          # 출력 결과를 저장할 경로
    num_train_epochs=3,              # 훈련 에포크 수
    save_total_limit = 3,
    per_device_train_batch_size=32,  # 훈련 배치 크기
    per_device_eval_batch_size=32,   # 검증 배치 크기
    warmup_steps=500,                # 워밍업 스텝 수
    weight_decay=0.01,         
    learning_rate = 3e-05,# 가중치 감쇠 비율
    logging_dir='./logs',            # 로그를 저장할 경로
    logging_steps=500,                # 로깅할 스텝 크기
    evaluation_strategy='steps',     # 스텝마다 evaluation을 수행합니다.
    eval_steps=500,                  # 500스텝마다 evaluation을 수행합니다.
)

# Trainer 생성 및 학습
trainer = Trainer(
    model=model,                         # 학습시킬 모델
    args=training_args,                  # 학습 인자
    train_dataset=train_dataset,         # 훈련 데이터셋
    eval_dataset=valid_dataset,          # 검증 데이터셋
    compute_metrics=compute_metrics      # 평가 메트릭 계산 함수
)

# 모델 학습
trainer.train()


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33myeppi315[0m ([33mgyubin5009[0m). Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss,Validation Loss,Accuracy
500,2.7342,1.860826,0.515416
1000,1.6612,1.507053,0.590712
1500,1.4473,1.377345,0.624964
2000,1.3891,1.305195,0.640042
2500,1.335,1.252421,0.653435
3000,1.255,1.233388,0.653531
3500,1.2549,1.21849,0.656952
4000,1.2273,1.183047,0.668513
4500,1.2137,1.165758,0.67415
5000,1.1886,1.160745,0.676414


TrainOutput(global_step=17514, training_loss=1.0179641962568686, metrics={'train_runtime': 5502.0017, 'train_samples_per_second': 101.861, 'train_steps_per_second': 3.183, 'total_flos': 3.7450515692304e+16, 'train_loss': 1.0179641962568686, 'epoch': 3.0})

In [6]:
# 모델 저장
trainer.save_model("./model/save_last")


In [18]:
with open('label_encoder_large.pkl', 'rb') as f:
    le = pickle.load(f)

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('./last_model', num_labels=32)


def predict_genre(text, model, tokenizer, label_encoder, k=3):
    # 디바이스 설정
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    
    # 모델을 올바른 디바이스로 이동
    model.to(device)

    # 텍스트를 토큰화합니다
    inputs = tokenizer([text], return_tensors='pt', padding=True, truncation=True).to(device)

    # 예측을 수행합니다
    outputs = model(**inputs)

    # 가장 높은 확률을 가진 클래스를 찾습니다
    topk_values, topk_indices = torch.topk(outputs.logits, k, dim=-1)

    # 클래스를 원래의 텍스트 라벨로 변환합니다
    predicted_labels = label_encoder.inverse_transform(topk_indices.cpu().numpy()[0])
    print(predicted_labels)
    return predicted_labels  # 결과가 리스트인데 첫 번째 요소만 반환합니다.

# 예제 텍스트
text = "He died yesterday"

# 텍스트의 장르 예측
predicted_genres = predict_genre(text, model, tokenizer, le, k=1)  # 'le'를 사용하셔야 합니다.
print(predicted_genres)


['Literature & Fiction']
['Literature & Fiction']


In [7]:
!sudo apt-get install git-lfs

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/bin/bash: sudo: command not found


In [14]:
!git lfs install
!git clone https://huggingface.co/gyubinc/bert-book32-gyubin

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
git: 'lfs' is not a git command. See 'git --help'.

The most similar command is
	log
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Cloning into 'bert-book32-gyubin'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (3/3), done.


In [15]:
!git commit -m "new_reposi"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
On branch main
Your branch is up to date with 'origin/main'.

Changes not staged for commit:
	[31mmodified:   classi.ipynb[m

Untracked files:
	[31m__pycache__/[m
	[31mbert-book32-gyubin/[m
	[31mbert.py[m
	[31mroberta.py[m
	[31mvetor_data_1.csv[m
	[31mwandb/[m
	[31m../mode.py[m
	[31m../summary/__pycache__/[m

no changes added to commit


In [6]:
import pickle
from transformers import BertTokenizerFast, BertForSequenceClassification

with open('label_encoder.pkl', 'rb') as f:
    le = pickle.load(f)

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('last_model', num_labels=32)

In [8]:
from huggingface_hub import login
access_token_write = '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'
login(token = access_token_write)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /opt/ml/.cache/huggingface/token
Login successful


In [12]:
# repo
from huggingface_hub import login
access_token_write = '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'
login(token = access_token_write)

MODEL_SAVE_REPO = 'gyubinc/bert-book32-gyubin'
HUGGINGFACE_AUTH_TOKEN = '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' 

# Push to huggingface-hub
model.push_to_hub(
MODEL_SAVE_REPO,
use_temp_dir=False,
use_auth_token=HUGGINGFACE_AUTH_TOKEN
)

tokenizer.push_to_hub(
MODEL_SAVE_REPO,
use_temp_dir=False,
use_auth_token=HUGGINGFACE_AUTH_TOKEN
)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /opt/ml/.cache/huggingface/token
Login successful


FileNotFoundError: [Errno 2] No such file or directory: 'bert-book32-gyubin'

# ROBERTA

In [1]:
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# 정확도를 계산하는 함수
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {'accuracy': accuracy_score(labels, predictions)}

# 데이터 로드 및 토큰화
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

# 만약 x_train과 x_valid가 pandas Series 객체라면, 이를 list로 변환
x_train = x_train_1.tolist()
x_valid = x_valid_1.tolist()

print(y_train_1)

# y_train과 y_valid가 텍스트 라벨이라면, 이를 정수형 라벨로 변환
le = LabelEncoder()
y_train = le.fit_transform(y_train_1)
y_valid = le.transform(y_valid_1)

print(y_train)

import pickle

# LabelEncoder 객체를 저장합니다.
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)

# 나중에 LabelEncoder 객체를 로드합니다.
with open('label_encoder.pkl', 'rb') as f:
    loaded_le = pickle.load(f)


train_encodings = tokenizer(x_train, truncation=True, padding=True)
valid_encodings = tokenizer(x_valid, truncation=True, padding=True)

# 데이터셋 생성
train_dataset = CustomDataset(train_encodings, y_train)
valid_dataset = CustomDataset(valid_encodings, y_valid)

# 모델 생성
num_labels = len(set(y_train))  # 클래스 수를 정의합니다.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)

# 학습 및 검증 인자 정의
training_args = TrainingArguments(
    output_dir='./results',          # 출력 결과를 저장할 경로
    num_train_epochs=3,       
    save_total_limit = 3,# 훈련 에포크 수
    per_device_train_batch_size=64,  # 훈련 배치 크기
    per_device_eval_batch_size=64,
    learning_rate = 5e-05,# 검증 배치 크기
    warmup_steps=500,                # 워밍업 스텝 수
    weight_decay=0.01,               # 가중치 감쇠 비율
    logging_dir='./logs',            # 로그를 저장할 경로
    logging_steps=10,                # 로깅할 스텝 크기
    evaluation_strategy='steps',     # 스텝마다 evaluation을 수행합니다.
    eval_steps=500,                  # 500스텝마다 evaluation을 수행합니다.
)

# Trainer 생성 및 학습
trainer = Trainer(
    model=model,                         # 학습시킬 모델
    args=training_args,                  # 학습 인자
    train_dataset=train_dataset,         # 훈련 데이터셋
    eval_dataset=valid_dataset,          # 검증 데이터셋
    compute_metrics=compute_metrics      # 평가 메트릭 계산 함수
)

# 모델 학습
trainer.train()


  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'x_train_1' is not defined

# Roberta-large

In [11]:
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import torch
import numpy as np

class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# 정확도를 계산하는 함수
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {'accuracy': accuracy_score(labels, predictions)}

# 데이터 로드 및 토큰화
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-large')

# 만약 x_train과 x_valid가 pandas Series 객체라면, 이를 list로 변환
x_train = x_train_1.tolist()
x_valid = x_valid_1.tolist()

print(y_train_1)

# y_train과 y_valid가 텍스트 라벨이라면, 이를 정수형 라벨로 변환
le = LabelEncoder()
y_train = le.fit_transform(y_train_1)
y_valid = le.transform(y_valid_1)

print(y_train)

import pickle

# LabelEncoder 객체를 저장합니다.
with open('label_encoder_large.pkl', 'wb') as f:
    pickle.dump(le, f)

# 나중에 LabelEncoder 객체를 로드합니다.
with open('label_encoder_large.pkl', 'rb') as f:
    loaded_le = pickle.load(f)


train_encodings = tokenizer(x_train, truncation=True, padding=True)
valid_encodings = tokenizer(x_valid, truncation=True, padding=True)

# 데이터셋 생성
train_dataset = CustomDataset(train_encodings, y_train)
valid_dataset = CustomDataset(valid_encodings, y_valid)

# 모델 생성
num_labels = len(set(y_train))  # 클래스 수를 정의합니다.
model = RobertaForSequenceClassification.from_pretrained('roberta-large', num_labels=num_labels)

# 학습 및 검증 인자 정의
training_args = TrainingArguments(
    output_dir='./results/roberta-large',          # 출력 결과를 저장할 경로
    save_strategy = "epoch",
    save_total_limit = 3,
    num_train_epochs=3,              # 훈련 에포크 수
    per_device_train_batch_size=8,  # 훈련 배치 크기
    per_device_eval_batch_size=8,
    learning_rate = 1e-05,# 검증 배치 크기
    warmup_steps=500,                # 워밍업 스텝 수
    weight_decay=0.01,               # 가중치 감쇠 비율
    logging_dir='./logs',            # 로그를 저장할 경로
    logging_steps=1000,                # 로깅할 스텝 크기
    evaluation_strategy='steps',     # 스텝마다 evaluation을 수행합니다.
    eval_steps=1000,                  # 500스텝마다 evaluation을 수행합니다.
)

# Trainer 생성 및 학습
trainer = Trainer(
    model=model,                         # 학습시킬 모델
    args=training_args,                  # 학습 인자
    train_dataset=train_dataset,         # 훈련 데이터셋
    eval_dataset=valid_dataset,          # 검증 데이터셋
    compute_metrics=compute_metrics      # 평가 메트릭 계산 함수
)

# 모델 학습
trainer.train()


182399           Children's Books
170154         Teen & Young Adult
4138      Comics & Graphic Novels
47472      Crafts, Hobbies & Home
185004           Children's Books
                   ...           
183203           Children's Books
46799      Crafts, Hobbies & Home
203282                     Travel
157723    Religion & Spirituality
48450      Crafts, Hobbies & Home
Name: Genre, Length: 186813, dtype: object
[ 4 29  6 ... 31 23  9]


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'clas

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33myeppi315[0m ([33mgyubin5009[0m). Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss,Validation Loss,Accuracy
1000,2.5098,1.670472,0.553377
2000,1.6503,1.504154,0.594325
3000,1.5215,1.390908,0.621929
4000,1.4531,1.362344,0.624819
5000,1.3981,1.362916,0.626072
6000,1.3642,1.307148,0.637971
7000,1.3789,1.282162,0.641969
8000,1.332,1.270696,0.649822
9000,1.3204,1.250407,0.653917
10000,1.31,1.235423,0.655554


KeyboardInterrupt: 

In [1]:
def predict_genre(text, model, tokenizer, label_encoder):
    # 텍스트를 토큰화합니다
    inputs = tokenizer([text], return_tensors='pt', padding=True, truncation=True)

    # GPU를 사용할 수 있으면 GPU로 이동합니다
    if torch.cuda.is_available():
        inputs = {name: tensor.to('cuda') for name, tensor in inputs.items()}

    # 예측을 수행합니다
    outputs = model(**inputs)

    # 가장 높은 확률을 가진 클래스를 찾습니다
    predicted_class = torch.argmax(outputs.logits, dim=-1)

    # 클래스를 원래의 텍스트 라벨로 변환합니다
    predicted_label = label_encoder.inverse_transform(predicted_class.cpu().numpy())
    print(predicted_label)
    return predicted_label[0]  # 결과가 리스트인데 첫 번째 요소만 반환합니다.

# 예제 텍스트
text = "This is a new text."

# 텍스트의 장르 예측
predicted_genre = predict_genre(text, model, tokenizer, le)  # 'le'를 사용하셔야 합니다.
print(predicted_genre)


NameError: name 'model' is not defined

In [45]:
def predict_genre(text, model, tokenizer, label_encoder):
    # 텍스트를 토큰화합니다
    inputs = tokenizer([text], return_tensors='pt', padding=True, truncation=True)

    # GPU를 사용할 수 있으면 GPU로 이동합니다
    if torch.cuda.is_available():
        inputs = {name: tensor.to('cuda') for name, tensor in inputs.items()}

    # 예측을 수행합니다
    outputs = model(**inputs)

    # 가장 높은 확률을 가진 클래스를 찾습니다
    predicted_class = torch.argmax(outputs.logits, dim=-1)

    # 클래스를 원래의 텍스트 라벨로 변환합니다
    predicted_label = label_encoder.inverse_transform(predicted_class.cpu().numpy())

    # 결과가 숫자로 나오는 경우에 대한 디버깅
    if isinstance(predicted_label[0], (int, np.integer)):
        print(f"Warning: Expected string label but got integer label. Please check the fit of the label encoder. First few classes: {le.classes_[:5]}")
        
    return predicted_label[0]  # 결과가 리스트인데 첫 번째 요소만 반환합니다.

# 예제 텍스트
text = "This is a new text."

# 텍스트의 장르 예측
predicted_genre = predict_genre(text, model, tokenizer, le)  # 'le'를 사용하셔야 합니다.
print(predicted_genre)


0


# distilbert

In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import torch
import numpy as np

class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# 정확도를 계산하는 함수
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {'accuracy': accuracy_score(labels, predictions)}

# 데이터 로드 및 토큰화
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# 만약 x_train과 x_valid가 pandas Series 객체라면, 이를 list로 변환
x_train = x_train_1.tolist()
x_valid = x_valid_1.tolist()

print(y_train_1)

# y_train과 y_valid가 텍스트 라벨이라면, 이를 정수형 라벨로 변환
le = LabelEncoder()
y_train = le.fit_transform(y_train_1)
y_valid = le.transform(y_valid_1)

print(y_train)

import pickle

# LabelEncoder 객체를 저장합니다.
with open('label_encoder_distil_2.pkl', 'wb') as f:
    pickle.dump(le, f)

# 나중에 LabelEncoder 객체를 로드합니다.
with open('label_encoder_distil_2.pkl', 'rb') as f:
    loaded_le = pickle.load(f)


train_encodings = tokenizer(x_train, truncation=True, padding=True)
valid_encodings = tokenizer(x_valid, truncation=True, padding=True)

# 데이터셋 생성
train_dataset = CustomDataset(train_encodings, y_train)
valid_dataset = CustomDataset(valid_encodings, y_valid)

# 모델 생성
num_labels = len(set(y_train))  # 클래스 수를 정의합니다.

print(len(set(y_train)))
print(len(set(y_valid)))

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=32)


# 학습 및 검증 인자 정의
training_args = TrainingArguments(
    output_dir='./results/distil/2',          # 출력 결과를 저장할 경로
    save_strategy = "epoch",
    save_total_limit = 3,
    num_train_epochs=6,              # 훈련 에포크 수
    per_device_train_batch_size=16,  # 훈련 배치 크기
    per_device_eval_batch_size=16,
    learning_rate = 1e-05,# 검증 배치 크기
    warmup_steps=500,                # 워밍업 스텝 수
    weight_decay=0.01,               # 가중치 감쇠 비율
    logging_dir='./logs',            # 로그를 저장할 경로
    logging_steps=1000,                # 로깅할 스텝 크기
    evaluation_strategy='steps',     # 스텝마다 evaluation을 수행합니다.
    eval_steps=1000,                  # 500스텝마다 evaluation을 수행합니다.
)

# Trainer 생성 및 학습
trainer = Trainer(
    model=model,                         # 학습시킬 모델
    args=training_args,                  # 학습 인자
    train_dataset=train_dataset,         # 훈련 데이터셋
    eval_dataset=valid_dataset,          # 검증 데이터셋
    compute_metrics=compute_metrics      # 평가 메트릭 계산 함수
)

# 모델 학습
trainer.train()


In [5]:
# 모델 저장
trainer.save_model("./model/distil_0.694")


In [10]:
with open('label_encoder_large.pkl', 'rb') as f:
    le = pickle.load(f)

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('./results/roberta-large/checkpoint-70056', num_labels=32)

In [24]:
def predict_genre(text, model, tokenizer, label_encoder, k=3):
    # 디바이스 설정
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    
    # 모델을 올바른 디바이스로 이동
    model.to(device)

    # 텍스트를 토큰화합니다
    inputs = tokenizer([text], return_tensors='pt', padding=True, truncation=True).to(device)

    # 예측을 수행합니다
    outputs = model(**inputs)

    # 가장 높은 확률을 가진 클래스를 찾습니다
    topk_values, topk_indices = torch.topk(outputs.logits, k, dim=-1)

    # 클래스를 원래의 텍스트 라벨로 변환합니다
    predicted_labels = label_encoder.inverse_transform(topk_indices.cpu().numpy()[0])
    print(predicted_labels)
    return predicted_labels  # 결과가 리스트인데 첫 번째 요소만 반환합니다.

# 예제 텍스트
text = "He died yesterday"

# 텍스트의 장르 예측
predicted_genres = predict_genre(text, model, tokenizer, le, k=3)  # 'le'를 사용하셔야 합니다.
print(predicted_genres)


['Biographies & Memoirs' 'Literature & Fiction' 'Teen & Young Adult']
['Biographies & Memoirs' 'Literature & Fiction' 'Teen & Young Adult']


In [38]:
# 예제 텍스트
texts = ["This is a new text.", "Here's another one."]

# 텍스트의 장르 예측
predicted_genres = predict_genre(texts, model, tokenizer)
print(predicted_genres)

[15 15]


In [57]:
encoded_dict = {}
for index, genre in enumerate(list(df['Genre'].unique())):
    encoded_dict[genre] = int(index)



In [58]:
df_train = pd.concat([x_train, y_train], axis = 1)
df_test = pd.concat([x_train, y_train], axis = 1 )

In [59]:
df_train

Unnamed: 0,Title,Genre
130368,Calvin Coolidge,Biographies & Memoirs
146910,The Good Shepherd: A Thousand-Year Journey fro...,Christian Books & Bibles
173744,Daoism (World Religions (Facts on File)),Teen & Young Adult
188047,QB 1,Children's Books
93154,"Storm Surge: Hurricane Sandy, Our Changing Cli...",Science & Math
...,...,...
99805,The Husband's Field Guide: Navigating Your Wif...,"Health, Fitness & Dieting"
186882,Tickle Monster,Children's Books
62700,How to Find Scholarships and Free Financial Ai...,Education & Teaching
67598,The Secrets of Making Love Happen: How to Find...,Self-Help


In [12]:
print(encoded_dict)

{'Calendars': 0, 'Comics & Graphic Novels': 1, 'Test Preparation': 2, 'Mystery, Thriller & Suspense': 3, 'Science Fiction & Fantasy': 4, 'Romance': 5, 'Humor & Entertainment': 6, 'Literature & Fiction': 7, 'Gay & Lesbian': 8, 'Engineering & Transportation': 9, 'Cookbooks, Food & Wine': 10, 'Crafts, Hobbies & Home': 11, 'Arts & Photography': 12, 'Education & Teaching': 13, 'Parenting & Relationships': 14, 'Self-Help': 15, 'Computers & Technology': 16, 'Medical Books': 17, 'Science & Math': 18, 'Health, Fitness & Dieting': 19, 'Business & Money': 20, 'Law': 21, 'Biographies & Memoirs': 22, 'History': 23, 'Politics & Social Sciences': 24, 'Reference': 25, 'Christian Books & Bibles': 26, 'Religion & Spirituality': 27, 'Sports & Outdoors': 28, 'Teen & Young Adult': 29, "Children's Books": 30, 'Travel': 31}


In [13]:
df_train['Genre'] = df_train.Genre.map(encoded_dict)
df_test['Genre'] = df_test.Genre.map(encoded_dict)

In [14]:
from tensorflow.keras.utils import to_categorical

y_train = to_categorical(df_train.Genre)
y_test = to_categorical(df_test.Genre)

In [35]:
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertModel.from_pretrained("bert-base-cased")

Downloading pytorch_model.bin: 100%|██████████| 436M/436M [00:04<00:00, 103MB/s]  
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [33]:
from transformers import BertTokenizer, TFAutoModelForSequenceClassification
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
x_train = tokenizer(
    text=df_train.Title.tolist(),
    add_special_tokens=True,
    max_length=70,
    truncation=True,
    padding=True, 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)
x_test = tokenizer(
    text=df_test.Title.tolist(),
    add_special_tokens=True,
    max_length=70,
    truncation=True,
    padding=True, 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

In [43]:
x_train = tokenizer(text=df_train.Title.tolist(), return_tensors='pt' ,truncation=True,
    padding=True, )
x_test = tokenizer(text=df_test.Title.tolist(), return_tensors='pt', truncation=True,
    padding=True, )

In [16]:
input_ids = x_train['input_ids']
attention_mask = x_train['attention_mask']

In [17]:
x_test['input_ids']

<tf.Tensor: shape=(166056, 70), dtype=int32, numpy=
array([[  101, 11110, 13297, ...,     0,     0,     0],
       [  101,  1109,  2750, ...,     0,     0,     0],
       [  101, 10136,  8586, ...,     0,     0,     0],
       ...,
       [  101,  1731,  1106, ...,     0,     0,     0],
       [  101,  1109, 19958, ...,     0,     0,     0],
       [  101,  6008,  9326, ...,     0,     0,     0]], dtype=int32)>

In [34]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense


max_len = 70
input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")
embeddings = model(input_ids,attention_mask = input_mask)[0]
# embeddings = bert(input_ids,attention_mask = input_mask)[0]
#out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
out = Dense(128, activation='relu')(embeddings)
#out = Dense(128, activation='relu')(out)
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(32,activation = 'relu')(out)
y = Dense(32,activation = 'sigmoid')(out)
model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
model.layers[2].trainable = True

In [35]:
embeddings

<KerasTensor: shape=(None, 2) dtype=float32 (created by layer 'tf_bert_for_sequence_classification_1')>

In [36]:
optimizer = Adam(
    learning_rate=5e-05, # this learning rate is for bert model , taken from huggingface website 
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)
# Set loss and metrics
loss =CategoricalCrossentropy(from_logits = True)
metric = CategoricalAccuracy('balanced_accuracy'),
# Compile the model
model.compile(
    optimizer = optimizer,
    loss = loss, 
    metrics = metric)

In [47]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('CUDA : 0')))


Num GPUs Available:  0


In [48]:
import tensorflow as tf

# 사용 가능한 GPU의 수를 확인
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# TensorFlow가 현재 GPU를 사용하고 있는지 확인
print("Is TensorFlow built with CUDA: ", tf.test.is_built_with_cuda())
print("Is GPU available: ", tf.test.is_gpu_available(cuda_only=False, min_cuda_compute_capability=None))


Num GPUs Available:  0
Is TensorFlow built with CUDA:  True
Is GPU available:  False


2023-07-18 15:01:52.129080: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-07-18 15:01:52.130226: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [50]:
train_history = model.fit(
    x ={'input_ids':x_train['input_ids'],'attention_mask':x_train['attention_mask']} ,
    y = y_train,
    validation_data = (
    {'input_ids':x_test['input_ids'],'attention_mask':x_test['attention_mask']}, y_test
    ),
  epochs=1,
    batch_size=36
)

 295/4613 [>.............................] - ETA: 5:14:59 - loss: 3.2835 - balanced_accuracy: 0.0845

KeyboardInterrupt: 

In [None]:
predicted_raw = model.predict({'input_ids':x_test['input_ids'],'attention_mask':x_test['attention_mask']})
predicted_raw[0]

In [5]:
from transformers import BertForSequenceClassification, BertTokenizer, AdamW
import torch
from torch.utils.data import DataLoader, Dataset
from pytorch_lightning import LightningModule, Trainer

class BertClassifier(LightningModule):
    def __init__(self, num_classes: int):
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_classes)
        self.criterion = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return outputs

    def training_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch
        outputs = self(input_ids, attention_mask, labels)
        loss = self.criterion(outputs.logits, labels)
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch
        outputs = self(input_ids, attention_mask, labels)
        loss = self.criterion(outputs.logits, labels)
        self.log("val_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=1e-5)

class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# 데이터 로드 및 BERT tokenizer 로드
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
data = x_train  # 여기에 당신의 데이터를 로드해주세요. 
labels = y_train  # 여기에 당신의 라벨을 로드해주세요.
encodings = tokenizer(data, truncation=True, padding=True)

# PyTorch dataset 및 dataloader 생성
dataset = TextDataset(encodings, labels)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)


val_data = x_valid  # 여기에 당신의 검증 데이터를 로드해주세요. 
val_labels = y_valid  # 여기에 당신의 검증 라벨을 로드해주세요.
val_encodings = tokenizer(val_data, truncation=True, padding=True)

# PyTorch dataset 및 dataloader 생성
val_dataset = TextDataset(val_encodings, val_labels)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# 모델 초기화 및 훈련
num_classes = 32  # 클래스의 수를 정의해주세요
model = BertClassifier(num_classes=num_classes)
trainer = Trainer(max_epochs=3, gpus=1)  # GPU를 사용하여 3 에포크동안 모델을 훈련
trainer.fit(model, dataloader, val_dataloader)


  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'pytorch_lightning'

In [4]:
import pytorch_lightning as pl

ModuleNotFoundError: No module named 'pytorch_lightning'

In [2]:
pip install --ignore-installed PyYAML

Collecting PyYAML
  Using cached PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (736 kB)
Installing collected packages: PyYAML
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tts 0.0.14.1 requires numpy==1.18.5, but you have numpy 1.24.4 which is incompatible.[0m[31m
[0mSuccessfully installed PyYAML-5.3.1
[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
pip install pytorch-lightning

Collecting pytorch-lightning
  Using cached pytorch_lightning-2.0.5-py3-none-any.whl (722 kB)
Collecting PyYAML>=5.4 (from pytorch-lightning)
  Using cached PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (736 kB)
Collecting lightning-utilities>=0.7.0 (from pytorch-lightning)
  Using cached lightning_utilities-0.9.0-py3-none-any.whl (23 kB)
Installing collected packages: PyYAML, lightning-utilities, pytorch-lightning
  Attempting uninstall: PyYAML
    Found existing installation: PyYAML 5.3.1
[31mERROR: Cannot uninstall 'PyYAML'. It is a distutils installed project and thus we cannot accurately determine which files belong to it which would lead to only a partial uninstall.[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


# movie genre

In [None]:
from transformers import RobertaTokenizer, RobertaModel
from transformers import RobertaForSequenceClassification
import pandas as pd
from datasets import Dataset
import torch
from transformers import DataCollatorWithPadding, Trainer

In [None]:
df = pd.read_csv('medium_post_titles.csv')

In [16]:
df.head(5)

Unnamed: 0,category,title,subtitle,subtitle_truncated_flag
0,work,"""21 Conversations"" - A fun (and easy) game for...",A (new?) Icebreaker game to get your team to s...,False
1,spirituality,"""Biblical Porn"" at Mars Hill",Author and UW lecturer Jessica Johnson talks a...,False
2,lgbtqia,"""CISGENDER?! Is That A Disease?!""","Or, a primer in gender vocabulary for the curi...",False
4,artificial-intelligence,"""Can I Train my Model on Your Computer?""",How we waste computational resources and how t...,False
5,cryptocurrency,"""Cypherpunks and Wall Street"": The Security To...",Bruce Fenton presents at the World Blockchain ...,False


In [None]:
df.drop(df[df.subtitle.isna()].index, inplace=True) #Rows with missing values are cleared
len(df)