In [None]:
!pip install transformers
!pip install tensorflow

In [None]:
import os
import json
import re
import copy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
pip install sentencepiece

In [None]:
from transformers import *

In [None]:
# Read the summarized news dataset into a DataFrame and display it.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where this exact file location exists.
news_csv_path = "C:/Users/82102/Desktop/Baf_News_Category_Classification/summarized2.csv"
all_data = pd.read_csv(news_csv_path)
all_data

In [None]:
# Remove the columns that the cells below do not use.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError because the
# columns are already gone.
all_data = all_data.drop(columns=['Unnamed: 0', 'text', 'length'], errors='ignore')
all_data

In [None]:
# Discard every row that has at least one missing value, then show the
# resulting (rows, columns) shape.
all_data = all_data.dropna(axis=0, how='any')
all_data.shape

In [None]:
# Length of every summary as returned by len(), stored in a new column.
summary_lengths = all_data['summarized_text'].apply(len)
all_data['text_length'] = summary_lengths

# Find and print the largest value in the text_length column.
max_length = all_data['text_length'].max()
print("가장 긴 문자열 길이:", max_length)

In [None]:
# Fix the NumPy and TensorFlow global seeds to the same value.
np.random.seed(111)
tf.random.set_seed(111)

# Run configuration shared by the cells below.
BATCH_SIZE, NUM_EPOCHS = 16, 8
VALID_SPLIT = 0.2  # passed to fit() as validation_split
MAX_LEN = 128      # passed to bert_tokenizer() as the encoded sequence length

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn an integer id for every distinct value of the 'target' column ...
label_encoder = LabelEncoder()
label_encoder.fit(all_data['target'])

# ... and store the encoded ids in a new 'label' column.
all_data['label'] = label_encoder.transform(all_data['target'])

In [None]:
from sklearn.model_selection import train_test_split

# Bucket the rows by their 'target' value, in order of first appearance.
class_data = {
    target_class: all_data[all_data['target'] == target_class]
    for target_class in all_data['target'].unique()
}

# Split each class on its own: 85% of its rows go to training and 15% to test
# (test_size=0.15, fixed random_state), so every class is represented on both sides.
train_list, test_list = [], []
for class_rows in class_data.values():
    train_part, test_part = train_test_split(class_rows, test_size=0.15, random_state=42)
    train_list.append(train_part)
    test_list.append(test_part)

# Stitch the per-class pieces back into one training frame and one test frame.
train_data = pd.concat(train_list)
test_data = pd.concat(test_list)

In [None]:
# Renumber both frames from 0, discarding the index inherited from all_data.
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

In [None]:
# Shuffle all training rows, then renumber them from 0.
# reset_index returns a new frame, so its result must be assigned back;
# calling it without assignment leaves the shuffled index untouched.
train_data = train_data.sample(frac=1).reset_index(drop=True)
train_data

In [None]:
# Shuffle all test rows, then renumber them from 0.
# reset_index returns a new frame, so its result must be assigned back;
# calling it without assignment leaves the shuffled index untouched.
test_data = test_data.sample(frac=1).reset_index(drop=True)
test_data

In [None]:
# Tokenizer of the multilingual cased BERT checkpoint; downloaded files are
# cached under 'bert_ckpt'. Lower-casing is disabled.
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-multilingual-cased",
    cache_dir='bert_ckpt',
    do_lower_case=False,
)

def bert_tokenizer(sentence, MAX_LEN):
    """Encode one sentence into fixed-length BERT inputs.

    Relies on the module-level ``tokenizer``.

    Args:
        sentence: Text to encode.
        MAX_LEN: Length of every returned sequence. Shorter inputs are padded
            up to this length and longer inputs are truncated to it.

    Returns:
        A tuple ``(input_id, attention_mask, token_type_id)`` taken from the
        ``input_ids``, ``attention_mask`` and ``token_type_ids`` entries of the
        tokenizer output.
    """
    encoded_dict = tokenizer.encode_plus(
        text=sentence,
        add_special_tokens=True,
        max_length=MAX_LEN,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        padding='max_length',
        # Request truncation explicitly instead of relying on the implicit
        # fallback the tokenizer applies (with a warning) when only
        # max_length is given.
        truncation=True,
        return_attention_mask=True,
    )

    input_id = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask']
    token_type_id = encoded_dict['token_type_ids']

    return input_id, attention_mask, token_type_id

In [None]:
# Encode every training sentence into BERT inputs and collect its label.
input_ids = []
attention_masks = []
token_type_ids = []
train_data_labels = []
skipped_train_rows = 0  # sentences the tokenizer could not encode

for train_sentence, train_label in tqdm(zip(train_data['summarized_text'], train_data['label']), total=len(train_data)):
    try:
        input_id, attention_mask, token_type_id = bert_tokenizer(train_sentence, MAX_LEN)
    except Exception as e:
        # Best effort: report the failure and leave this row (and its label) out.
        print(e)
        skipped_train_rows += 1
    else:
        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        train_data_labels.append(train_label)

if skipped_train_rows:
    print(f"Skipped {skipped_train_rows} training rows that could not be tokenized")

# Stack the per-sentence lists into arrays; the model input is the tuple
# (input ids, attention masks, token type ids).
train_news_input_ids = np.array(input_ids, dtype=int)
train_news_attention_masks = np.array(attention_masks, dtype=int)
train_news_token_type_ids = np.array(token_type_ids, dtype=int)
train_news_inputs = (train_news_input_ids, train_news_attention_masks, train_news_token_type_ids)
train_data_labels = np.asarray(train_data_labels, dtype=np.int32)

In [None]:
# Inspect one encoded training example. All three arrays are read at the same
# row so that the printed ids, mask and token type ids describe the same
# sentence (previously the ids came from row 19 and the other two from row 1).
sample_idx = 19
input_id = train_news_input_ids[sample_idx]
attention_mask = train_news_attention_masks[sample_idx]
token_type_id = train_news_token_type_ids[sample_idx]
print(input_id)
print(attention_mask)
print(token_type_id)
print(tokenizer.decode(input_id))

In [None]:
# Turn the token ids of training row 6 back into text and print it.
decoded_sample = tokenizer.decode(train_news_input_ids[6])
print(decoded_sample)

In [None]:
class TFBertClassifier(tf.keras.Model):
    """Pretrained BERT encoder followed by dropout and a dense layer that emits class logits."""

    def __init__(self, model_name, dir_path, num_class):
        """Build the encoder and the classification head.

        Args:
            model_name: Name of the pretrained checkpoint given to
                ``TFBertModel.from_pretrained``.
            dir_path: Cache directory given to ``from_pretrained``.
            num_class: Number of output units of the dense classifier layer.
        """
        super().__init__()

        self.bert = TFBertModel.from_pretrained(model_name, cache_dir=dir_path)
        # Fixed dropout rate of 0.2 on the pooled output. (An earlier note here
        # mentioned 0.5 and the BERT config's hidden dropout probability; the
        # value actually used is 0.2.)
        self.dropout = tf.keras.layers.Dropout(0.2)
        # Dense head initialised with the encoder's own initializer_range.
        self.classifier = tf.keras.layers.Dense(
            num_class,
            kernel_initializer=tf.keras.initializers.TruncatedNormal(self.bert.config.initializer_range),
            name='classifier',
        )

    def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):
        """Return unnormalised class scores (logits) for a batch.

        Args:
            inputs: First argument forwarded to the BERT encoder.
            attention_mask: Optional attention mask forwarded to the encoder.
            token_type_ids: Optional token type ids forwarded to the encoder.
            training: Whether the call happens in training mode; forwarded to
                both the encoder and the dropout layer.

        Returns:
            The output of the dense classifier layer (no softmax applied).
        """
        # Forward the training flag explicitly so the encoder and the head
        # always run in the same mode.
        outputs = self.bert(
            inputs,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            training=training,
        )
        # Second element of the encoder output, used here as the pooled representation.
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output, training=training)
        logits = self.classifier(pooled_output)

        return logits

# Size the output layer from the fitted label encoder instead of a hard-coded
# 8, so the classifier always has exactly one logit per encoded class.
cls_model = TFBertClassifier(model_name='bert-base-multilingual-cased',
                             dir_path='bert_ckpt',
                             num_class=len(label_encoder.classes_))

In [None]:
# Training setup: Adam with a learning rate of 2e-6, sparse categorical
# cross-entropy computed on logits, and sparse categorical accuracy reported
# under the name 'accuracy'.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-6)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')

cls_model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric],
)

In [None]:
# Fine-tune the classifier, holding out VALID_SPLIT of the training arrays for validation.
# NOTE(review): epochs is hard-coded to 20 here; the NUM_EPOCHS constant
# defined earlier (8) is not used — confirm which value is intended.
history = cls_model.fit(
    x=train_news_inputs,
    y=train_data_labels,
    batch_size=BATCH_SIZE,
    epochs=20,
    validation_split=VALID_SPLIT,
)

# Dump the per-epoch values recorded during training.
print(history.history)

In [None]:
# Training vs. validation loss per epoch.
# Each curve carries its own label, so the legend cannot drift out of sync with
# the plotted series; the stray empty format string has been removed.
plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['val_loss'], label='val_loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Encode every test sentence into BERT inputs and collect its label.
input_ids = []
attention_masks = []
token_type_ids = []
test_data_labels = []
skipped_test_rows = 0  # sentences the tokenizer could not encode

# total= gives tqdm the row count (zip has no length), matching the training loop.
for test_sentence, test_label in tqdm(zip(test_data['summarized_text'], test_data['label']), total=len(test_data)):
    try:
        input_id, attention_mask, token_type_id = bert_tokenizer(test_sentence, MAX_LEN)
    except Exception as e:
        # Best effort: report the failure and leave this row (and its label) out.
        print(e)
        skipped_test_rows += 1
    else:
        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        test_data_labels.append(test_label)

if skipped_test_rows:
    print(f"Skipped {skipped_test_rows} test rows that could not be tokenized")

# Stack the per-sentence lists into arrays; the model input is the tuple
# (input ids, attention masks, token type ids).
test_news_input_ids = np.array(input_ids, dtype=int)
test_news_attention_masks = np.array(attention_masks, dtype=int)
test_news_token_type_ids = np.array(token_type_ids, dtype=int)
test_news_inputs = (test_news_input_ids, test_news_attention_masks, test_news_token_type_ids)
test_data_labels = np.asarray(test_data_labels, dtype=np.int32)

In [None]:
# Evaluate the trained model on the encoded test set, using the shared
# BATCH_SIZE constant rather than a second hard-coded 16.
cls_model.evaluate(test_news_inputs, test_data_labels, batch_size=BATCH_SIZE)