In [None]:
# Global settings

# Google Drive file locations.
PREPROCESSED_PATH = '/content/drive/MyDrive/ml_project/imdb_preprocessed.csv'  # where the preprocessed data is stored
VECTORIZER_PATH = '/content/drive/MyDrive/ml_project/vectorizer_layer_model_1109.keras'  # where the tokenizer (vectorizer) is stored

# Input / model dimensions.
EMBEDDING_DIM = 128  # embedding dimension
MAX_LEN = 447        # maximum sentence length that inputs are padded to
VOCAB_SIZE = 44851   # size of the word vocabulary


from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers

# Load the saved vectorizer. It is stored as a Keras model and is called
# further down on the raw review strings to produce integer sequences.
vectorizer = keras.models.load_model(VECTORIZER_PATH)
print("토크나이저 로드 완료")

data = pd.read_csv(PREPROCESSED_PATH)
print("전처리된 데이터 로드 완료")

# Fail early with a readable message on rows that have no review text:
# pandas reads an empty CSV field as a float NaN, not a string, so such a
# row cannot be vectorized as text.
n_missing_reviews = int(data['review'].isna().sum())
if n_missing_reviews:
    raise ValueError(
        f"{n_missing_reviews} row(s) in {PREPROCESSED_PATH} have no 'review' text"
    )

# Encode the sentiment labels as 1 (positive) / 0 (negative). Series.map()
# turns any value that is not a key of label_map into NaN, which would
# otherwise only surface as an opaque integer-cast error in astype().
label_map = {'positive': 1, 'negative': 0}
encoded_labels = data['sentiment'].map(label_map)
unmapped_rows = encoded_labels.isna()
if unmapped_rows.any():
    unexpected = data.loc[unmapped_rows, 'sentiment'].unique().tolist()
    raise ValueError(f"Unexpected sentiment label(s): {unexpected}")
y = encoded_labels.astype('int32').values

# Text --> integer sequences, cast to int32.
X = vectorizer(tf.constant(data['review'].tolist()))
X = tf.cast(X, tf.int32)
# Shapes observed on the recorded run: X (50000, 447), y (50000,)
print('X shape:', X.shape, 'y shape:', y.shape)

vocab_size = VOCAB_SIZE
embedding_dim = EMBEDDING_DIM
seq_len = MAX_LEN

print("vocab_size:", vocab_size, "embedding_dim:", embedding_dim, "seq_len:", seq_len)

# The Embedding layer only has rows for ids in [0, vocab_size). VOCAB_SIZE is
# a hard-coded constant while the ids in X come from a vectorizer loaded from
# disk, so confirm the two agree before building the model.
max_token_id = int(tf.reduce_max(X))
if max_token_id >= vocab_size:
    raise ValueError(
        f"Largest token id in X is {max_token_id}, but the embedding only "
        f"covers ids below vocab_size={vocab_size}"
    )

# Baseline classifier: embedding -> average over the token axis -> dense head.
model = keras.Sequential([
    # mask_zero=True marks id 0 as padding so that padded positions are meant
    # to be left out of the average below.
    # NOTE(review): assumes the vectorizer pads with id 0 -- confirm.
    layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True),
    layers.GlobalAveragePooling1D(),        # collapse the token axis (length seq_len) into one vector per review
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')   # single sigmoid output for the binary 0/1 label
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Build with an explicit input shape so that summary() can be printed
# before any data has been passed through the model.
model.build(input_shape=(None, seq_len))
model.summary()


BATCH = 256


def _as_batches(features, labels, shuffle_buffer=None):
    """Wrap (features, labels) in a batched, prefetching tf.data pipeline.

    When shuffle_buffer is given, the examples are shuffled with that buffer
    size before they are batched; otherwise their order is kept.
    """
    pipeline = tf.data.Dataset.from_tensor_slices((features, labels))
    if shuffle_buffer is not None:
        pipeline = pipeline.shuffle(shuffle_buffer)
    pipeline = pipeline.batch(BATCH)
    return pipeline.prefetch(tf.data.AUTOTUNE)


# Stratified 80/20 hold-out split with a fixed random_state.
_split = train_test_split(
    X.numpy(), y, test_size=0.2, stratify=y, random_state=42
)
X_train, X_test, y_train, y_test = _split

# Only the training pipeline is shuffled; the buffer spans the whole training set.
train_ds = _as_batches(X_train, y_train, shuffle_buffer=len(X_train))
test_ds = _as_batches(X_test, y_test)

# Destination of the trained baseline model on Google Drive.
MODEL_PATH = '/content/drive/MyDrive/ml_project/bl_model_1110.keras'

# Train for five epochs on the training batches.
model.fit(train_ds, epochs=5)

# Evaluate on the held-out batches and print the returned metrics
# (the recorded run printed them as [loss, accuracy]).
test_metrics = model.evaluate(test_ds)
print(test_metrics)

# Persist the trained model.
model.save(MODEL_PATH)
print("모델 저장 완료:", MODEL_PATH)

Mounted at /content/drive
토크나이저 로드 완료
전처리된 데이터 로드 완료
X shape: (50000, 447) y shape: (50000,)
vocab_size: 44851 embedding_dim: 128 seq_len: 447


Epoch 1/5
157/157 ━━━━━━━━━━━━━━━━━━━━ 31s 187ms/step - accuracy: 0.7329 - loss: 0.5919
Epoch 2/5
157/157 ━━━━━━━━━━━━━━━━━━━━ 29s 186ms/step - accuracy: 0.9007 - loss: 0.2520
Epoch 3/5
157/157 ━━━━━━━━━━━━━━━━━━━━ 40s 179ms/step - accuracy: 0.9388 - loss: 0.1759
Epoch 4/5
157/157 ━━━━━━━━━━━━━━━━━━━━ 41s 179ms/step - accuracy: 0.9584 - loss: 0.1314
Epoch 5/5
157/157 ━━━━━━━━━━━━━━━━━━━━ 28s 177ms/step - accuracy: 0.9737 - loss: 0.0906
40/40 ━━━━━━━━━━━━━━━━━━━━ 3s 60ms/step - accuracy: 0.9018 - loss: 0.2781
[0.27710360288619995, 0.9027000069618225]
모델 저장 완료: /content/drive/MyDrive/ml_project/bl_model_1110.keras
