# Text Classification

In [1]:
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [2]:
# 1. 데이터 로드 및 전처리
max_features = 10000  # 사용할 단어의 수 (빈도순)
maxlen = 200  # 패딩 후 시퀀스의 최대 길이

# IMDb 데이터셋 로드 (최대 max_features 단어만 사용)
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_features)

# 시퀀스 패딩 (각 리뷰의 길이를 maxlen으로 맞춤)
X_train = pad_sequences(X_train, maxlen=maxlen)
X_test = pad_sequences(X_test, maxlen=maxlen)

In [3]:
# 2. 모델 생성
model = Sequential()
model.add(Embedding(input_dim=max_features, output_dim=128, input_length=maxlen))
model.add(LSTM(64, return_sequences=False))
model.add(Dense(1, activation='sigmoid'))



In [4]:
# 3. 모델 컴파일
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 4. 모델 학습
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 33ms/step - accuracy: 0.7225 - loss: 0.5383 - val_accuracy: 0.7906 - val_loss: 0.4683
Epoch 2/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 34ms/step - accuracy: 0.8537 - loss: 0.3473 - val_accuracy: 0.8757 - val_loss: 0.2945
Epoch 3/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 37ms/step - accuracy: 0.9328 - loss: 0.1827 - val_accuracy: 0.8763 - val_loss: 0.3053
Epoch 4/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 42ms/step - accuracy: 0.9632 - loss: 0.1077 - val_accuracy: 0.8683 - val_loss: 0.3715
Epoch 5/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 41ms/step - accuracy: 0.9811 - loss: 0.0623 - val_accuracy: 0.8659 - val_loss: 0.4256


<keras.src.callbacks.history.History at 0x131b55ec200>

In [5]:
# 5. 모델 평가
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test accuracy: {test_acc}")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 10ms/step - accuracy: 0.8638 - loss: 0.4339
Test accuracy: 0.865880012512207


In [6]:
# 6. 모델 예측
# 예를 들어, 첫 번째 테스트 리뷰의 감정을 예측합니다.
predictions = model.predict(X_test[:1])
print(f"Prediction: {'Positive' if predictions[0] > 0.5 else 'Negative'}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step
Prediction: Negative


In [7]:
# 새로운 리뷰 예측 함수 정의
word_index = imdb.get_word_index()

def preprocess_review(review):
    # 리뷰를 정수 인덱스의 시퀀스로 변환
    tokens = [word_index.get(word, 0) for word in review.lower().split()]
    tokens_padded = pad_sequences([tokens], maxlen=maxlen)
    return tokens_padded

def predict_sentiment(review):
    processed_review = preprocess_review(review)
    prediction = model.predict(processed_review)
    sentiment = 'Positive' if prediction[0] > 0.5 else 'Negative'
    return sentiment, prediction[0]

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1us/step


In [9]:
# 새로운 리뷰 예측
new_review = "This movie was fantastic! The acting was incredible, and the story was thrilling."
sentiment, confidence = predict_sentiment(new_review)
print(f"Review: {new_review}")
print(f"Predicted Sentiment: {sentiment} (Confidence: {confidence:.4f})")

new_review2 = "I did not like this movie at all. The plot was boring and the characters were dull."
sentiment, confidence = predict_sentiment(new_review2)
print(f"Review: {new_review2}")
print(f"Predicted Sentiment: {sentiment} (Confidence: {confidence:.4f})")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Review: This movie was fantastic! The acting was incredible, and the story was thrilling.


TypeError: unsupported format string passed to numpy.ndarray.__format__