In [23]:
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import imdb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

In [24]:
# IMDB 데이터셋 로드
(X_train_raw, y_train_raw), (X_test_raw, y_test_raw) = imdb.load_data()

print(f'훈련용 리뷰 개수: {len(X_train_raw)}')
print(f'테스트용 리뷰 개수: {len(X_test_raw)}')

num_classes = len(set(y_train_raw))
print(f'카테고리(긍정/부정): {num_classes}')

훈련용 리뷰 개수: 25000
테스트용 리뷰 개수: 25000
카테고리(긍정/부정): 2


In [27]:
# IMDB 데이터셋의 워드 인덱스 확인
word_index = imdb.get_word_index()
print(f"단어 개수: {len(word_index)}")

단어 개수: 88584


In [31]:
# 단어 인덱스 맵에 3을 더해 실제 단어 ID를 매핑
index_to_word = {i + 3: word for word, i in word_index.items()}
index_to_word[0], index_to_word[1], index_to_word[2] = "<PAD>", "<START>", "<UNK>"

# 인덱스 데이터를 텍스트로 변환하는 함수
def decode_review(review):
    return ' '.join([index_to_word.get(i, "<UNK>") for i in review])

In [33]:
# 샘플 데이터 출력
print("샘플 리뷰(인덱스):", X_train_raw[0])
print("샘플 리뷰(텍스트):", decode_review(X_train_raw[0]))
print("샘플 레이블:", y_train_raw[0])

샘플 리뷰(인덱스): [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
샘플 리뷰(텍스트): <START> this

In [35]:
# 인덱스를 텍스트로 변환한 후 TF-IDF 적용
X_train_text = [decode_review(review) for review in X_train_raw]
X_test_text = [decode_review(review) for review in X_test_raw]

In [43]:
# TF-IDF 벡터화
vectorizer = TfidfVectorizer(
    max_features=5000,  # 상위 5000개의 단어만 사용
    stop_words='english',  # 영어 불용어 제거
    lowercase=True  # 소문자변환
)

X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_test_tfidf = vectorizer.transform(X_test_text)

In [48]:
# XGBoost 모델 초기화
xgb_model = xgb.XGBClassifier(
    eval_metric='logloss',  # 로그손실 사용
    max_depth=6,
    n_estimators=100,
    learning_rate=0.1
)

In [49]:
# 모델 학습
xgb_model.fit(X_train_tfidf, y_train_raw)

In [52]:
# 예측 및 평가
y_pred = xgb_model.predict(X_test_tfidf)
print(f"정확도(Accuracy): {accuracy_score(y_test_raw, y_pred)}")
print("\n분류 보고서(Classification Report):")
print(classification_report(y_test_raw, y_pred))

정확도(Accuracy): 0.82716

분류 보고서(Classification Report):
              precision    recall  f1-score   support

           0       0.86      0.79      0.82     12500
           1       0.80      0.87      0.83     12500

    accuracy                           0.83     25000
   macro avg       0.83      0.83      0.83     25000
weighted avg       0.83      0.83      0.83     25000



In [54]:
# 샘플 리뷰 테스트
sample_reviews = [
    "The movie was amazing, I loved it so much!",
    "This was the worst film I have ever seen."
]
sample_tfidf = vectorizer.transform(sample_reviews)
sample_predictions = xgb_model.predict(sample_tfidf)

In [56]:
# 샘플 리뷰 결과 출력
for review, prediction in zip(sample_reviews, sample_predictions):
    sentiment = "Positive" if prediction == 1 else "Negative"
    print(f"Review: {review}")
    print(f"Predicted Sentiment: {sentiment}")
    print("-" * 50)

Review: The movie was amazing, I loved it so much!
Predicted Sentiment: Positive
--------------------------------------------------
Review: This was the worst film I have ever seen.
Predicted Sentiment: Negative
--------------------------------------------------
