# 자연어 처리 - 영화감상평 분석

### 예시 데이터

In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical

# 1. 데이터 로드 (예: 영화 감상평 데이터)
# 감상평 데이터가 있다면 아래와 같이 로드합니다. 데이터는 텍스트(감상평)와 레이블(긍정/부정)이 있어야 합니다.
# 예시 데이터프레임 구조: df['review'] (감상평 텍스트), df['sentiment'] (긍정/부정 레이블)

# 데이터 예시
data = {
    'review': [
        "This movie was fantastic!",
        "I hated every minute of it.",
        "Best movie I have seen in a long time.",
        "The plot was dull and uninteresting.",
        "I really enjoyed this film.",
        "It was a total waste of time."
    ],
    'sentiment': ['positive', 'negative', 'positive', 'negative', 'positive', 'negative']
}

df = pd.DataFrame(data)


In [10]:
# 2. 텍스트 전처리
# 레이블을 숫자로 인코딩
label_encoder = LabelEncoder()
df['sentiment'] = label_encoder.fit_transform(df['sentiment'])

# 데이터 나누기 (훈련 및 테스트)
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.2, random_state=42)

In [11]:
# 3. 토크나이저로 텍스트를 숫자로 변환
tokenizer = Tokenizer(num_words=5000)  # 최대 5000개의 단어만 사용
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# 패딩을 사용해 시퀀스 길이를 맞춤
max_sequence_length = 100  # 최대 시퀀스 길이
X_train_pad = pad_sequences(X_train_seq, maxlen=max_sequence_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_sequence_length)

In [12]:
# 4. RNN 모델 구축
model = Sequential()

# 임베딩 레이어 (단어를 임베딩 벡터로 변환)
model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_sequence_length))

# LSTM 레이어 추가 (순환 신경망-> RNN)
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))

# 완전 연결층 (출력)
model.add(Dense(1, activation='sigmoid'))



In [13]:
# 5. 모델 컴파일
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [14]:
# 6. 모델 학습
model.fit(X_train_pad, y_train, epochs=5, batch_size=64, validation_data=(X_test_pad, y_test))

Epoch 1/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.2500 - loss: 0.6952 - val_accuracy: 1.0000 - val_loss: 0.6881
Epoch 2/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step - accuracy: 1.0000 - loss: 0.6882 - val_accuracy: 1.0000 - val_loss: 0.6884
Epoch 3/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step - accuracy: 1.0000 - loss: 0.6852 - val_accuracy: 0.5000 - val_loss: 0.6889
Epoch 4/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step - accuracy: 1.0000 - loss: 0.6796 - val_accuracy: 0.5000 - val_loss: 0.6893
Epoch 5/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step - accuracy: 1.0000 - loss: 0.6699 - val_accuracy: 0.5000 - val_loss: 0.6895


<keras.src.callbacks.history.History at 0x203e570a350>

In [15]:
# 7. 모델 평가
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f'Test Accuracy: {accuracy:.4f}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.5000 - loss: 0.6895
Test Accuracy: 0.5000


### IMDB 영화 리뷰 데이터셋

In [16]:
import numpy as np
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# 1. IMDB 데이터셋 로드
# num_words=10000 : 가장 빈도 높은 10,000개의 단어만 사용
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10000)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step


In [17]:
# 2. 데이터 전처리 (패딩)
max_sequence_length = 300  # 리뷰의 최대 길이 설정
X_train_pad = pad_sequences(X_train, maxlen=max_sequence_length)
X_test_pad = pad_sequences(X_test, maxlen=max_sequence_length)

In [18]:
# 3. RNN 모델 구성
model = Sequential()

# 임베딩 레이어: 단어 인덱스를 고정된 크기의 밀집 벡터로 변환
model.add(Embedding(input_dim=10000, output_dim=128, input_length=max_sequence_length))

# LSTM 레이어 추가
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))

# 출력층 (이진 분류를 위한 sigmoid 활성화 함수)
model.add(Dense(1, activation='sigmoid'))

In [19]:
# 4. 모델 컴파일
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [20]:
# 5. 모델 학습
model.fit(X_train_pad, y_train, epochs=5, batch_size=64, validation_data=(X_test_pad, y_test))

Epoch 1/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m266s[0m 677ms/step - accuracy: 0.7000 - loss: 0.5616 - val_accuracy: 0.8143 - val_loss: 0.4119
Epoch 2/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m233s[0m 596ms/step - accuracy: 0.8575 - loss: 0.3450 - val_accuracy: 0.8512 - val_loss: 0.3566
Epoch 3/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m249s[0m 638ms/step - accuracy: 0.8653 - loss: 0.3238 - val_accuracy: 0.7942 - val_loss: 0.4471
Epoch 4/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m228s[0m 583ms/step - accuracy: 0.8747 - loss: 0.3050 - val_accuracy: 0.8510 - val_loss: 0.3585
Epoch 5/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m232s[0m 593ms/step - accuracy: 0.9170 - loss: 0.2191 - val_accuracy: 0.8472 - val_loss: 0.3644


<keras.src.callbacks.history.History at 0x2038d977050>

In [21]:
# 6. 모델 평가
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f'Test Accuracy: {accuracy:.4f}')

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 27ms/step - accuracy: 0.8455 - loss: 0.3669
Test Accuracy: 0.8472


# 네이버 영화 감상평 분석

In [1]:
import pandas as pd
from konlpy.tag import Mecab
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [2]:
# 1. 데이터 로드
def load_data():
    train_data = pd.read_csv("./nsmc-master/ratings_test.txt", sep='\t').dropna()
    test_data = pd.read_csv("./nsmc-master/ratings_test.txt", sep='\t').dropna()
    return train_data, test_data

In [3]:
# 2. 데이터 전처리
def preprocess_text(text, tokenizer):
    dicpath = "C:/mecab/mecab-ko-dic"  # Adjust this path as necessary
    mecab = Mecab(dicpath) # 형태소 분석기
    tokens = mecab.morphs(text)
    return ' '.join(tokens)

def preprocess_data(train_data, test_data):
    dicpath = "C:/mecab/mecab-ko-dic"  # Adjust this path as necessary
    mecab = Mecab(dicpath)
    train_data['document'] = train_data['document'].apply(lambda x: ' '.join(mecab.morphs(x)))
    test_data['document'] = test_data['document'].apply(lambda x: ' '.join(mecab.morphs(x)))
    return train_data, test_data

In [4]:
# 3. 시퀀스 변환 및 패딩
def prepare_sequences(train_data, test_data, max_words=20000, max_len=100):
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(train_data['document'])

    X_train_seq = tokenizer.texts_to_sequences(train_data['document'])
    X_test_seq = tokenizer.texts_to_sequences(test_data['document'])

    X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
    X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

    return X_train_pad, train_data['label'], X_test_pad, test_data['label'], tokenizer

In [5]:
# 4. 모델 정의
def build_model(input_dim, output_dim=128, input_length=100):
    model = Sequential([
        Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length),
        LSTM(128, return_sequences=False),
        Dropout(0.5),
        Dense(1, activation='sigmoid')  # 긍정/부정 분류
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [6]:
# 5. 감성 예측 함수
def predict_sentiment(model, tokenizer, text, max_len=100):
    dicpath = "C:/mecab/mecab-ko-dic"  # Adjust this path as necessary
    mecab = Mecab(dicpath)
    processed_text = ' '.join(mecab.morphs(text))
    sequence = tokenizer.texts_to_sequences([processed_text])
    padded_sequence = pad_sequences(sequence, maxlen=max_len)
    prediction = model.predict(padded_sequence)
    return "긍정" if prediction > 0.5 else "부정"

In [7]:
# 데이터 로드
train_data, test_data = load_data()

# 데이터 전처리
train_data, test_data = preprocess_data(train_data, test_data)

# 시퀀스 준비
max_words = 20000
max_len = 100
X_train, y_train, X_test, y_test, tokenizer = prepare_sequences(train_data, test_data, max_words, max_len)

# 모델 생성
model = build_model(input_dim=max_words, input_length=max_len)
model.summary()

# 모델 학습
model.fit(X_train, y_train, validation_split=0.2, epochs=5, batch_size=64)

# 모델 평가
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")



Epoch 1/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 49ms/step - accuracy: 0.7261 - loss: 0.5244 - val_accuracy: 0.8366 - val_loss: 0.3693
Epoch 2/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 49ms/step - accuracy: 0.8809 - loss: 0.2875 - val_accuracy: 0.8373 - val_loss: 0.3746
Epoch 3/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 49ms/step - accuracy: 0.9137 - loss: 0.2264 - val_accuracy: 0.8342 - val_loss: 0.4002
Epoch 4/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 49ms/step - accuracy: 0.9326 - loss: 0.1770 - val_accuracy: 0.8314 - val_loss: 0.4487
Epoch 5/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 49ms/step - accuracy: 0.9494 - loss: 0.1377 - val_accuracy: 0.8272 - val_loss: 0.4841
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 10ms/step - accuracy: 0.9612 - loss: 0.1171
Test Accuracy: 0.9353761076927185


In [8]:
# 새로운 댓글 예측
while True:
    new_comment = input("분석할 댓글을 입력하세요 (종료: 'exit'): ")
    if new_comment.lower() == "exit":
        break
    result = predict_sentiment(model, tokenizer, new_comment, max_len)
    print(f"예측 결과: {result}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step
예측 결과: 긍정
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
예측 결과: 부정
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
예측 결과: 부정
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
예측 결과: 부정
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
예측 결과: 긍정
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
예측 결과: 부정
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
예측 결과: 부정
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
예측 결과: 긍정
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
예측 결과: 부정
