In [1]:
# Pin pydantic to 1.10.12. Per the install log captured below, this replaces the
# preinstalled pydantic 2.10.6 and pip reports a dependency conflict (langchain).
# NOTE(review): the reason for the pin is not stated in this notebook — confirm it is still needed.
!pip install --upgrade pydantic==1.10.12

Collecting pydantic==1.10.12
  Downloading pydantic-1.10.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (149 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/149.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.3/149.3 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Downloading pydantic-1.10.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m62.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydantic
  Attempting uninstall: pydantic
    Found existing installation: pydantic 2.10.6
    Uninstalling pydantic-2.10.6:
      Successfully uninstalled pydantic-2.10.6
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain 0.3.18 requir

In [2]:
# Working inside a Kaggle notebook environment -> bring the packages needed for the
# TensorFlow-based ELMo module up to date.
!pip install --upgrade tensorflow_hub kagglehub

Collecting kagglehub
  Downloading kagglehub-0.3.9-py3-none-any.whl.metadata (30 kB)
Downloading kagglehub-0.3.9-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: kagglehub
  Attempting uninstall: kagglehub
    Found existing installation: kagglehub 0.3.7
    Uninstalling kagglehub-0.3.7:
      Successfully uninstalled kagglehub-0.3.7
Successfully installed kagglehub-0.3.9


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import tensorflow_hub as hub
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Load the ELMo module from TensorFlow Hub. Only the embeddings come from TensorFlow;
# the resulting vectors are fed into a PyTorch model further down.
elmo = hub.load("https://tfhub.dev/google/elmo/3")


# Helper that produces ELMo embeddings
def elmo_embedding(text):
    """Run ``text`` through the module-level ELMo model and return the "default" output.

    Args:
        text: Value accepted by ``tf.convert_to_tensor``; callers in this
            notebook pass a list containing one review string.

    Returns:
        The ``"default"`` entry of the signature's output dictionary, converted
        to a NumPy array via ``.numpy()``.
    """
    # The "default" signature returns a dictionary; per the original author's note
    # it holds word_emb, lstm_output1, lstm_output2 and default.
    default_signature = elmo.signatures["default"]
    # Convert the input to a tensor before handing it to ELMo.
    text_tensor = tf.convert_to_tensor(text)
    outputs = default_signature(text_tensor)
    # The "default" entry is the final combined ELMo embedding.
    combined = outputs["default"]
    return combined.numpy()

# Dataset wrapper for the IMDB reviews
class IMDBDataset(Dataset):
    """Dataset pairing each review's ELMo embedding with its sentiment label.

    Every review is embedded eagerly in the constructor, one review per
    ``elmo_embedding`` call, and the vectors are kept in an in-memory list.
    """

    def __init__(self, dataframe):
        """Collect texts and labels from ``dataframe`` and embed all reviews.

        Args:
            dataframe: Object indexable by ``'review'`` and ``'sentiment'``
                whose columns provide ``.tolist()`` (a pandas DataFrame in
                this notebook).
        """
        # Raw review texts and their sentiment labels as plain Python lists.
        self.reviews = dataframe['review'].tolist()
        self.labels = dataframe['sentiment'].tolist()

        # Embed each review on its own (a batch of one) and keep the single
        # row that comes back.
        vectors = []
        for review_text in self.reviews:
            batch_of_one = elmo_embedding([review_text])
            vectors.append(batch_of_one[0])
        self.embeddings = vectors

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(embedding, label)`` as a float32 tensor and a long tensor."""
        features = torch.tensor(self.embeddings[idx], dtype=torch.float32)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return features, target

# Sentiment classification model (a simple feed-forward network)
class SentimentClassifier(nn.Module):
    """Feed-forward classifier over fixed-size sentence embeddings.

    Architecture: two ``Linear -> ReLU -> BatchNorm1d -> Dropout`` stages
    followed by a final ``Linear`` layer that emits one logit per class.

    The defaults reproduce the original hard-coded network (1024 -> 512 -> 256 -> 2
    with dropout 0.5), so ``SentimentClassifier()`` and previously saved
    ``state_dict`` files keep working; layer attribute names are unchanged.

    Args:
        input_dim: Size of each input embedding vector.
        hidden_dim1: Width of the first hidden layer.
        hidden_dim2: Width of the second hidden layer.
        num_classes: Number of output logits.
        dropout: Drop probability used by both dropout layers.
    """

    def __init__(self, input_dim=1024, hidden_dim1=512, hidden_dim2=256,
                 num_classes=2, dropout=0.5):
        super().__init__()
        # Layers are created in the same order as before so that seeded
        # parameter initialisation is unchanged for the default configuration.
        self.fc1 = nn.Linear(input_dim, hidden_dim1)      # first linear layer
        self.bn1 = nn.BatchNorm1d(hidden_dim1)            # batch normalisation
        self.drop1 = nn.Dropout(dropout)                  # dropout
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)    # second linear layer
        self.bn2 = nn.BatchNorm1d(hidden_dim2)            # batch normalisation
        self.drop2 = nn.Dropout(dropout)                  # dropout
        self.fc3 = nn.Linear(hidden_dim2, num_classes)    # final projection to class logits

    def forward(self, x):
        """Return raw class logits for a batch ``x`` whose last dimension is ``input_dim``.

        No softmax is applied; the notebook pairs this model with
        ``nn.CrossEntropyLoss``, which takes the logits directly.
        """
        x = torch.relu(self.fc1(x))
        x = self.bn1(x)
        x = self.drop1(x)
        x = torch.relu(self.fc2(x))
        x = self.bn2(x)
        x = self.drop2(x)
        return self.fc3(x)

# Metrics used in the evaluation section. Imported before any work starts so a
# missing dependency fails immediately rather than after embedding and training.
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Seed PyTorch's global RNG so weight initialisation, dropout masks and the
# DataLoader shuffle order repeat across "Restart & Run All" runs. Without this
# the reported accuracy drifts between runs. GPU kernels may still introduce
# small run-to-run differences.
SEED = 42
torch.manual_seed(SEED)

# Load and preprocess the IMDb data.
# NOTE(review): the original comment mentions a Kaggle environment, but
# '/content/...' looks like a Colab-style path — confirm the location.
imdb_data = pd.read_csv('/content/IMDB Dataset.csv')
imdb_data['sentiment'] = imdb_data['sentiment'].map({'positive': 1, 'negative': 0})  # positive/negative -> 1/0

# Train/test split (only 2000 randomly sampled reviews are used).
train_df, test_df = train_test_split(imdb_data.sample(n=2000, random_state=SEED), test_size=0.2, random_state=SEED)

# Build the PyTorch datasets; constructing them embeds every review with ELMo.
train_dataset = IMDBDataset(train_df)
test_dataset = IMDBDataset(test_df)

# Batching
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use CUDA when available
model = SentimentClassifier().to(device)  # move the model to the selected device
criterion = nn.CrossEntropyLoss()  # cross-entropy loss for classification
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer

# Train the model
epochs = 5
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for inputs, labels in train_loader:
        # Move the batch to the selected device. The original note mentions a
        # 2x T4 setup; this code only targets the default CUDA device.
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()  # reset gradients
        outputs = model(inputs)
        loss = criterion(outputs, labels)  # compute the loss
        loss.backward()  # backpropagate
        optimizer.step()  # update the weights
        total_loss += loss.item()  # accumulate the running loss
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}")  # mean loss per batch for this epoch

# Save the trained weights
torch.save(model.state_dict(), "elmo_imdb_model1.pth")

# Evaluate the model
model.eval()
y_true, y_pred = [], []
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        predictions = torch.argmax(outputs, dim=1)  # predicted class = index of the largest logit
        y_true.extend(labels.cpu().numpy())  # collect the true labels
        y_pred.extend(predictions.cpu().numpy())  # collect the predictions

# Evaluation metrics
print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))
print("Classification Report:")
print(classification_report(y_true, y_pred))
# Overall accuracy (earlier unseeded runs reported roughly 0.80-0.82).
print("Accuracy:", accuracy_score(y_true, y_pred))


Epoch 1/5, Loss: 0.5761
Epoch 2/5, Loss: 0.4518
Epoch 3/5, Loss: 0.3868
Epoch 4/5, Loss: 0.3516
Epoch 5/5, Loss: 0.3549
Confusion Matrix:
[[172  37]
 [ 43 148]]
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.82      0.81       209
           1       0.80      0.77      0.79       191

    accuracy                           0.80       400
   macro avg       0.80      0.80      0.80       400
weighted avg       0.80      0.80      0.80       400

Accuracy: 0.8
