In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import nltk
import re
import matplotlib.pyplot as plt
import dgl

from allennlp.modules.elmo import Elmo, batch_to_ids
from eunjeon import Mecab
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from dgl.data.utils import load_graphs
from sklearn.metrics import accuracy_score, recall_score, f1_score


  warn(f"Failed to load image Python extension: {e}")


ImportError: DLL load failed while importing _errors: 메모리 리소스가 부족하여 이 명령을 처리할 수 없습니다.

In [None]:
# Download the NLTK resources 'punkt', 'averaged_perceptron_tagger' and 'wordnet'.
for nltk_resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(nltk_resource)

In [None]:
# Create the Mecab analyzer instance used by the tokenization helpers in this notebook.
mecab = Mecab()

In [None]:
# Initialize the ELMo model from the local option / weight files.
# NOTE(review): the file names match the English "5.5B" ELMo release — confirm that this
# checkpoint is the intended one for Korean morphemes.
options_file = "../analysis_files/elmo/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json"
weight_file = "../analysis_files/elmo/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5"
# A freshly constructed module is in training mode, in which Elmo applies dropout to its
# output. The model is only used as a fixed feature extractor here, so switch to eval mode
# (eval() returns the module itself) to make the extracted embeddings deterministic.
elmo = Elmo(options_file, weight_file, num_output_representations=1).eval()

In [None]:
def tokenize_korean_sentence(sentence):
    """Split a Korean sentence into morphemes.

    Delegates to the notebook-level ``mecab`` analyzer and returns the result of
    ``mecab.morphs(sentence)`` unchanged.
    """
    return mecab.morphs(sentence)

def preprocessing(sentences):
    """Clean one raw text string so that only Hangul syllables and whitespace remain.

    The cleaning steps run in this order: drop parenthesised spans, drop periods,
    strip every character that is neither a Hangul syllable nor whitespace, and
    finally remove stand-alone ``cm`` / ``km`` / ``etc`` tokens.
    """
    without_parens = re.sub(r'\([^)]*\)', '', sentences)
    hangul_only = re.sub(r'[^가-힣\s]', '', without_parens.replace('.', ''))
    return re.sub(r'\b(?:cm|km|etc)\b', '', hangul_only)

# Demo: embed a few sample sentences with ELMo and print one vector per morpheme.
sentences = [
    '한국어 문장을 토큰화합니다.',
    'ELMo를 적용하기 위해 문장을 변환해야 합니다.',
    '제대로 된거 맞는거겟지?'
]

# Morphemes that were actually fed to ELMo, one list per sentence. They are kept so that
# every printed vector is paired with the token it was computed from (re-tokenizing the
# raw sentence later yields a different token sequence, because preprocessing removes
# punctuation and non-Hangul text).
tokenized_sentences = []

# Per-sentence token embeddings.
embeddings = []
with torch.no_grad():  # inference only: no autograd graph is needed
    for sentence in sentences:
        tokens = tokenize_korean_sentence(preprocessing(sentence))
        character_ids = batch_to_ids([tokens])
        embeddings_output = elmo(character_ids)
        # First output representation, first item of the one-sentence batch.
        sentence_embedding = embeddings_output["elmo_representations"][0][0]
        tokenized_sentences.append(tokens)
        embeddings.append(sentence_embedding)

# Print each sentence with its embedding shape and per-token vectors.
for i, (sentence, tokens, embedding) in enumerate(zip(sentences, tokenized_sentences, embeddings)):
    print(f"Sentence {i+1}:")
    print("Text:", sentence)
    print("Embedding shape:", embedding.shape)
    print("Embedded words:")
    for word, emb in zip(tokens, embedding):
        print(f"{word}: {emb}")
    print()


In [None]:
# Directory holding the dataset files, and the torch device (CUDA when available, else CPU).
data_dir = '../analysis_files/files/'
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
# Read the cp949-encoded training CSV with pandas and keep only the three human/system
# utterance pairs plus the emotion and situation label columns.
csv_path = os.path.join(data_dir, '감성대화말뭉치(최종데이터)_Training.csv')
selected_columns = ['사람문장1', '시스템문장1', '사람문장2', '시스템문장2', '사람문장3', '시스템문장3', '감정_대분류', '상황키워드']
df = pd.read_csv(csv_path, encoding='cp949')[selected_columns]

In [None]:
def tokenize_korean_sentence(sentence):
    """Return the morphemes of ``sentence`` as produced by ``mecab.morphs``.

    This re-defines the helper of the same name from an earlier cell with the same logic.
    """
    return mecab.morphs(sentence)

In [None]:
def preprocessing(sentences):
    """Clean raw text so that only Hangul syllables and whitespace remain.

    Cleaning steps, in order:
      1. remove parenthesised spans,
      2. remove periods,
      3. remove every character that is neither a Hangul syllable nor whitespace,
      4. remove stand-alone ``cm`` / ``km`` / ``etc`` tokens (after step 3 no Latin
         letters are left, so this step cannot match; it is kept for parity).

    Args:
        sentences: a single string, or a list/tuple of strings. Later cells in this
            notebook pass a list of sentences, which ``re.sub`` alone would reject.

    Returns:
        The cleaned string for a string input, or a list of cleaned strings (same
        order) for a list/tuple input.
    """
    if isinstance(sentences, (list, tuple)):
        return [preprocessing(item) for item in sentences]

    sentences = re.sub(r'\([^)]*\)', '', sentences)
    sentences = sentences.replace('.', '')
    sentences = re.sub(r'[^가-힣\s]', '', sentences)
    sentences = re.sub(r'\b(?:cm|km|etc)\b', '', sentences)
    return sentences

In [None]:
# Build one NetworkX graph per dialogue row. Every utterance becomes a node carrying a
# 1024-dimensional sentence embedding; each human utterance is linked to its system reply.
sentence_columns = ['사람문장1', '시스템문장1', '사람문장2', '시스템문장2', '사람문장3', '시스템문장3']
reply_pairs = [('사람문장1', '시스템문장1'), ('사람문장2', '시스템문장2'), ('사람문장3', '시스템문장3')]

graphs = []
elmo.eval()  # disable dropout so the extracted embeddings are deterministic
with torch.no_grad():  # feature extraction only: do not keep autograd graphs per sentence
    for _, row in df.iterrows():
        G = nx.Graph()
        # Node key actually used for each column, so edges attach to the featured nodes.
        node_keys = {}
        for column in sentence_columns:
            sentence = row[column]
            # Zero vector for missing sentences and for sentences that clean to nothing.
            sentence_embedding = torch.zeros(1024)
            if pd.isna(sentence):
                # NaN is not a usable node key (NaN != NaN); use a per-column placeholder.
                node_key = f'[missing:{column}]'
            else:
                node_key = sentence
                # Clean, then split into morphemes.
                tokens = tokenize_korean_sentence(preprocessing(sentence))
                if tokens:
                    character_ids = batch_to_ids([tokens])
                    embeddings_output = elmo(character_ids)
                    # Take the single batch item, then average over its tokens to get
                    # one vector per sentence.
                    sentence_embedding = embeddings_output["elmo_representations"][0][0].mean(dim=0)
            G.add_node(node_key, feature=sentence_embedding)
            node_keys[column] = node_key

        for human_column, system_column in reply_pairs:
            G.add_edge(node_keys[human_column], node_keys[system_column])

        graphs.append(G)

In [None]:
# Draw the first four graphs; every node that has a 'feature' attribute additionally gets
# a boxed text label showing its node key.
label_box = dict(facecolor='white', edgecolor='black', boxstyle='round,pad=0.2')
for graph_idx in range(4):
    graph = graphs[graph_idx]
    pos = nx.spring_layout(graph)  # node positions
    nx.draw(graph, pos, with_labels=True, node_color='lightblue', edge_color='gray', font_size=8)
    for node in nx.get_node_attributes(graph, 'feature'):
        x, y = pos[node]
        plt.text(x, y, s=node, bbox=label_box, fontsize=8)
    plt.show()

In [None]:
def extract_sentence_features(sentence):
    """Compute simple surface features for a single sentence.

    Args:
        sentence: the sentence text (a string).

    Returns:
        A 5-tuple ``(sentence_length, num_pos_tags, top_words, top_scores,
        top_word_vectors)``:
          * sentence_length: ``len(sentence)``.
          * num_pos_tags: number of (morpheme, tag) pairs returned by ``mecab.pos``.
          * top_words: list of exactly 3 strings, the highest-scoring nouns,
            padded with ``''`` when the sentence has fewer than 3 distinct nouns.
          * top_scores: list of exactly 3 floats aligned with ``top_words``,
            padded with ``0.0``.
          * top_word_vectors: ``np.ndarray`` of shape ``(3,)`` holding the same
            values as ``top_scores``.
    """
    # Sentence length in characters.
    sentence_length = len(sentence)

    # Number of part-of-speech tagged morphemes.
    num_pos_tags = len(mecab.pos(sentence))

    # Nouns found in the sentence.
    nouns = mecab.nouns(sentence)

    # Fixed-size defaults so every sentence yields the same feature layout.
    top_words = ['', '', '']
    top_scores = [0.0, 0.0, 0.0]

    if nouns:
        tfidf_vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
        # Score the nouns as ONE document. Passing the noun list itself would treat
        # every noun as a separate document, and reading row 0 would then only
        # describe the first noun.
        tfidf_scores = tfidf_vectorizer.fit_transform([' '.join(nouns)]).toarray()[0]
        feature_names = tfidf_vectorizer.get_feature_names_out()
        # Indices of up to 3 highest scores, best first (stable for ties).
        top_indices = np.argsort(-tfidf_scores, kind='stable')[:3]
        for rank, index in enumerate(top_indices):
            top_words[rank] = str(feature_names[index])
            top_scores[rank] = float(tfidf_scores[index])

    top_word_vectors = np.array(top_scores)

    return sentence_length, num_pos_tags, top_words, top_scores, top_word_vectors

In [None]:
def create_edge_index(sentences):
    """Build the edges that chain consecutive sentences together.

    Returns a list of ``(i, i + 1)`` index pairs, one for each adjacent pair of
    positions in ``sentences``; the list is empty for fewer than two sentences.
    """
    return [(idx, idx + 1) for idx in range(len(sentences) - 1)]

In [None]:
data_train = []
data_test = []

# Map each emotion label to an integer class id (ids follow the listed order).
label_mapping_emotion = {
    label: class_id
    for class_id, label in enumerate(['기쁨', '당황', '분노', '불안', '상처', '슬픔'])
}

# Map each situation label to an integer class id (ids follow the listed order).
label_mapping_situation = {
    label: class_id
    for class_id, label in enumerate([
        '가족관계', '건강', '건강,죽음', '대인관계', '대인관계(부부, 자녀)', '연애,결혼,출산',
        '재정', '재정,은퇴,노후준비', '직장, 업무 스트레스', '진로,취업,직장',
        '학교폭력/따돌림', '학업 및 진로',
    ])
}

In [None]:
# Build one NetworkX graph per dialogue row. Every utterance becomes a node carrying a
# 1024-dimensional sentence embedding plus surface features; each human utterance is
# linked to its system reply, and the row's labels are stored on the graph itself.
sentence_columns = ['사람문장1', '시스템문장1', '사람문장2', '시스템문장2', '사람문장3', '시스템문장3']
reply_pairs = [('사람문장1', '시스템문장1'), ('사람문장2', '시스템문장2'), ('사람문장3', '시스템문장3')]

graphs = []
elmo.eval()  # disable dropout so the extracted embeddings are deterministic
with torch.no_grad():  # feature extraction only: do not keep autograd graphs per sentence
    for _, row in df.iterrows():
        G = nx.Graph()
        # Node key actually used for each column, so edges attach to the featured nodes.
        node_keys = {}
        for column in sentence_columns:
            sentence = row[column]

            # Defaults for a missing sentence; they mirror the "no nouns" defaults of
            # extract_sentence_features so every node has the same attribute layout
            # (and no value leaks over from the previous sentence).
            sentence_embedding = torch.zeros(1024)
            sentence_length, num_pos_tags = 0, 0
            top_words, top_scores, top_word_vectors = ['', '', ''], [0.0, 0.0, 0.0], np.zeros((3,))

            if pd.isna(sentence):
                # NaN is not a usable node key (NaN != NaN); use a per-column placeholder.
                node_key = f'[missing:{column}]'
            else:
                node_key = sentence
                cleaned_sentence = preprocessing(sentence)
                # Split the cleaned sentence into morphemes.
                tokens = tokenize_korean_sentence(cleaned_sentence)
                if tokens:
                    # Convert the morphemes into an ELMo embedding.
                    character_ids = batch_to_ids([tokens])
                    embeddings_output = elmo(character_ids)
                    # Take the single batch item, then average over its tokens to get
                    # one vector per sentence.
                    sentence_embedding = embeddings_output["elmo_representations"][0][0].mean(dim=0)
                # Surface features of the cleaned sentence.
                sentence_length, num_pos_tags, top_words, top_scores, top_word_vectors = extract_sentence_features(cleaned_sentence)

            G.add_node(node_key, feature=sentence_embedding, length=sentence_length, pos_tags=num_pos_tags,
                       top_words=top_words, top_scores=top_scores, top_word_vectors=top_word_vectors)
            node_keys[column] = node_key

        # Emotion and situation labels; kept as graph-level attributes so they travel
        # with the graph instead of being discarded.
        y_emotion = label_mapping_emotion[row['감정_대분류']]
        y_situation = label_mapping_situation[row['상황키워드']]
        G.graph['y_emotion'] = y_emotion
        G.graph['y_situation'] = y_situation

        for human_column, system_column in reply_pairs:
            G.add_edge(node_keys[human_column], node_keys[system_column])

        graphs.append(G)

In [None]:
# Convert every NetworkX graph into a DGL graph.
# NOTE(review): from_networkx is called without node_attrs, while later cells read
# batch.ndata['feature'] — confirm whether the node attributes need to be requested here.
dgl_graphs = [dgl.from_networkx(nx_graph) for nx_graph in graphs]

In [None]:
# Split the DGL graphs into train/test sets and wrap them in data loaders.
batch_size = 32
train_graphs, test_graphs = train_test_split(dgl_graphs, test_size=0.2, random_state=42)
# torch's default collate function cannot merge DGLGraph objects, so dgl.batch is used to
# join each list of sampled graphs into one batched graph; the training and evaluation
# loops read their inputs from that batched graph (batch.ndata, batch.edges()).
data_train = DataLoader(train_graphs, batch_size=batch_size, shuffle=True, collate_fn=dgl.batch)
# Evaluation metrics do not depend on sample order, so the test loader is not shuffled.
data_test = DataLoader(test_graphs, batch_size=batch_size, shuffle=False, collate_fn=dgl.batch)

In [None]:
class GCNModel(nn.Module):
    """Three stacked graph-convolution layers followed by two linear heads.

    ``output_dim`` is a mapping with the keys ``'emotion'`` and ``'situation'``
    giving the number of outputs of each head.

    NOTE(review): the layers are invoked as ``conv(x, edge_index)``; DGL documents
    GraphConv as taking the graph first and the features second — confirm the
    argument order and types against the callers.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.conv1 = dgl.nn.GraphConv(input_dim, hidden_dim)
        self.conv2 = dgl.nn.GraphConv(hidden_dim, hidden_dim)
        self.conv3 = dgl.nn.GraphConv(hidden_dim, hidden_dim)
        self.fc_emotion = nn.Linear(hidden_dim, output_dim['emotion'])
        self.fc_situation = nn.Linear(hidden_dim, output_dim['situation'])

    def forward(self, x, edge_index):
        """Apply conv+ReLU three times, average over dim 0, and return both head outputs."""
        hidden = x
        for conv in (self.conv1, self.conv2, self.conv3):
            hidden = F.relu(conv(hidden, edge_index))
        # Collapse the first dimension into a single summary vector.
        pooled = hidden.mean(dim=0)
        return self.fc_emotion(pooled), self.fc_situation(pooled)

In [None]:
# Instantiate the model, the loss function and the optimizer.
input_dim, hidden_dim = 1024, 128
output_dim = {
    'emotion': len(label_mapping_emotion),
    'situation': len(label_mapping_situation),
}
model = GCNModel(input_dim, hidden_dim, output_dim)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
def evaluate(model, dataloader):
    """Evaluate both task heads over ``dataloader``.

    For every batch the function reads ``ndata['feature']``, the first element of
    ``edges()``, and the ``ndata['y_emotion']`` / ``ndata['y_situation']`` labels,
    runs the model without gradients and collects the arg-max predictions.

    Returns:
        ``(accuracy_emotion, recall_emotion, f1_emotion,
        accuracy_situation, recall_situation, f1_situation)`` where recall and F1
        are macro-averaged.
    """
    model.eval()
    truths = {'emotion': [], 'situation': []}
    predictions = {'emotion': [], 'situation': []}

    with torch.no_grad():
        for batch in dataloader:
            features = batch.ndata['feature']
            edge_index = batch.edges()[0]
            labels_emotion = batch.ndata['y_emotion']
            labels_situation = batch.ndata['y_situation']

            outputs_emotion, outputs_situation = model(features, edge_index)
            predicted_emotion = torch.max(outputs_emotion, 1)[1]
            predicted_situation = torch.max(outputs_situation, 1)[1]

            truths['emotion'].extend(labels_emotion.tolist())
            predictions['emotion'].extend(predicted_emotion.tolist())
            truths['situation'].extend(labels_situation.tolist())
            predictions['situation'].extend(predicted_situation.tolist())

    def _scores(y_true, y_pred):
        # Accuracy, macro recall and macro F1 for one task.
        return (
            accuracy_score(y_true, y_pred),
            recall_score(y_true, y_pred, average='macro'),
            f1_score(y_true, y_pred, average='macro'),
        )

    return (_scores(truths['emotion'], predictions['emotion'])
            + _scores(truths['situation'], predictions['situation']))


In [None]:
# Train the model: every epoch optimizes the sum of the emotion and situation losses over
# data_train, then reports metrics on both data_train and data_test.
num_epochs = 10

for epoch in range(num_epochs):
    model.train()

    for batch in data_train:
        # Inputs and labels are read from the batch's node data.
        features = batch.ndata['feature']
        # First element of batch.edges() — presumably the source node ids; TODO confirm
        # this is what the model expects as its second argument.
        edge_index = batch.edges()[0]
        labels_emotion = batch.ndata['y_emotion']
        labels_situation = batch.ndata['y_situation']

        optimizer.zero_grad()
        outputs_emotion, outputs_situation = model(features, edge_index)
        loss_emotion = loss_function(outputs_emotion, labels_emotion)
        loss_situation = loss_function(outputs_situation, labels_situation)
        # Joint objective: unweighted sum of both task losses.
        loss = loss_emotion + loss_situation
        loss.backward()
        optimizer.step()

    # Metrics on the training data after this epoch
    accuracy_emotion_train, recall_emotion_train, f1_emotion_train, \
    accuracy_situation_train, recall_situation_train, f1_situation_train = evaluate(model, data_train)

    # Metrics on the test data
    accuracy_emotion_test, recall_emotion_test, f1_emotion_test, \
    accuracy_situation_test, recall_situation_test, f1_situation_test = evaluate(model, data_test)

    # Accuracy is printed as a percentage; recall and F1 are printed as raw scores.
    print(f"Epoch {epoch+1}/{num_epochs}")
    print("Train - Emotion Accuracy: {:.2f}%, Recall: {:.2f}, F1-score: {:.2f}".format(
        accuracy_emotion_train * 100, recall_emotion_train, f1_emotion_train))
    print("Train - Situation Accuracy: {:.2f}%, Recall: {:.2f}, F1-score: {:.2f}".format(
        accuracy_situation_train * 100, recall_situation_train, f1_situation_train))
    print("Test - Emotion Accuracy: {:.2f}%, Recall: {:.2f}, F1-score: {:.2f}".format(
        accuracy_emotion_test * 100, recall_emotion_test, f1_emotion_test))
    print("Test - Situation Accuracy: {:.2f}%, Recall: {:.2f}, F1-score: {:.2f}".format(
        accuracy_situation_test * 100, recall_situation_test, f1_situation_test))
    print()

In [None]:
def train(model, optimizer, data_loader, task):
    """Run one training epoch for a single task head and return the averaged loss.

    Args:
        model: module whose call ``model(x, edge_index)`` returns the pair
            ``(emotion_output, situation_output)``.
        optimizer: optimizer stepping the model's parameters.
        data_loader: iterable of batches exposing ``x``, ``edge_index``,
            ``y_emotion`` and ``y_situation``; must also provide ``.dataset``
            supporting ``len()``.
        task: ``'emotion'`` or ``'situation'`` — selects the labels and the head.

    Returns:
        Sum over batches of ``loss * x.size(0)``, divided by ``len(data_loader.dataset)``.

    Raises:
        ValueError: if ``task`` is not one of the two supported names (previously
            this surfaced later as an unbound local variable).
    """
    if task not in ('emotion', 'situation'):
        raise ValueError(f"task must be 'emotion' or 'situation', got {task!r}")
    label_attr = f'y_{task}'
    head_index = 0 if task == 'emotion' else 1

    model.train()  # Set the model to train mode
    total_loss = 0.0

    for data in data_loader:
        optimizer.zero_grad()  # Reset gradients from the previous step

        x, edge_index = data.x, data.edge_index
        labels = getattr(data, label_attr)

        # The model returns one output per task; keep the one being trained.
        out = model(x, edge_index)[head_index]

        loss = F.cross_entropy(out, labels)

        # Backpropagation and weight update
        loss.backward()
        optimizer.step()
        # NOTE(review): the loss is weighted by x.size(0) but normalized by the dataset
        # length below — confirm both count the same unit.
        total_loss += loss.item() * x.size(0)

    return total_loss / len(data_loader.dataset)

In [None]:
def evaluate(model, data_loader, task):
    """Evaluate one task head and return ``(accuracy, macro_f1)``.

    Args:
        model: module whose call ``model(x, edge_index)`` returns the pair
            ``(emotion_output, situation_output)``.
        data_loader: iterable of batches exposing ``x``, ``edge_index``,
            ``y_emotion`` and ``y_situation``.
        task: ``'emotion'`` or ``'situation'`` — selects the labels and the head.

    Returns:
        ``(accuracy, f1_result)`` computed once over all collected labels and
        predictions; ``(0.0, 0.0)`` when the loader yields no labels.

    Raises:
        ValueError: for an unknown ``task``, or when the number of predictions of
            a batch cannot be matched to its number of labels.
    """
    if task not in ('emotion', 'situation'):
        raise ValueError(f"task must be 'emotion' or 'situation', got {task!r}")
    label_attr = f'y_{task}'
    head_index = 0 if task == 'emotion' else 1

    model.eval()  # evaluation mode
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for data in data_loader:
            labels = getattr(data, label_attr).reshape(-1)

            out = model(data.x, data.edge_index)[head_index]

            # Arg-max over the class (last) axis. Taking the maximum over a
            # singleton axis would always yield index 0.
            pred = out.argmax(dim=-1).reshape(-1)
            if pred.numel() == 1 and labels.numel() > 1:
                # The model produced a single prediction for the whole batch.
                pred = pred.expand(labels.numel())
            if pred.numel() != labels.numel():
                raise ValueError(
                    f"got {pred.numel()} predictions for {labels.numel()} labels")

            # Labels are not padded: filler labels would never match a prediction
            # and would be counted as an extra class by macro-F1.
            all_labels.extend(labels.tolist())
            all_preds.extend(pred.tolist())

    if not all_labels:
        return 0.0, 0.0

    correct = sum(int(p == t) for p, t in zip(all_preds, all_labels))
    accuracy = correct / len(all_labels)
    # One macro-F1 over the whole set, rather than a sum of per-batch scores divided
    # by the number of samples.
    f1_result = f1_score(all_labels, all_preds, average='macro')

    return accuracy, f1_result

In [None]:
# Build graph data describing the relations between the sentences of each dialogue row.
# NOTE(review): `Data` and `sp` are used below but do not appear in the import cell visible
# at the top of this file — presumably torch_geometric.data.Data and scipy.sparse; confirm.
data_list = []

for i, row in df.iterrows():
    sentences = [row['사람문장1'], row['시스템문장1'], row['사람문장2'], row['시스템문장2'], row['사람문장3'], row['시스템문장3']]
    sentences = [sentence for sentence in sentences if pd.notnull(sentence)]  # Remove NaN sentences
    # NOTE(review): `sentences` is a list at this point — confirm that `preprocessing`
    # accepts a list rather than a single string.
    sentences = preprocessing(sentences)

    # Extract sentence features
    # NOTE(review): three per-sentence sequences are unpacked here, whereas the
    # extract_sentence_features defined earlier in this file takes one sentence and
    # returns five values — confirm which version this cell was written against.
    sentence_lengths, pos_counts, tfidf_features = extract_sentence_features(sentences)

    # Edges linking consecutive sentences
    edges = create_edge_index(sentences)

    # Graph whose nodes are the sentences
    graph = nx.Graph()
    
    # Attach the sentence features to the graph nodes
    for j, sentence in enumerate(sentences):
        node = {
            'sentence': sentence,
            'sentence_length': sentence_lengths[j],
            'pos_count': pos_counts[j],
            'tfidf_features': tfidf_features[j]
        }
        graph.add_node(j, **node )

    graph.add_edges_from(edges)
    
    adj_matrix = nx.adjacency_matrix(graph)  # adjacency matrix of the graph
    adj_matrix = adj_matrix.todense()  # convert to a dense matrix
    adj_matrix = torch.FloatTensor(adj_matrix)
    
    # Emotion and situation labels of this row
    y_emotion = label_mapping_emotion[row['감정_대분류']]
    y_situation = label_mapping_situation[row['상황키워드']]
    
    # Store the adjacency matrix and both labels on the data object.
    # Only the adjacency matrix and the labels are stored; the node attributes set on
    # `graph` above are not copied onto `data`.
    data = Data(adj=adj_matrix)  # create the data object
    data.adj = sp.coo_matrix(adj_matrix)  # replace adj with a SciPy sparse (COO) matrix
    data.y_emotion = torch.tensor(y_emotion)
    data.y_situation = torch.tensor(y_situation)
    data_list.append(data)


In [None]:
# Build graph data describing the relations between the sentences of each dialogue row.
# This cell repeats the code of the preceding cell and rebuilds data_list from scratch.
# NOTE(review): `Data` and `sp` are used below but do not appear in the import cell visible
# at the top of this file — presumably torch_geometric.data.Data and scipy.sparse; confirm.
data_list = []

for i, row in df.iterrows():
    sentences = [row['사람문장1'], row['시스템문장1'], row['사람문장2'], row['시스템문장2'], row['사람문장3'], row['시스템문장3']]
    sentences = [sentence for sentence in sentences if pd.notnull(sentence)]  # Remove NaN sentences
    # NOTE(review): `sentences` is a list at this point — confirm that `preprocessing`
    # accepts a list rather than a single string.
    sentences = preprocessing(sentences)

    # Extract sentence features
    # NOTE(review): three per-sentence sequences are unpacked here, whereas the
    # extract_sentence_features defined earlier in this file takes one sentence and
    # returns five values — confirm which version this cell was written against.
    sentence_lengths, pos_counts, tfidf_features = extract_sentence_features(sentences)

    # Edges linking consecutive sentences
    edges = create_edge_index(sentences)

    # Graph whose nodes are the sentences
    graph = nx.Graph()
    
    # Attach the sentence features to the graph nodes
    for j, sentence in enumerate(sentences):
        node = {
            'sentence': sentence,
            'sentence_length': sentence_lengths[j],
            'pos_count': pos_counts[j],
            'tfidf_features': tfidf_features[j]
        }
        graph.add_node(j, **node )

    graph.add_edges_from(edges)
    
    adj_matrix = nx.adjacency_matrix(graph)  # adjacency matrix of the graph
    adj_matrix = adj_matrix.todense()  # convert to a dense matrix
    adj_matrix = torch.FloatTensor(adj_matrix)
    
    # Emotion and situation labels of this row
    y_emotion = label_mapping_emotion[row['감정_대분류']]
    y_situation = label_mapping_situation[row['상황키워드']]
    
    # Store the adjacency matrix and both labels on the data object.
    # Only the adjacency matrix and the labels are stored; the node attributes set on
    # `graph` above are not copied onto `data`.
    data = Data(adj=adj_matrix)  # create the data object
    data.adj = sp.coo_matrix(adj_matrix)  # replace adj with a SciPy sparse (COO) matrix
    data.y_emotion = torch.tensor(y_emotion)
    data.y_situation = torch.tensor(y_situation)
    data_list.append(data)


In [None]:
# For every stored sample, rebuild a NetworkX graph from its adjacency matrix and print
# the attribute dictionary of each node.
for graph_number, data in enumerate(data_list, start=1):
    print(f"Graph {graph_number}:")
    # Sparse adjacency -> dense numpy array -> NetworkX graph
    graph = nx.from_numpy_array(data.adj.toarray())

    for node, node_attrs in graph.nodes(data=True):
        print(f"Node {node}: {node_attrs}")
    print()

In [None]:
def visualize_graphs(data_list, num_graphs_to_visualize=5):
    """Plot the leading samples of ``data_list`` as labelled graphs.

    Each sample's graph is rebuilt from ``data.adj`` (its shape gives the node
    count, its ``row``/``col`` arrays give the edges). Every node label lists the
    node id, any of the optional node attributes that are present, and the
    sample's ``y_emotion`` / ``y_situation``. The loop stops once
    ``num_graphs_to_visualize`` graphs have been shown; the check happens after
    drawing, so at least one graph is shown for a non-empty ``data_list``.
    """
    for shown, data in enumerate(data_list, start=1):
        # Rebuild the graph structure
        graph = nx.Graph()
        graph.add_nodes_from(range(data.adj.shape[0]))
        graph.add_edges_from(zip(data.adj.row, data.adj.col))

        # Draw the bare graph
        plt.figure(figsize=(4, 3))
        pos = nx.spring_layout(graph)
        nx.draw(graph, pos, with_labels=False, node_size=500, font_size=10)

        # Assemble one multi-line label per node
        node_labels = {}
        for node, attrs in graph.nodes(data=True):
            parts = [f"Node {node}\n"]
            if 'sentence_length' in attrs:
                parts.append(f"Sentence Length: {attrs['sentence_length']}\n")
            if 'pos_count' in attrs:
                parts.append(f"POS Count: {attrs['pos_count']}\n")
            if 'tfidf_features' in attrs:
                parts.append(f"TF-IDF Features: {attrs['tfidf_features']}\n")
            parts.append(f"y_emotion: {data.y_emotion}\n")
            parts.append(f"y_situation: {data.y_situation}")
            node_labels[node] = ''.join(parts)

        nx.draw_networkx_labels(graph, pos, labels=node_labels, font_size=8, verticalalignment='center')

        plt.show()

        if shown >= num_graphs_to_visualize:
            break


In [None]:

def visualize_graphs2(data_list, num_graphs_to_visualize=5):
    """Plot up to ``num_graphs_to_visualize`` samples with per-node annotations.

    Each graph is rebuilt from the sample's adjacency matrix (``data.adj``). A graph
    rebuilt this way carries no node attributes, so the annotation falls back to
    ``'n/a'`` for any of ``sentence_length`` / ``pos_count`` / ``tfidf_features``
    that is absent, and node positions come from a computed layout instead of a
    stored ``'pos'`` attribute.

    Args:
        data_list: sequence of samples exposing ``adj`` (with ``toarray()``),
            ``y_emotion`` and ``y_situation`` (each with ``item()``).
        num_graphs_to_visualize: maximum number of graphs to draw.
    """
    for i, data in enumerate(data_list):
        if i >= num_graphs_to_visualize:
            break

        adj_matrix = data.adj.toarray()  # Convert adjacency matrix to a dense matrix
        graph = nx.from_numpy_array(adj_matrix)  # Create a NetworkX graph from the adjacency matrix

        # One layout shared by the drawing and the annotations so they line up.
        pos = nx.spring_layout(graph)

        # Retrieve emotion and situation labels
        y_emotion = data.y_emotion.item()
        y_situation = data.y_situation.item()

        # Visualize the graph
        plt.figure(figsize=(8, 6))
        plt.title(f"Graph {i+1} (Emotion: {y_emotion}, Situation: {y_situation})")
        nx.draw(graph, pos, with_labels=True, node_size=500, node_color='lightblue', font_size=10)

        # Add node attributes as labels, placed slightly below each node
        for node, (x, y) in pos.items():
            attrs = graph.nodes[node]
            sentence_length = attrs.get('sentence_length', 'n/a')
            pos_count = attrs.get('pos_count', 'n/a')
            tfidf_features = attrs.get('tfidf_features', 'n/a')
            label = f"Length: {sentence_length}\nPOS Count: {pos_count}\nTF-IDF: {tfidf_features}"
            plt.annotate(label, xy=(x, y), xytext=(x, y - 0.1),
                         textcoords='data', fontsize=8, ha='center', va='center')

        plt.axis('off')
        plt.show()
        
        
# Visualize part of the graph data (here: at most 3 graphs)
visualize_graphs2(data_list, num_graphs_to_visualize=3)

In [None]:
# 80/20 train/test split of data_list; both splits are wrapped in shuffling DataLoaders,
# so after this cell data_train / data_test are DataLoader objects rather than lists.
batch_size = 32
data_train, data_test = (
    DataLoader(split, batch_size=batch_size, shuffle=True)
    for split in train_test_split(data_list, test_size=0.2, random_state=42)
)

In [None]:
# Print len() of the train loader, then of the test loader.
for loader in (data_train, data_test):
    print(len(loader))

In [None]:
class GNNModel(nn.Module):
    """Three stacked GCNConv layers followed by separate emotion / situation heads.

    ``output_dim`` is a mapping with the keys ``'emotion'`` and ``'situation'``
    giving the number of outputs of each head.

    NOTE(review): ``GCNConv`` does not appear in the import cell visible at the top
    of this file — presumably torch_geometric.nn.GCNConv; confirm it is imported
    before this cell runs.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.conv3 = GCNConv(hidden_dim, hidden_dim)
        self.fc_emotion = nn.Linear(hidden_dim, output_dim['emotion'])
        self.fc_situation = nn.Linear(hidden_dim, output_dim['situation'])

    def forward(self, x, edge_index):
        """Apply conv+ReLU three times, average over dim 0, and return both head outputs."""
        hidden = x
        for conv in (self.conv1, self.conv2, self.conv3):
            hidden = F.relu(conv(hidden, edge_index))
        # Collapse the first dimension into a single summary vector.
        pooled = hidden.mean(dim=0)
        return self.fc_emotion(pooled), self.fc_situation(pooled)

In [None]:
# class GNNModel(nn.Module):
#     def __init__(self, input_dim, hidden_dim, output_dim):
#         super(GNNModel, self).__init__()
#         self.conv1 = SAGEConv(input_dim, hidden_dim)
#         self.conv2 = SAGEConv(hidden_dim, hidden_dim)
#         self.fc_emotion = nn.Linear(hidden_dim, output_dim['emotion'])
#         self.fc_situation = nn.Linear(hidden_dim, output_dim['situation'])

#     def forward(self, x, edge_index):
#         x = self.conv1(x, edge_index)
#         x = F.relu(x)
#         x = self.conv2(x, edge_index)
#         x = F.relu(x)
#         x = x.mean(dim=0)  # 그래프의 특성을 하나의 벡터로 요약
#         emotion_out = self.fc_emotion(x)
#         situation_out = self.fc_situation(x)

#         return emotion_out, situation_out


In [None]:
def train(model, optimizer, data_loader, task):
    """Run one training epoch for a single task head and return the averaged loss.

    Args:
        model: module whose call ``model(x, edge_index)`` returns the pair
            ``(emotion_output, situation_output)``.
        optimizer: optimizer stepping the model's parameters.
        data_loader: iterable of batches exposing ``x``, ``edge_index``,
            ``y_emotion`` and ``y_situation``; must also provide ``.dataset``
            supporting ``len()``.
        task: ``'emotion'`` or ``'situation'`` — selects the labels and the head.

    Returns:
        Sum over batches of ``loss * x.size(0)``, divided by ``len(data_loader.dataset)``.

    Raises:
        ValueError: if ``task`` is not one of the two supported names (previously
            this surfaced later as an unbound local variable).
    """
    if task not in ('emotion', 'situation'):
        raise ValueError(f"task must be 'emotion' or 'situation', got {task!r}")
    label_attr = f'y_{task}'
    head_index = 0 if task == 'emotion' else 1

    model.train()  # Set the model to train mode
    total_loss = 0.0

    for data in data_loader:
        optimizer.zero_grad()  # Reset gradients from the previous step

        x, edge_index = data.x, data.edge_index
        labels = getattr(data, label_attr)

        # The model returns one output per task; keep the one being trained.
        out = model(x, edge_index)[head_index]

        loss = F.cross_entropy(out, labels)

        # Backpropagation and weight update
        loss.backward()
        optimizer.step()
        # NOTE(review): the loss is weighted by x.size(0) but normalized by the dataset
        # length below — confirm both count the same unit.
        total_loss += loss.item() * x.size(0)

    return total_loss / len(data_loader.dataset)

In [None]:
def evaluate(model, data_loader, task):
    """Evaluate one task head and return ``(accuracy, macro_f1)``.

    Args:
        model: module whose call ``model(x, edge_index)`` returns the pair
            ``(emotion_output, situation_output)``.
        data_loader: iterable of batches exposing ``x``, ``edge_index``,
            ``y_emotion`` and ``y_situation``.
        task: ``'emotion'`` or ``'situation'`` — selects the labels and the head.

    Returns:
        ``(accuracy, f1_result)`` computed once over all collected labels and
        predictions; ``(0.0, 0.0)`` when the loader yields no labels.

    Raises:
        ValueError: for an unknown ``task``, or when the number of predictions of
            a batch cannot be matched to its number of labels.
    """
    if task not in ('emotion', 'situation'):
        raise ValueError(f"task must be 'emotion' or 'situation', got {task!r}")
    label_attr = f'y_{task}'
    head_index = 0 if task == 'emotion' else 1

    model.eval()  # evaluation mode
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for data in data_loader:
            labels = getattr(data, label_attr).reshape(-1)

            out = model(data.x, data.edge_index)[head_index]

            # Arg-max over the class (last) axis. Taking the maximum over a
            # singleton axis would always yield index 0.
            pred = out.argmax(dim=-1).reshape(-1)
            if pred.numel() == 1 and labels.numel() > 1:
                # The model produced a single prediction for the whole batch.
                pred = pred.expand(labels.numel())
            if pred.numel() != labels.numel():
                raise ValueError(
                    f"got {pred.numel()} predictions for {labels.numel()} labels")

            # Labels are not padded: filler labels would never match a prediction
            # and would be counted as an extra class by macro-F1.
            all_labels.extend(labels.tolist())
            all_preds.extend(pred.tolist())

    if not all_labels:
        return 0.0, 0.0

    correct = sum(int(p == t) for p, t in zip(all_preds, all_labels))
    accuracy = correct / len(all_labels)
    # One macro-F1 over the whole set, rather than a sum of per-batch scores divided
    # by the number of samples.
    f1_result = f1_score(all_labels, all_preds, average='macro')

    return accuracy, f1_result

In [None]:
# Data loader setup: one loader per task and split, each with its own batch size.
emotion_batch_size = 6
situation_batch_size = 12

# An earlier cell rebinds data_train / data_test to DataLoader objects. A DataLoader is
# not indexable, so it cannot serve as the dataset of another DataLoader; unwrap it to
# the underlying sample collection first (plain lists are used as they are).
train_samples = data_train.dataset if isinstance(data_train, DataLoader) else data_train
test_samples = data_test.dataset if isinstance(data_test, DataLoader) else data_test

emotion_train_dataset = train_samples
situation_train_dataset = train_samples
emotion_test_dataset = test_samples
situation_test_dataset = test_samples

# Training loaders shuffle; test loaders keep the original order.
emotion_train_loader = DataLoader(emotion_train_dataset, batch_size=emotion_batch_size, shuffle=True)
situation_train_loader = DataLoader(situation_train_dataset, batch_size=situation_batch_size, shuffle=True)
emotion_test_loader = DataLoader(emotion_test_dataset, batch_size=emotion_batch_size, shuffle=False)
situation_test_loader = DataLoader(situation_test_dataset, batch_size=situation_batch_size, shuffle=False)

In [None]:
# Grid search over learning rate and weight decay: a fresh GNNModel is trained for every
# combination, alternating one emotion epoch and one situation epoch on the same model.
num_epochs = 1

# Model dimensions
input_dim = 102  # dimension of the input features
hidden_dim = 32  # dimension of the hidden state
output_dim = {'emotion': 6, 'situation': 12}  # output sizes (emotion: 6 classes, situation: 12 classes)

learning_rates = [0.001, 0.01, 0.1]
weight_decays = [1e-4, 5e-4, 1e-3]

# Best configuration seen so far (selected by emotion accuracy only, see below).
best_accuracy = 0.0
best_f1 = 0.0
best_learning_rate = None
best_weight_decay = None

for lr in learning_rates:
    for wd in weight_decays:
        # Fresh model for this configuration
        model = GNNModel(input_dim, hidden_dim, output_dim)

        # Optimizer for this configuration
        optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

        # Training
        for epoch in range(num_epochs):
            # Train on the emotion task
            model.train()
            emotion_train_loss = train(model, optimizer, emotion_train_loader, task='emotion')

            # Evaluate the emotion task (on the test loader)
            emotion_acc, emotion_f1 = evaluate(model, emotion_test_loader, task='emotion')

            # Train on the situation task
            model.train()
            situation_train_loss = train(model, optimizer, situation_train_loader, task='situation')

            # Evaluate the situation task (on the test loader)
            situation_acc, situation_f1 = evaluate(model, situation_test_loader, task='situation')

            # Report this epoch
            print(f"Epoch [{epoch+1}/{num_epochs}]")
            print(f"Emotion - Train Loss: {emotion_train_loss:.4f}, Acc: {emotion_acc:.4f}, F1: {emotion_f1:.4f}")
            print(f"Situation - Train Loss: {situation_train_loss:.4f}, Acc: {situation_acc:.4f}, F1: {situation_f1:.4f}")
            print("--------------------------------------------------")

        # Record the configuration if it is the best so far. The comparison uses the
        # emotion accuracy of the last epoch, measured on the test loader; the
        # situation metrics do not take part in the selection.
        if emotion_acc > best_accuracy:
            best_accuracy = emotion_acc
            best_f1 = emotion_f1
            best_learning_rate = lr
            best_weight_decay = wd

print("Grid Search Results:")
print(f"Best Learning Rate: {best_learning_rate}")
print(f"Best Weight Decay: {best_weight_decay}")
print(f"Best Emotion Accuracy: {best_accuracy}")
print(f"Best Emotion F1 Score: {best_f1}")