### 주의 : 본 코드는 책에 대한 학습 및 교육외에 배포를 금지합니다.
### Warning: This code is prohibited from distribution except for learning and educational purposes related to the book.
4장 딥러닝을 활용한 화학특성분석
- by Keunhong Jeong

![image.png](attachment:image.png)

In [None]:
import requests

url = 'https://raw.githubusercontent.com/doas1min/CAIP/main/data/Lipophilicity_G2.csv'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx so an HTML error page is never saved as the CSV.
response.raise_for_status()

# Save the raw download to disk.
with open('Lipophilicity_test.csv', 'wb') as f:
    f.write(response.content)

import pandas as pd

# Read the downloaded CSV back in.
data = pd.read_csv('Lipophilicity_test.csv')

# Move 'smiles' and 'logD' into the index and rename the index levels so they
# are written out as the leading 'SMILES' and 'Lipophilicity' columns.
data.set_index(['smiles', 'logD'], inplace=True)
data.index.names = ['SMILES', 'Lipophilicity']

# Write the renamed table that every later cell loads.
data.to_csv('Lipophilicity.csv')

In [None]:
!pip install rdkit==2023.03.01

In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset.
data = pd.read_csv("Lipophilicity.csv")

# Parse every SMILES string once and reuse the molecule for all descriptors
# (the parse is by far the most expensive step and was repeated per column).
mols = [Chem.MolFromSmiles(smiles) for smiles in data['SMILES']]

# MolFromSmiles returns None for unparsable input; report that clearly
# instead of failing later inside a descriptor call.
invalid = [smiles for smiles, mol in zip(data['SMILES'], mols) if mol is None]
if invalid:
    raise ValueError(f"Could not parse {len(invalid)} SMILES string(s), e.g. {invalid[0]!r}")

# Descriptor extraction.
data['TPSA'] = [Descriptors.TPSA(mol) for mol in mols]
data['LogP'] = [Descriptors.MolLogP(mol) for mol in mols]
data['MolecularWeight'] = [Descriptors.MolWt(mol) for mol in mols]
data['NumRotatableBonds'] = [Descriptors.NumRotatableBonds(mol) for mol in mols]

# Summary statistics of the table.
print(data.describe())

# Histogram of each descriptor.
data[['TPSA', 'LogP', 'MolecularWeight', 'NumRotatableBonds']].hist(bins=30, figsize=(15, 10), layout=(2, 2));

# Pairwise relationships between the descriptors and the target.
sns.pairplot(data[['TPSA', 'LogP', 'MolecularWeight', 'NumRotatableBonds', 'Lipophilicity']])

# Separate input features and target.
X = data[['TPSA', 'LogP', 'MolecularWeight', 'NumRotatableBonds']]
y = data['Lipophilicity']

# Split BEFORE scaling so that no test-set statistics leak into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise: the mean/std are learned from the training rows only and the
# same transformation is then applied to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the MLP regressor.
model = MLPRegressor(hidden_layer_sizes=(64, 64), max_iter=500, random_state=42)
model.fit(X_train, y_train)

# Predict on the test split.
y_pred = model.predict(X_test)

# Report test-set performance.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error :", mse)
print("R^2 :", r2)

# Scatter plot of predicted vs. true values.
plt.figure(figsize=(8, 8))
plt.scatter(y_test, y_pred, c='crimson')
# Axes stay linear: Lipophilicity (logD) is a logarithmic quantity that can be
# zero or negative, and log-scaled axes silently drop every non-positive point.

# Identity line spanning the joint range of true and predicted values.
p1 = max(np.max(y_pred), np.max(y_test))
p2 = min(np.min(y_pred), np.min(y_test))
plt.plot([p1, p2], [p1, p2], 'b-')
plt.xlabel('True Values', fontsize=15)
plt.ylabel('Predictions', fontsize=15)
plt.axis('equal')
plt.show()


In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset.
data = pd.read_csv("Lipophilicity.csv")

def get_morgan_fingerprint(smiles, radius=2, nBits=1024):
    """Return the Morgan fingerprint bit vector of a molecule.

    Args:
        smiles: SMILES string of the molecule.
        radius: Morgan radius passed to RDKit.
        nBits: Length of the returned bit vector.

    Raises:
        ValueError: If RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None; surface that
    # with the offending string instead of an opaque RDKit argument error.
    if mol is None:
        raise ValueError(f"Invalid SMILES string: {smiles!r}")
    return rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius, nBits)

# Fingerprint extraction.
data['MorganFingerprint'] = data['SMILES'].apply(get_morgan_fingerprint)

# Separate input features and target.
X = np.array(list(data['MorganFingerprint']))
y = data['Lipophilicity']

# Split BEFORE scaling so that no test-set statistics leak into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise using statistics learned from the training rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the MLP regressor.
model = MLPRegressor(hidden_layer_sizes=(64, 64), max_iter=500, random_state=42)
model.fit(X_train, y_train)

# Predict on the test split.
y_pred = model.predict(X_test)

# Report test-set performance.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error :", mse)
print("R^2 :", r2)

# Scatter plot of predicted vs. true values.
plt.figure(figsize=(8, 8))
plt.scatter(y_test, y_pred, c='crimson')
# Axes stay linear: Lipophilicity (logD) is a logarithmic quantity that can be
# zero or negative, and log-scaled axes silently drop every non-positive point.

# Identity line spanning the joint range of true and predicted values.
p1 = max(np.max(y_pred), np.max(y_test))
p2 = min(np.min(y_pred), np.min(y_test))
plt.plot([p1, p2], [p1, p2], 'b-')
plt.xlabel('True Values', fontsize=15)
plt.ylabel('Predictions', fontsize=15)
plt.axis('equal')
plt.show()


In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors, Descriptors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset.
data = pd.read_csv("Lipophilicity.csv")

def get_morgan_fingerprint(smiles, radius=2, nBits=1024):
    """Return the Morgan fingerprint bit vector of a molecule.

    Args:
        smiles: SMILES string of the molecule.
        radius: Morgan radius passed to RDKit.
        nBits: Length of the returned bit vector.

    Raises:
        ValueError: If RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None.
    if mol is None:
        raise ValueError(f"Invalid SMILES string: {smiles!r}")
    return rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius, nBits)

# Parse every SMILES string once and reuse the molecule for all four
# descriptors instead of re-parsing it for each column.
mols = [Chem.MolFromSmiles(smiles) for smiles in data['SMILES']]
invalid = [smiles for smiles, mol in zip(data['SMILES'], mols) if mol is None]
if invalid:
    raise ValueError(f"Could not parse {len(invalid)} SMILES string(s), e.g. {invalid[0]!r}")

# Whole-molecule descriptors.
data['TPSA'] = [Descriptors.TPSA(mol) for mol in mols]
data['LogP'] = [Descriptors.MolLogP(mol) for mol in mols]
data['MolecularWeight'] = [Descriptors.MolWt(mol) for mol in mols]
data['NumRotatableBonds'] = [Descriptors.NumRotatableBonds(mol) for mol in mols]

# Morgan fingerprint extraction.
data['MorganFingerprint'] = data['SMILES'].apply(get_morgan_fingerprint)

# Convert each fingerprint bit vector to a numpy array.
data['MorganFingerprint'] = data['MorganFingerprint'].apply(lambda x: np.array(x))

# Expand the arrays into one column per bit; 1024 matches the default nBits
# of get_morgan_fingerprint used above.
morgan_df = pd.DataFrame(data['MorganFingerprint'].to_list(), columns=[f'bit{i}' for i in range(1024)])

# Feature matrix: the four descriptors followed by the fingerprint bits.
X = pd.concat([data[['TPSA', 'LogP', 'MolecularWeight', 'NumRotatableBonds']], morgan_df], axis=1)

# Target variable.
y = data['Lipophilicity']

# Split BEFORE scaling so that no test-set statistics leak into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise using statistics learned from the training rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the MLP regressor.
model = MLPRegressor(hidden_layer_sizes=(64, 64), max_iter=500, random_state=42)
model.fit(X_train, y_train)

# Predict on the test split.
y_pred = model.predict(X_test)

# Report test-set performance.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error :", mse)
print("R^2 :", r2)

# Scatter plot of predicted vs. true values.
plt.figure(figsize=(8, 8))
plt.scatter(y_test, y_pred, c='crimson')
# Axes stay linear: Lipophilicity (logD) is a logarithmic quantity that can be
# zero or negative, and log-scaled axes silently drop every non-positive point.

# Identity line spanning the joint range of true and predicted values.
p1 = max(np.max(y_pred), np.max(y_test))
p2 = min(np.min(y_pred), np.min(y_test))
plt.plot([p1, p2], [p1, p2], 'b-')
plt.xlabel('True Values', fontsize=15)
plt.ylabel('Predictions', fontsize=15)
plt.axis('equal')
plt.show()


In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors, Descriptors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Model
from keras.layers import Dense, Input, Concatenate, Conv1D, GlobalAveragePooling1D
from keras.utils import plot_model
import matplotlib.pyplot as plt
import tensorflow as tf

# Load the dataset.
data = pd.read_csv("Lipophilicity.csv")

def get_morgan_fingerprint(smiles, radius=2, nBits=1024):
    """Return the Morgan fingerprint bit vector of a molecule.

    Args:
        smiles: SMILES string of the molecule.
        radius: Morgan radius passed to RDKit.
        nBits: Length of the returned bit vector.

    Raises:
        ValueError: If RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None.
    if mol is None:
        raise ValueError(f"Invalid SMILES string: {smiles!r}")
    return rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius, nBits)

# Parse every SMILES string once and reuse the molecule for all four
# descriptors instead of re-parsing it for each column.
mols = [Chem.MolFromSmiles(smiles) for smiles in data['SMILES']]
invalid = [smiles for smiles, mol in zip(data['SMILES'], mols) if mol is None]
if invalid:
    raise ValueError(f"Could not parse {len(invalid)} SMILES string(s), e.g. {invalid[0]!r}")

# Whole-molecule descriptors.
data['TPSA'] = [Descriptors.TPSA(mol) for mol in mols]
data['LogP'] = [Descriptors.MolLogP(mol) for mol in mols]
data['MolecularWeight'] = [Descriptors.MolWt(mol) for mol in mols]
data['NumRotatableBonds'] = [Descriptors.NumRotatableBonds(mol) for mol in mols]

# Morgan fingerprint extraction.
data['MorganFingerprint'] = data['SMILES'].apply(get_morgan_fingerprint)

# Separate the two input views and the target.
X_global = data[['TPSA', 'LogP', 'MolecularWeight', 'NumRotatableBonds']].values
X_fingerprint = np.array(list(map(list, data['MorganFingerprint'].values)))
y = data['Lipophilicity'].values

# Split BEFORE scaling so that no test-set statistics leak into training.
(X_global_train, X_global_test,
 X_fingerprint_train, X_fingerprint_test,
 y_train, y_test) = train_test_split(X_global, X_fingerprint, y, test_size=0.2, random_state=42)

# Standardise each view with statistics learned from the training rows only.
scaler_global = StandardScaler()
X_global_train = scaler_global.fit_transform(X_global_train)
X_global_test = scaler_global.transform(X_global_test)

scaler_fingerprint = StandardScaler()
X_fingerprint_train = scaler_fingerprint.fit_transform(X_fingerprint_train)
X_fingerprint_test = scaler_fingerprint.transform(X_fingerprint_test)

# Add a trailing axis so the fingerprint can be fed to Conv1D.
X_fingerprint_train = tf.expand_dims(X_fingerprint_train, axis=2)
X_fingerprint_test = tf.expand_dims(X_fingerprint_test, axis=2)

# Build the model: one input for the whole-molecule descriptors and one for
# the fingerprint, which carries a trailing axis of size 1.
input_global = Input(shape=(X_global.shape[1], ), name='global_input')
input_fingerprint = Input(shape=(X_fingerprint_train.shape[1], 1), name='fingerprint_input')

# Fingerprint branch: a 1-D convolution (16 filters, kernel size 3) followed
# by global average pooling.
conv1 = Conv1D(16, 3, activation='relu')(input_fingerprint)
gap1 = GlobalAveragePooling1D()(conv1)

# Merge the raw descriptor input with the pooled fingerprint features.
concat = Concatenate()([input_global, gap1])

# Regression head: one hidden layer and a single linear output.
dense1 = Dense(64, activation='relu')(concat)
output = Dense(1)(dense1)

model = Model(inputs=[input_global, input_fingerprint], outputs=output)

# Compile the model with the Adam optimizer and an MSE loss.
model.compile(optimizer='adam', loss='mean_squared_error')

# Write a diagram of the model architecture to 'model_plot.png'.
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

# Train the model. The test split is passed as validation data, so the
# recorded 'val_loss' is the test-set loss after each epoch.
history = model.fit([X_global_train, X_fingerprint_train], y_train, validation_data=([X_global_test, X_fingerprint_test], y_test), epochs=50, batch_size=32)

from sklearn.metrics import r2_score

# Plot the training and validation loss recorded during fitting.
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], 'b-', label='loss')
plt.plot(history.history['val_loss'], 'r--', label='val_loss')
plt.xlabel('Epoch')
plt.legend()

plt.show()

# Evaluate on the test split a single time and keep the returned loss.
loss = model.evaluate([X_global_test, X_fingerprint_test], y_test)
print("Test Loss:", loss)

# Predict on the test split.
y_pred = model.predict([X_global_test, X_fingerprint_test])

# Coefficient of determination on the test split.
r2 = r2_score(y_test, y_pred)
print("R^2 Score:", r2)

# Scatter plot of predicted vs. true values.
plt.figure(figsize=(5,5))
plt.scatter(y_test, y_pred, c='crimson')
# Axes stay linear: Lipophilicity (logD) is a logarithmic quantity that can be
# zero or negative, and log-scaled axes silently drop every non-positive point.

# Identity line spanning the joint range of true and predicted values;
# np.max/np.min reduce over the whole array whatever its shape.
p1 = max(np.max(y_pred), np.max(y_test))
p2 = min(np.min(y_pred), np.min(y_test))
plt.plot([p1, p2], [p1, p2], 'b-')
plt.xlabel('True Values', fontsize=15)
plt.ylabel('Predictions', fontsize=15)
plt.axis('equal')
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.layers import LSTM, Dropout, Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset.
data = pd.read_csv("Lipophilicity.csv")

# Collect the SMILES strings.
smiles_list = data['SMILES'].values.tolist()

# Character-level tokenisation and integer encoding.
# lower=False is essential: SMILES is case-sensitive (e.g. 'C' is an aliphatic
# and 'c' an aromatic carbon) and the Tokenizer lower-cases text by default,
# which would merge those distinct symbols into one token.
tokenizer = Tokenizer(char_level=True, lower=False)
tokenizer.fit_on_texts(smiles_list)
sequences = tokenizer.texts_to_sequences(smiles_list)

# Pad every sequence (at the end) to the length of the longest one.
max_length = max(len(seq) for seq in sequences)
X_smiles = pad_sequences(sequences, maxlen=max_length, padding='post')

# One-hot encode the integer tokens.
X_smiles = to_categorical(X_smiles)

# Lipophilicity is the regression target.
y = data['Lipophilicity'].values

# Train/test split.
X_train, X_test, y_train, y_test = train_test_split(X_smiles, y, test_size=0.2, random_state=42)

# Build the RNN: three stacked 64-unit LSTM layers. The first two return the
# full sequence so the next LSTM can consume it; the last one
# (return_sequences=False) returns a single vector for the regression head.
input_layer = Input(shape=(X_train.shape[1], X_train.shape[2]))
lstm_layer1 = LSTM(64, return_sequences=True, dropout=0.3, recurrent_dropout=0.2)(input_layer)
lstm_layer2 = LSTM(64, return_sequences=True, dropout=0.3, recurrent_dropout=0.2)(lstm_layer1)
lstm_layer3 = LSTM(64, return_sequences=False, dropout=0.3, recurrent_dropout=0.2)(lstm_layer2)
output_layer = Dense(1, activation='linear')(lstm_layer3)

# Assemble and compile the model with the Adam optimizer and an MSE loss.
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer=Adam(), loss='mean_squared_error')

# Train the model (no validation data is passed here).
model.fit(X_train, y_train, epochs=50, batch_size=32)

# Predict on the test split.
y_pred = model.predict(X_test)

# Evaluate the predictions (mean squared error and R^2 score).
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error :", mse)
print("R^2 :", r2)

# Scatter plot of predicted vs. true values.
plt.figure(figsize=(8, 8))
plt.scatter(y_test, y_pred, c='crimson')
# Axes stay linear: Lipophilicity (logD) is a logarithmic quantity that can be
# zero or negative, and log-scaled axes silently drop every non-positive point.

# Identity line spanning the joint range of true and predicted values;
# np.max/np.min reduce over the whole array whatever its shape.
p1 = max(np.max(y_pred), np.max(y_test))
p2 = min(np.min(y_pred), np.min(y_test))
plt.plot([p1, p2], [p1, p2], 'b-')
plt.xlabel('True Values', fontsize=15)
plt.ylabel('Predictions', fontsize=15)
plt.axis('equal')
plt.show()


In [None]:
!pip install torch_geometric

In [None]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.data import DataLoader
from torch.nn import Linear
from torch_geometric.utils import dense_to_sparse
from torch_geometric.data import Data
from rdkit import Chem
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Load the dataset.
data = pd.read_csv('Lipophilicity.csv')

# Convert SMILES strings to graphs with RDKit.
# Largest atom count in the dataset; every graph is padded up to this size.
max_atoms = max(Chem.MolFromSmiles(smiles).GetNumAtoms() for smiles in data['SMILES'])


def smiles_to_graph(smiles):
    """Convert a SMILES string into a padded (adjacency, features) pair.

    Returns:
        A tuple ``(adjacency, features)``: a ``max_atoms x max_atoms`` numpy
        array holding the molecule's adjacency matrix in its top-left corner
        (zeros elsewhere), and a list of atomic numbers padded with zeros to
        length ``max_atoms``.
    """
    mol = Chem.MolFromSmiles(smiles)
    atomic_numbers = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
    n_real = len(atomic_numbers)

    # Zero-padded adjacency: padded atoms have no bonds.
    # NOTE(review): padded rows still become nodes of the graph, so they take
    # part in any pooling applied downstream -- confirm this is intended.
    padded_adj = np.zeros((max_atoms, max_atoms))
    padded_adj[:n_real, :n_real] = Chem.rdmolops.GetAdjacencyMatrix(mol)

    # Zero-padded node features.
    padded_features = atomic_numbers + [0] * (max_atoms - n_real)

    return padded_adj, padded_features

graphs = [smiles_to_graph(smiles) for smiles in data['SMILES']]
# Adjacency matrices as integer tensors, node features as (max_atoms, 1)
# float tensors, and the targets as an (N, 1) float tensor.
A = [torch.tensor(adj, dtype=torch.long) for adj, _ in graphs]
X = [torch.tensor(features, dtype=torch.float).view(-1, 1) for _, features in graphs]
y = torch.tensor(data['Lipophilicity'].values, dtype=torch.float).view(-1, 1)

class GraphDataset(Dataset):
    """Dataset that serves one torch_geometric ``Data`` object per molecule.

    Attributes are stored as given: ``X`` (node features), ``A`` (dense
    adjacency matrices) and ``y`` (targets), all indexed by sample position.
    """

    def __init__(self, X, A, y):
        self.X, self.A, self.y = X, A, y

    def __len__(self):
        # The number of samples is the number of node-feature entries.
        return len(self.X)

    def __getitem__(self, idx):
        # dense_to_sparse returns (edge_index, edge_attr); only the edge list
        # is needed to build the graph.
        edge_index = dense_to_sparse(self.A[idx])[0]
        return Data(x=self.X[idx], edge_index=edge_index, y=self.y[idx])


# Split node features, adjacency matrices and targets into training and test
# sets (80/20, fixed seed so the split is reproducible).
X_train, X_test, A_train, A_test, y_train, y_test = train_test_split(X, A, y, test_size=0.2, random_state=42)
train_data = GraphDataset(X_train, A_train, y_train)
test_data = GraphDataset(X_test, A_test, y_test)

# DataLoader here is the torch_geometric one: it is imported after the
# torch.utils.data DataLoader in this cell and therefore shadows it.
# The test loader is not shuffled, so its batches follow the order of test_data.
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

# Graph neural network (GCN) model.
class GNN(torch.nn.Module):
    """Three GCN layers, mean pooling per graph and a linear regression head."""

    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(1, 64)
        self.conv2 = GCNConv(64, 64)
        self.conv3 = GCNConv(64, 64)
        self.fc = Linear(64, 1)

    def forward(self, data):
        """Return one prediction per graph in the batch as a 1-D tensor."""
        hidden = data.x
        # Each convolution is followed by a ReLU.
        for conv in (self.conv1, self.conv2, self.conv3):
            hidden = torch.relu(conv(hidden, data.edge_index))
        # Average the node embeddings of each graph, then regress.
        pooled = global_mean_pool(hidden, data.batch)
        return self.fc(pooled).view(-1)

# Model, loss function and optimizer.
model = GNN()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
print(model)

# Training. The loop variable is named `batch` so that it does not overwrite
# the `data` DataFrame loaded earlier in this cell.
for epoch in range(100):
    model.train()
    train_losses = []
    for batch in train_loader:
        optimizer.zero_grad()
        output = model(batch)
        loss = criterion(output, batch.y)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

    # Report the losses every 10 epochs.
    if epoch % 10 == 0:
        model.eval()
        test_losses = []
        # No gradients are needed for evaluation; skipping the autograd graph
        # saves memory and time.
        with torch.no_grad():
            for batch in test_loader:
                output = model(batch)
                loss = criterion(output, batch.y)
                test_losses.append(loss.item())

        print(f'Epoch {epoch}:')
        print('Train Loss:', np.mean(train_losses))
        print('Test Loss:', np.mean(test_losses))
        model.train()

# Evaluation: collect predictions and targets batch by batch.
model.eval()
y_pred = []
y_true = []
# Inference only, so no autograd graph is built.
with torch.no_grad():
    for batch in test_loader:
        output = model(batch)
        y_pred.append(output.numpy().flatten())
        y_true.append(batch.y.numpy().flatten())

y_pred = np.concatenate(y_pred)
y_true = np.concatenate(y_true)

print('MSE: ', mean_squared_error(y_true, y_pred))
print('R2 score: ', r2_score(y_true, y_pred))


# Scatter plot of predicted vs. true values.
plt.figure(figsize=(8, 8))
plt.scatter(y_true, y_pred, c='crimson')
# Axes stay linear: Lipophilicity (logD) is a logarithmic quantity that can be
# zero or negative, and log-scaled axes silently drop every non-positive point.

# Identity line spanning the joint range of true and predicted values.
p1 = max(np.max(y_pred), np.max(y_true))
p2 = min(np.min(y_pred), np.min(y_true))
plt.plot([p1, p2], [p1, p2], 'b-')
plt.xlabel('True Values', fontsize=15)
plt.ylabel('Predictions', fontsize=15)
plt.axis('equal')
plt.show()


In [None]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.data import DataLoader
from torch.nn import Linear
from torch_geometric.utils import dense_to_sparse
from torch_geometric.data import Data
from torch_geometric.nn import GATConv
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors, Descriptors
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Load the dataset.
data = pd.read_csv('Lipophilicity.csv')

# Convert SMILES strings to graphs with RDKit.
# Largest atom count in the dataset; every graph is padded up to this size.
max_atoms = max(Chem.MolFromSmiles(smiles).GetNumAtoms() for smiles in data['SMILES'])

def smiles_to_graph(smiles):
    """Convert a SMILES string into a padded (adjacency, features) pair.

    Returns:
        A tuple ``(adjacency, features)``: a ``max_atoms x max_atoms`` numpy
        array holding the molecule's adjacency matrix in its top-left corner
        (zeros elsewhere), and a list of atomic numbers padded with zeros to
        length ``max_atoms``.
    """
    mol = Chem.MolFromSmiles(smiles)
    atomic_numbers = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
    n_real = len(atomic_numbers)

    # Zero-padded adjacency: padded atoms have no bonds.
    # NOTE(review): padded rows still become nodes of the graph, so they take
    # part in any pooling applied downstream -- confirm this is intended.
    padded_adj = np.zeros((max_atoms, max_atoms))
    padded_adj[:n_real, :n_real] = Chem.rdmolops.GetAdjacencyMatrix(mol)

    # Zero-padded node features.
    padded_features = atomic_numbers + [0] * (max_atoms - n_real)

    return padded_adj, padded_features

graphs = [smiles_to_graph(smiles) for smiles in data['SMILES']]
# Adjacency matrices as integer tensors, node features as (max_atoms, 1)
# float tensors, and the targets as an (N, 1) float tensor.
A = [torch.tensor(adj, dtype=torch.long) for adj, _ in graphs]
X = [torch.tensor(features, dtype=torch.float).view(-1, 1) for _, features in graphs]
y = torch.tensor(data['Lipophilicity'].values, dtype=torch.float).view(-1, 1)

# Whole-molecule ("global") descriptors. Each SMILES string is parsed once
# and the molecule reused for all four descriptors.
mols = [Chem.MolFromSmiles(smiles) for smiles in data['SMILES']]
data['TPSA'] = [Descriptors.TPSA(mol) for mol in mols]
data['LogP'] = [Descriptors.MolLogP(mol) for mol in mols]
data['MolecularWeight'] = [Descriptors.MolWt(mol) for mol in mols]
data['NumRotatableBonds'] = [Descriptors.NumRotatableBonds(mol) for mol in mols]


# Convert the global descriptors to a float tensor with one row per molecule.
# NOTE(review): unlike the StandardScaler-based cells earlier in this
# notebook, these descriptors reach the network unscaled -- confirm intended.
global_features = torch.tensor(data[['TPSA', 'LogP', 'MolecularWeight','NumRotatableBonds']].values, dtype=torch.float)

class GraphDataset(Dataset):
    """Dataset that serves one torch_geometric ``Data`` object per molecule.

    Attributes are stored as given: ``X`` (node features), ``A`` (dense
    adjacency matrices), ``y`` (targets) and ``global_features``
    (whole-molecule descriptors), all indexed by sample position.
    """

    def __init__(self, X, A, y, global_features):
        self.X, self.A, self.y = X, A, y
        self.global_features = global_features

    def __len__(self):
        # The number of samples is the number of node-feature entries.
        return len(self.X)

    def __getitem__(self, idx):
        # dense_to_sparse returns (edge_index, edge_attr); only the edge list
        # is needed to build the graph.
        edge_index = dense_to_sparse(self.A[idx])[0]
        # A leading axis is added so each sample contributes one descriptor row.
        descriptors = self.global_features[idx].unsqueeze(0)
        return Data(x=self.X[idx], edge_index=edge_index, y=self.y[idx], global_features=descriptors)




# Split node features, adjacency matrices, targets and global descriptors
# into training and test sets (80/20, fixed seed so the split is reproducible).
X_train, X_test, A_train, A_test, y_train, y_test, global_train, global_test = train_test_split(X, A, y, global_features, test_size=0.2, random_state=42)
train_data = GraphDataset(X_train, A_train, y_train, global_train)
test_data = GraphDataset(X_test, A_test, y_test, global_test)

# DataLoader here is the torch_geometric one: it is imported after the
# torch.utils.data DataLoader in this cell and therefore shadows it.
# The test loader is not shuffled, so its batches follow the order of test_data.
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)


# Graph neural network (GAT) model.
class GNN(torch.nn.Module):
    """Three GAT layers with mean pooling, combined with global descriptors.

    The pooled graph embedding (64 values) is concatenated with a 64-value
    embedding of the four global descriptors before the linear output layer.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = GATConv(1, 64)
        self.conv2 = GATConv(64, 64)
        self.conv3 = GATConv(64, 64)
        self.global_feature_layer = Linear(4, 64)  # embeds the global descriptors
        self.fc = Linear(128, 1)

    def forward(self, data):
        """Return one prediction per graph in the batch as a 1-D tensor."""
        hidden = data.x
        # Each attention convolution is followed by a ReLU.
        for conv in (self.conv1, self.conv2, self.conv3):
            hidden = torch.relu(conv(hidden, data.edge_index))
        # Local branch: average the node embeddings of each graph.
        graph_embedding = global_mean_pool(hidden, data.batch)
        # Global branch: embed the whole-molecule descriptors.
        descriptor_embedding = torch.relu(self.global_feature_layer(data.global_features))

        # Join local and global information, then regress.
        combined = torch.cat([graph_embedding, descriptor_embedding], dim=1)
        return self.fc(combined).view(-1)

# Model, loss function and optimizer.
model = GNN()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
print(model)



# Training. The loop variable is named `batch` so that it does not overwrite
# the `data` DataFrame loaded earlier in this cell.
for epoch in range(30):
    model.train()
    train_losses = []
    for batch in train_loader:
        optimizer.zero_grad()
        output = model(batch)
        loss = criterion(output, batch.y)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

    # Report the losses every 10 epochs.
    if epoch % 10 == 0:
        model.eval()
        test_losses = []
        # No gradients are needed for evaluation; skipping the autograd graph
        # saves memory and time.
        with torch.no_grad():
            for batch in test_loader:
                output = model(batch)
                loss = criterion(output, batch.y)
                test_losses.append(loss.item())

        print(f'Epoch {epoch}:')
        print('Train Loss:', np.mean(train_losses))
        print('Test Loss:', np.mean(test_losses))
        model.train()

# Evaluation: collect predictions and targets batch by batch.
model.eval()
y_pred = []
y_true = []
# Inference only, so no autograd graph is built.
with torch.no_grad():
    for batch in test_loader:
        output = model(batch)
        y_pred.append(output.numpy().flatten())
        y_true.append(batch.y.numpy().flatten())

y_pred = np.concatenate(y_pred)
y_true = np.concatenate(y_true)

print('MSE: ', mean_squared_error(y_true, y_pred))
print('R2 score: ', r2_score(y_true, y_pred))


# Scatter plot of predicted vs. true values. y_true is the flat numpy array
# collected alongside y_pred, so both have the same type, shape and order
# (y_test is an (N, 1) torch tensor).
plt.figure(figsize=(8, 8))
plt.scatter(y_true, y_pred, c='crimson')
# Axes stay linear: Lipophilicity (logD) is a logarithmic quantity that can be
# zero or negative, and log-scaled axes silently drop every non-positive point.

# Identity line spanning the joint range of true and predicted values.
p1 = max(np.max(y_pred), np.max(y_true))
p2 = min(np.min(y_pred), np.min(y_true))
plt.plot([p1, p2], [p1, p2], 'b-')
plt.xlabel('True Values', fontsize=15)
plt.ylabel('Predictions', fontsize=15)
plt.axis('equal')
plt.show()
