### 주의 : 본 코드는 책에 대한 학습 및 교육외에 배포를 금지합니다.
### Warning: This code is prohibited from distribution except for learning and educational purposes related to the book.
4장 딥러닝을 활용한 화학특성분석
- by Keunhong Jeong

![image.png](attachment:image.png)

In [None]:
import requests

# Location of the raw lipophilicity CSV on GitHub.
url = 'https://raw.githubusercontent.com/doas1min/CAIP/main/data/Lipophilicity_G2.csv'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error (404, 5xx, ...) so that an error page is never
# silently written to disk and later parsed as if it were the dataset.
response.raise_for_status()

# Save the downloaded bytes to a local file.
with open('Lipophilicity_test.csv', 'wb') as f:
    f.write(response.content)

import pandas as pd

# Read the downloaded CSV file
data = pd.read_csv('Lipophilicity_test.csv')

# Move the 'smiles' and 'logD' columns into the index and rename the two
# index levels; this is how the columns get their new header names
data.set_index(['smiles', 'logD'], inplace=True)
data.index.names = ['SMILES', 'Lipophilicity']

# Save as a CSV file (the renamed index levels are written with the data)
data.to_csv('Lipophilicity.csv')

In [None]:
!pip install rdkit

In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
data = pd.read_csv("Lipophilicity.csv")

# SMILES -> Mol conversion (handle parse failures): report how many rows
# have a missing Mol, then drop those rows
data["Mol"] = data["SMILES"].apply(Chem.MolFromSmiles)
print("파싱 실패 SMILES 개수 :", data["Mol"].isna().sum())
data = data.dropna(subset=["Mol"]).reset_index(drop=True)

# Extract descriptors from each Mol
data['TPSA'] = data['Mol'].apply(Descriptors.TPSA)
data['LogP'] = data['Mol'].apply(Descriptors.MolLogP)
data['MolecularWeight'] = data['Mol'].apply(Descriptors.MolWt)
data['NumRotatableBonds'] = data['Mol'].apply(Descriptors.NumRotatableBonds)

# Inspect the data (summary statistics of descriptors and target)
print(data[['TPSA', 'LogP', 'MolecularWeight', 'NumRotatableBonds', 'Lipophilicity']].describe())

# Histograms of the descriptors
data[['TPSA', 'LogP', 'MolecularWeight', 'NumRotatableBonds']].hist(
    bins=30, figsize=(15, 10), layout=(2, 2)
)
plt.show()

# Correlation analysis (pairwise scatter plots of descriptors and target)
sns.pairplot(data[['TPSA', 'LogP', 'MolecularWeight', 'NumRotatableBonds', 'Lipophilicity']])
plt.show()

# Separate input features and target variable
X = data[['TPSA', 'LogP', 'MolecularWeight', 'NumRotatableBonds']]
y = data['Lipophilicity']

# Split into train and test sets first (80/20, fixed seed) ...
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ... then standardize: the scaler is fitted on the training split only and
# the same transformation is applied to the test split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the MLP model (two hidden layers of 64 units, early stopping on an
# internal 10% validation fraction)
model = MLPRegressor(
    hidden_layer_sizes=(64, 64),
    max_iter=2000,
    random_state=42,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=20
)
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Evaluate prediction performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error :", mse)
print("R^2 :", r2)

# Scatter plot of predicted vs. true values
plt.figure(figsize=(8, 8))
plt.scatter(y_test, y_pred, c='crimson', alpha=0.7)

# End points of the y = x reference line. NumPy reductions are used instead
# of the builtin max/min so the pandas Series / ndarray are not iterated
# element by element in Python, matching the later cells of this notebook.
p1 = max(np.max(y_pred), np.max(y_test))
p2 = min(np.min(y_pred), np.min(y_test))
plt.plot([p2, p1], [p2, p1], 'b-')

plt.xlabel('True Values', fontsize=15)
plt.ylabel('Predictions', fontsize=15)
plt.axis('equal')
plt.show()


In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
data = pd.read_csv("Lipophilicity.csv")

# SMILES -> Mol conversion (handle parse failures): report how many rows
# have a missing Mol, then drop those rows
data["Mol"] = data["SMILES"].apply(Chem.MolFromSmiles)
print("파싱 실패 SMILES 개수 :", data["Mol"].isna().sum())
data = data.dropna(subset=["Mol"]).reset_index(drop=True)

# Create the Morgan fingerprint generator (the approach recommended by RDKit)
morgan_gen = rdFingerprintGenerator.GetMorganGenerator(
    radius=2,
    fpSize=1024
)

# Morgan fingerprint helper
def get_morgan_fingerprint(mol):
    """Return the Morgan fingerprint of *mol* as a NumPy array.

    The fingerprint is produced by the module-level ``morgan_gen`` generator,
    so its radius and size are whatever that generator was configured with.
    """
    fingerprint = morgan_gen.GetFingerprint(mol)
    return np.array(fingerprint)

# Compute one Morgan fingerprint per molecule
data["MorganFingerprint"] = data["Mol"].apply(get_morgan_fingerprint)

# Separate input features and target variable
X = np.vstack(data["MorganFingerprint"].values)
y = data["Lipophilicity"].values

# Split into train and test sets BEFORE scaling. Fitting the scaler on the
# full matrix would let test-set statistics leak into the training features.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize using statistics from the training split only, then apply the
# same transformation to the test split (same procedure as the other cells).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the MLP model (two hidden layers of 64 units, with early stopping)
model = MLPRegressor(
    hidden_layer_sizes=(64, 64),
    max_iter=2000,
    random_state=42,
    early_stopping=True
)
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Evaluate prediction performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error :", mse)
print("R^2 :", r2)

# Scatter plot of predicted vs. true values
plt.figure(figsize=(8, 8))
plt.scatter(y_test, y_pred, c='crimson', alpha=0.7)

# End points of the y = x reference line. NumPy reductions are used instead
# of the builtin max/min so the arrays are not iterated element by element
# in Python, matching the later cells of this notebook.
p1 = max(np.max(y_pred), np.max(y_test))
p2 = min(np.min(y_pred), np.min(y_test))
plt.plot([p2, p1], [p2, p1], 'b-')

plt.xlabel('True Values', fontsize=15)
plt.ylabel('Predictions', fontsize=15)
plt.axis('equal')
plt.show()


In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, rdFingerprintGenerator
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
data = pd.read_csv("Lipophilicity.csv")

# SMILES -> Mol conversion (handle parse failures): report how many rows
# have a missing Mol, then drop those rows
data["Mol"] = data["SMILES"].apply(Chem.MolFromSmiles)
print("파싱 실패 SMILES 개수 :", data["Mol"].isna().sum())
data = data.dropna(subset=["Mol"]).reset_index(drop=True)

# Extract the same descriptors as before (computed from Mol)
data["TPSA"] = data["Mol"].apply(Descriptors.TPSA)
data["LogP"] = data["Mol"].apply(Descriptors.MolLogP)
data["MolecularWeight"] = data["Mol"].apply(Descriptors.MolWt)
data["NumRotatableBonds"] = data["Mol"].apply(Descriptors.NumRotatableBonds)

# Morgan fingerprint generator (recommended approach)
morgan_gen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=1024)

# Extract Morgan fingerprints (computed from Mol)
data["MorganFingerprint"] = data["Mol"].apply(lambda m: np.array(morgan_gen.GetFingerprint(m)))

# Expand the Morgan fingerprint into one column per bit (bit0 ... bit1023)
morgan_df = pd.DataFrame(
    data["MorganFingerprint"].to_list(),
    columns=[f"bit{i}" for i in range(1024)]
)

# Build the input features (descriptors + Morgan bits), concatenated
# column-wise
X = pd.concat(
    [data[["TPSA", "LogP", "MolecularWeight", "NumRotatableBonds"]], morgan_df],
    axis=1
)
y = data["Lipophilicity"].values

# Split into training and test data (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the data (scaler fitted on the training split only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the MLP model (two hidden layers of 64 units, with early stopping)
model = MLPRegressor(
    hidden_layer_sizes=(64, 64),
    max_iter=2000,
    random_state=42,
    early_stopping=True
)
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Evaluate prediction performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error :", mse)
print("R^2 :", r2)

# Scatter plot of predicted vs. true values (no log scale: values may be
# negative)
plt.figure(figsize=(8, 8))
plt.scatter(y_test, y_pred, c='crimson', alpha=0.7)

# End points of the y = x reference line, spanning both value ranges
p1 = max(np.max(y_pred), np.max(y_test))
p2 = min(np.min(y_pred), np.min(y_test))
plt.plot([p2, p1], [p2, p1], 'b-')

plt.xlabel('True Values', fontsize=15)
plt.ylabel('Predictions', fontsize=15)
plt.axis('equal')
plt.show()


In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, rdFingerprintGenerator
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Model
from keras.layers import Dense, Input, Concatenate, Conv1D, GlobalAveragePooling1D
from keras.utils import plot_model
import matplotlib.pyplot as plt
import tensorflow as tf

# Load the dataset
data = pd.read_csv("Lipophilicity.csv")

# SMILES -> Mol conversion (handle parse failures): report how many rows
# have a missing Mol, then drop those rows
data["Mol"] = data["SMILES"].apply(Chem.MolFromSmiles)
print("파싱 실패 SMILES 개수 :", data["Mol"].isna().sum())
data = data.dropna(subset=["Mol"]).reset_index(drop=True)

# Extract the same descriptors as before (computed from Mol)
data["TPSA"] = data["Mol"].apply(Descriptors.TPSA)
data["LogP"] = data["Mol"].apply(Descriptors.MolLogP)
data["MolecularWeight"] = data["Mol"].apply(Descriptors.MolWt)
data["NumRotatableBonds"] = data["Mol"].apply(Descriptors.NumRotatableBonds)

# Morgan fingerprint generator (recommended approach)
morgan_gen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=1024)

# Extract Morgan fingerprints (computed from Mol) as float32 arrays
data["MorganFingerprint"] = data["Mol"].apply(lambda m: np.array(morgan_gen.GetFingerprint(m), dtype=np.float32))

# Separate the two input groups (global descriptors, fingerprint) and the
# target variable
X_global = data[["TPSA", "LogP", "MolecularWeight", "NumRotatableBonds"]].values.astype(np.float32)
X_fingerprint = np.stack(data["MorganFingerprint"].values).astype(np.float32)
y = data["Lipophilicity"].values.astype(np.float32)

# Split into training and test data; all three arrays are split together so
# their rows stay aligned
X_global_train, X_global_test, X_fingerprint_train, X_fingerprint_test, y_train, y_test = train_test_split(
    X_global, X_fingerprint, y, test_size=0.2, random_state=42
)

# Standardize the data (each scaler fitted on the training split only)
scaler_global = StandardScaler()
X_global_train = scaler_global.fit_transform(X_global_train).astype(np.float32)
X_global_test = scaler_global.transform(X_global_test).astype(np.float32)

scaler_fingerprint = StandardScaler()
X_fingerprint_train = scaler_fingerprint.fit_transform(X_fingerprint_train).astype(np.float32)
X_fingerprint_test = scaler_fingerprint.transform(X_fingerprint_test).astype(np.float32)

# Add a trailing axis to match the Conv1D input shape
X_fingerprint_train = np.expand_dims(X_fingerprint_train, axis=2)
X_fingerprint_test = np.expand_dims(X_fingerprint_test, axis=2)

# Build the model: two inputs (global descriptors and fingerprint)
input_global = Input(shape=(X_global_train.shape[1],), name="global_input")
input_fingerprint = Input(shape=(X_fingerprint_train.shape[1], 1), name="fingerprint_input")

# Fingerprint branch: 1D convolution followed by global average pooling
conv1 = Conv1D(16, 3, activation="relu")(input_fingerprint)
gap1 = GlobalAveragePooling1D()(conv1)

# Merge the raw global descriptors with the pooled fingerprint features
concat = Concatenate()([input_global, gap1])

# Regression head: one hidden layer and a single-unit output
dense1 = Dense(64, activation="relu")(concat)
output = Dense(1)(dense1)

model = Model(inputs=[input_global, input_fingerprint], outputs=output)

# Compile the model
model.compile(optimizer="adam", loss="mean_squared_error")

# Visualize the model architecture (written to model_plot.png)
plot_model(model, to_file="model_plot.png", show_shapes=True, show_layer_names=True)

# Train the model
# NOTE(review): the test split is passed as validation_data, so val_loss and
# the final test metrics below are computed on the same samples.
history = model.fit(
    [X_global_train, X_fingerprint_train],
    y_train,
    validation_data=([X_global_test, X_fingerprint_test], y_test),
    epochs=50,
    batch_size=32,
    verbose=1
)

# Visualize the training process (training and validation loss per epoch)
plt.figure(figsize=(12, 4))
plt.plot(history.history["loss"], label="loss")
plt.plot(history.history["val_loss"], label="val_loss")
plt.xlabel("Epoch")
plt.legend()
plt.show()

# Evaluate the model on the test split
loss = model.evaluate([X_global_test, X_fingerprint_test], y_test, verbose=0)
print("Test Loss:", loss)

# Compute predictions (flattened to a 1-D array)
y_pred = model.predict([X_global_test, X_fingerprint_test]).reshape(-1)

from sklearn.metrics import r2_score
r2 = r2_score(y_test, y_pred)
print("R^2 Score:", r2)

# Compare predicted and true values visually (log scale removed)
plt.figure(figsize=(5, 5))
plt.scatter(y_test, y_pred, c="crimson", alpha=0.7)

# End points of the y = x reference line, spanning both value ranges
p1 = max(np.max(y_pred), np.max(y_test))
p2 = min(np.min(y_pred), np.min(y_test))
plt.plot([p2, p1], [p2, p1], "b-")

plt.xlabel("True Values", fontsize=15)
plt.ylabel("Predictions", fontsize=15)
plt.axis("equal")
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.layers import LSTM, Dropout, Dense, Input, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf

# Load the dataset
data = pd.read_csv("Lipophilicity.csv")

# Extract the SMILES strings
smiles_list = data["SMILES"].astype(str).values.tolist()

# Tokenize the SMILES strings character by character (case preserved, no
# characters filtered out) and encode them as integers
tokenizer = Tokenizer(char_level=True, lower=False, filters="")
tokenizer.fit_on_texts(smiles_list)
sequences = tokenizer.texts_to_sequences(smiles_list)

# Pad the sequences at the end up to the length of the longest one
max_length = max(len(seq) for seq in sequences)
X_smiles = pad_sequences(sequences, maxlen=max_length, padding="post")

# Use Lipophilicity as the target variable
y = data["Lipophilicity"].values.astype(np.float32)

# Split into training and test sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X_smiles, y, test_size=0.2, random_state=42
)

# Build the RNN model (uses an Embedding layer instead of one-hot encoding)
# +1 because index 0 is used for padding (see mask_zero=True below)
vocab_size = len(tokenizer.word_index) + 1

input_layer = Input(shape=(X_train.shape[1],))
embed = Embedding(input_dim=vocab_size, output_dim=32, mask_zero=True)(input_layer)

# Three stacked LSTM layers; only the last one returns a single vector
# instead of the full sequence
lstm_layer1 = LSTM(64, return_sequences=True, dropout=0.3, recurrent_dropout=0.0)(embed)
lstm_layer2 = LSTM(64, return_sequences=True, dropout=0.3, recurrent_dropout=0.0)(lstm_layer1)
lstm_layer3 = LSTM(64, return_sequences=False, dropout=0.3, recurrent_dropout=0.0)(lstm_layer2)

# Single linear output unit for regression
output_layer = Dense(1, activation="linear")(lstm_layer3)

# Compile the model
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer=Adam(), loss="mean_squared_error")

# Train the model (10% of the training data is held out for validation)
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1, verbose=1)

# Predict on the test set (flattened to a 1-D array)
y_pred = model.predict(X_test).reshape(-1)

# Evaluate the predictions (Mean Squared Error and R2 score).
# The original statement was cut off mid-identifier ("mean_squared_e") and
# raised a NameError; it is completed here with both metrics.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error :", mse)
print("R^2 :", r2)


In [None]:
!pip install torch_geometric

In [None]:
import torch
from torch import nn
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
from rdkit import Chem
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv("Lipophilicity.csv")

# SMILES -> Mol conversion (handle parse failures): report how many rows
# have a missing Mol, then drop those rows
df["Mol"] = df["SMILES"].apply(Chem.MolFromSmiles)
print("파싱 실패 SMILES 개수 :", df["Mol"].isna().sum())
df = df.dropna(subset=["Mol"]).reset_index(drop=True)

# RDKit Mol -> PyG Data conversion
def mol_to_data(mol, y_value):
    """Convert an RDKit molecule and its target value into a PyG ``Data``.

    Args:
        mol: RDKit molecule whose atoms become nodes and bonds become edges.
        y_value: Regression target stored on the graph.

    Returns:
        ``Data`` with ``x`` (one float atomic-number feature per atom),
        ``edge_index`` (both directions of every bond) and ``y`` (one float).
    """
    atomic_numbers = [[atom.GetAtomicNum()] for atom in mol.GetAtoms()]
    x = torch.tensor(atomic_numbers, dtype=torch.float)

    # Each bond contributes two directed edges: begin->end and end->begin.
    directed_pairs = []
    for bond in mol.GetBonds():
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        directed_pairs.extend(([begin, end], [end, begin]))

    if directed_pairs:
        # (num_edges, 2) -> (2, num_edges)
        edge_index = torch.tensor(directed_pairs, dtype=torch.long).t().contiguous()
    else:
        # A molecule without bonds still needs a well-formed (2, 0) index.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    y = torch.tensor([y_value], dtype=torch.float)
    return Data(x=x, edge_index=edge_index, y=y)

# Build the graph dataset: one Data object per (molecule, target) pair
dataset = [
    mol_to_data(mol, y_val)
    for mol, y_val in zip(df["Mol"].values, df["Lipophilicity"].values)
]

# Split into training and test sets (80/20, fixed seed)
train_dataset, test_dataset = train_test_split(dataset, test_size=0.2, random_state=42)

# Mini-batch loaders; only the training loader shuffles
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Graph neural network (GCN) model definition
class GNN(nn.Module):
    """Three GCN layers, mean pooling over nodes, and a linear output layer."""

    def __init__(self):
        super().__init__()
        # The input has 1 feature per node; all hidden layers are 64 wide.
        self.conv1 = GCNConv(1, 64)
        self.conv2 = GCNConv(64, 64)
        self.conv3 = GCNConv(64, 64)
        self.fc = nn.Linear(64, 1)

    def forward(self, data):
        """Return one prediction per graph in the batch as a 1-D tensor."""
        node_repr = data.x
        connectivity = data.edge_index
        graph_ids = data.batch

        # Every convolution is followed by a ReLU.
        for conv in (self.conv1, self.conv2, self.conv3):
            node_repr = torch.relu(conv(node_repr, connectivity))

        # Average node representations per graph, then regress.
        graph_repr = global_mean_pool(node_repr, graph_ids)
        return self.fc(graph_repr).view(-1)

# Device selection: use the GPU when available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model, loss function and optimizer
model = GNN().to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

print(model)

# Training
for epoch in range(100):
    model.train()
    train_losses = []

    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()

        output = model(batch)              # (batch_size,)
        target = batch.y.view(-1)          # (batch_size,)

        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())

    # Every 10th epoch, report the mean per-batch loss on both splits
    if epoch % 10 == 0:
        model.eval()
        test_losses = []

        with torch.no_grad():
            for batch in test_loader:
                batch = batch.to(device)
                output = model(batch)
                target = batch.y.view(-1)
                loss = criterion(output, target)
                test_losses.append(loss.item())

        print(f"Epoch {epoch}:")
        print("Train Loss:", np.mean(train_losses))
        print("Test Loss:", np.mean(test_losses))

# Evaluation: collect predictions and targets over the whole test loader
model.eval()
y_pred = []
y_true = []

with torch.no_grad():
    for batch in test_loader:
        batch = batch.to(device)
        # Move results back to the CPU before converting to NumPy
        output = model(batch).cpu().numpy().flatten()
        target = batch.y.view(-1).cpu().numpy().flatten()
        y_pred.append(output)
        y_true.append(target)

# Join the per-batch arrays into single 1-D arrays
y_pred = np.concatenate(y_pred)
y_true = np.concatenate(y_true)

print("MSE: ", mean_squared_error(y_true, y_pred))
print("R2 score: ", r2_score(y_true, y_pred))

# Visualize the results (log scale removed)
plt.figure(figsize=(8, 8))
plt.scatter(y_true, y_pred, c="crimson", alpha=0.7)

# End points of the y = x reference line, spanning both value ranges
p1 = max(np.max(y_pred), np.max(y_true))
p2 = min(np.min(y_pred), np.min(y_true))
plt.plot([p2, p1], [p2, p1], "b-")

plt.xlabel("True Values", fontsize=15)
plt.ylabel("Predictions", fontsize=15)
plt.axis("equal")
plt.show()


In [None]:
import torch
from torch import nn
from torch.utils.data import Dataset
from torch.nn import Linear
from torch_geometric.utils import dense_to_sparse
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader  # 변경: deprecated 해결
from torch_geometric.nn import global_mean_pool, GATConv
from rdkit import Chem
from rdkit.Chem import Descriptors
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Load the dataset
data = pd.read_csv('Lipophilicity.csv')

# SMILES -> Mol conversion (handle parse failures): report how many rows
# have a missing Mol, then drop those rows
data["Mol"] = data["SMILES"].apply(Chem.MolFromSmiles)
print("파싱 실패 SMILES 개수 :", data["Mol"].isna().sum())
data = data.dropna(subset=["Mol"]).reset_index(drop=True)

# Largest atom count over all molecules; used as the padding size for the
# node features and adjacency matrices
max_atoms = max([mol.GetNumAtoms() for mol in data["Mol"]])

def smiles_to_graph(smiles):
    """Turn a SMILES string into a padded adjacency matrix and node features.

    Both outputs are padded with zeros up to the module-level ``max_atoms``.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        Tuple ``(adjacency, features)``: a ``(max_atoms, max_atoms)`` NumPy
        array and a list of ``max_atoms`` atomic numbers (0 for padding).
    """
    # NOTE(review): padding entries are returned as ordinary nodes; confirm
    # that downstream pooling is meant to include them.
    molecule = Chem.MolFromSmiles(smiles)
    adjacency = Chem.rdmolops.GetAdjacencyMatrix(molecule)
    atomic_numbers = [atom.GetAtomicNum() for atom in molecule.GetAtoms()]
    n_atoms = len(atomic_numbers)

    # Zero-pad the feature list and embed the adjacency in the top-left corner.
    padded_features = atomic_numbers + [0] * (max_atoms - n_atoms)
    padded_adjacency = np.zeros((max_atoms, max_atoms))
    padded_adjacency[:n_atoms, :n_atoms] = adjacency

    return padded_adjacency, padded_features

# Convert every SMILES string to (padded adjacency, padded features)
graphs = [smiles_to_graph(smiles) for smiles in data['SMILES']]
# Node features as float column vectors, one row per (padded) atom
X = [torch.tensor(features, dtype=torch.float).view(-1, 1) for _, features in graphs]
# Adjacency matrices as integer tensors
A = [torch.tensor(adj, dtype=torch.long) for adj, _ in graphs]
# Targets as a float column vector, one row per molecule
y = torch.tensor(data['Lipophilicity'].values, dtype=torch.float).view(-1, 1)

# Add global (molecule-level) features, computed from Mol
data['TPSA'] = data['Mol'].apply(Descriptors.TPSA)
data['LogP'] = data['Mol'].apply(Descriptors.MolLogP)
data['MolecularWeight'] = data['Mol'].apply(Descriptors.MolWt)
data['NumRotatableBonds'] = data['Mol'].apply(Descriptors.NumRotatableBonds)

# Convert the global feature columns to a float tensor
global_features = torch.tensor(
    data[['TPSA', 'LogP', 'MolecularWeight','NumRotatableBonds']].values,
    dtype=torch.float
)

class GraphDataset(Dataset):
    """Dataset that builds one PyG ``Data`` object per molecule on access.

    Stores parallel collections of node features (``X``), dense adjacency
    matrices (``A``), targets (``y``) and per-molecule ``global_features``.
    """

    def __init__(self, X, A, y, global_features):
        self.X, self.A = X, A
        self.y = y
        self.global_features = global_features

    def __len__(self):
        # The number of samples is defined by the node-feature collection.
        return len(self.X)

    def __getitem__(self, idx):
        # dense_to_sparse returns a pair; only the edge index is needed.
        sparse = dense_to_sparse(self.A[idx])
        connectivity = sparse[0]
        sample = Data(
            x=self.X[idx],
            edge_index=connectivity,
            y=self.y[idx],
            global_features=self.global_features[idx]  # kept as-is
        )
        return sample

# Split into training and test sets; all four collections are split together
# so their entries stay aligned (80/20, fixed seed)
X_train, X_test, A_train, A_test, y_train, y_test, global_train, global_test = train_test_split(
    X, A, y, global_features, test_size=0.2, random_state=42
)
train_data = GraphDataset(X_train, A_train, y_train, global_train)
test_data = GraphDataset(X_test, A_test, y_test, global_test)

# Mini-batch loaders; only the training loader shuffles
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

# Graph neural network (GAT) model definition
class GNN(torch.nn.Module):
    """Three GAT layers plus a dense branch for 4 global features.

    The pooled graph representation (64) and the transformed global features
    (64) are concatenated and passed to a single linear output layer.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = GATConv(1, 64)
        self.conv2 = GATConv(64, 64)
        self.conv3 = GATConv(64, 64)
        self.global_feature_layer = Linear(4, 64)
        self.fc = Linear(128, 1)

    def forward(self, data):
        """Return one prediction per graph in the batch as a 1-D tensor."""
        node_repr = data.x
        connectivity = data.edge_index
        descriptors = data.global_features

        # Every attention convolution is followed by a ReLU.
        for conv in (self.conv1, self.conv2, self.conv3):
            node_repr = torch.relu(conv(node_repr, connectivity))
        graph_repr = global_mean_pool(node_repr, data.batch)

        # Restore the (num_graphs, 4) shape after batching.
        descriptors = descriptors.view(-1, 4)
        descriptor_repr = torch.relu(self.global_feature_layer(descriptors))

        merged = torch.cat([graph_repr, descriptor_repr], dim=1)
        return self.fc(merged).view(-1)

# Set up the model, loss function and optimizer
model = GNN()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
print(model)

# Training
for epoch in range(30):
    model.train()
    train_losses = []
    for batch in train_loader:
        optimizer.zero_grad()
        output = model(batch)
        loss = criterion(output, batch.y.view(-1))
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

    # Every 10th epoch, report the mean per-batch loss on both splits
    if epoch % 10 == 0:
        model.eval()
        test_losses = []
        with torch.no_grad():
            for batch in test_loader:
                output = model(batch)
                loss = criterion(output, batch.y.view(-1))
                test_losses.append(loss.item())

        print(f'Epoch {epoch}:')
        print('Train Loss:', np.mean(train_losses))
        print('Test Loss:', np.mean(test_losses))
        # Switch back to training mode (also set at the top of each epoch)
        model.train()

# Evaluation: collect predictions and targets over the whole test loader
model.eval()
y_pred = []
y_true = []
with torch.no_grad():
    for batch in test_loader:
        output = model(batch)
        y_pred.append(output.detach().numpy().flatten())
        y_true.append(batch.y.view(-1).numpy().flatten())

# Join the per-batch arrays into single 1-D arrays
y_pred = np.concatenate(y_pred)
y_true = np.concatenate(y_true)

print('MSE: ', mean_squared_error(y_true, y_pred))
print('R2 score: ', r2_score(y_true, y_pred))

# Visualize the results (log scale removed)
plt.figure(figsize=(8, 8))
plt.scatter(y_true, y_pred, c='crimson', alpha=0.7)

# End points of the y = x reference line, spanning both value ranges
p1 = max(np.max(y_pred), np.max(y_true))
p2 = min(np.min(y_pred), np.min(y_true))
plt.plot([p2, p1], [p2, p1], 'b-')

plt.xlabel('True Values', fontsize=15)
plt.ylabel('Predictions', fontsize=15)
plt.axis('equal')
plt.show()
