### 주의 : 본 코드는 책에 대한 학습 및 교육외에 배포를 금지합니다.
### Warning: This code is prohibited from distribution except for learning and educational purposes related to the book.
3장 머신러닝과 화학특성
- by Keunhong Jeong

![image.png](attachment:image.png)

In [None]:
import requests

# Download the lipophilicity dataset from its (updated) raw GitHub location.
url = 'https://raw.githubusercontent.com/doas1min/CAIP/main/data/Lipophilicity_G2.csv'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of saving the error page as a CSV.
response.raise_for_status()

# Save the downloaded bytes to a local file.
with open('Lipophilicity_test.csv', 'wb') as f:
    f.write(response.content)

In [None]:
import pandas as pd

# Read the downloaded CSV and move the 'smiles' and 'logD' columns into the index.
data = pd.read_csv('Lipophilicity_test.csv').set_index(['smiles', 'logD'])

# Rename the two index levels; these become the leading column headers on export.
data.index = data.index.set_names(['SMILES', 'Lipophilicity'])

# Write the result (index levels included) to a new CSV file.
data.to_csv('Lipophilicity.csv')


In [None]:
!pip install rdkit

In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import numpy as np

# Degree-2 polynomial regression of lipophilicity on four RDKit descriptors.

# Load the data
data = pd.read_csv("Lipophilicity.csv")

# Descriptor extraction: parse each SMILES once and reuse the molecule for
# all four descriptors instead of re-parsing it for every descriptor.
mols = [Chem.MolFromSmiles(smiles) for smiles in data['SMILES']]
data['TPSA'] = [Descriptors.TPSA(mol) for mol in mols]
data['LogP'] = [Descriptors.MolLogP(mol) for mol in mols]
data['MolecularWeight'] = [Descriptors.MolWt(mol) for mol in mols]
data['NumRotatableBonds'] = [Descriptors.NumRotatableBonds(mol) for mol in mols]

# Split into input features and target
X = data[['TPSA', 'LogP', 'MolecularWeight', 'NumRotatableBonds']]
y = data['Lipophilicity']

# Hold out 20% of the rows as the test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the polynomial feature expansion (degree 2)
poly_features = PolynomialFeatures(degree=2)

# Fit the expansion on the training data and transform it
X_train_poly = poly_features.fit_transform(X_train)

# Train the polynomial regression model
model = LinearRegression()
model.fit(X_train_poly, y_train)

# Apply the already-fitted expansion to the test data, then predict
X_test_poly = poly_features.transform(X_test)
y_pred = model.predict(X_test_poly)

# Residuals (measured minus predicted)
residuals = y_test - y_pred

# Distribution of the target
plt.figure(figsize=(8, 6))
sns.histplot(data['Lipophilicity'], kde=True)
plt.xlabel('Lipophilicity')
plt.ylabel('Frequency')
plt.title('Distribution of Lipophilicity')
plt.show()

# Regression result: predicted against measured values; the dashed red
# diagonal marks a perfect prediction.
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, color='blue')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--')
plt.xlabel('Actual Lipophilicity')
plt.ylabel('Predicted Lipophilicity')
plt.title('Actual vs Predicted Lipophilicity')
plt.show()

# Residual analysis
plt.figure(figsize=(8, 6))
sns.histplot(residuals, kde=True)
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.title('Residual Distribution')
plt.show()

# Compute the evaluation metrics on the test set
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R2 Score:", r2)


In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import numpy as np

# Ridge regression of lipophilicity on four RDKit descriptors, with
# diagnostic plots and test-set metrics.

# Load the data
data = pd.read_csv("Lipophilicity.csv")

# Descriptor extraction: parse each SMILES once and reuse the molecule for
# all four descriptors instead of re-parsing it for every descriptor.
mols = [Chem.MolFromSmiles(smiles) for smiles in data['SMILES']]
data['TPSA'] = [Descriptors.TPSA(mol) for mol in mols]
data['LogP'] = [Descriptors.MolLogP(mol) for mol in mols]
data['MolecularWeight'] = [Descriptors.MolWt(mol) for mol in mols]
data['NumRotatableBonds'] = [Descriptors.NumRotatableBonds(mol) for mol in mols]

# Save the table with the descriptor columns as Lipophilicity2.csv
data.to_csv("Lipophilicity2.csv", index=False)

# Split into input features and target
X = data[['TPSA', 'LogP', 'MolecularWeight', 'NumRotatableBonds']]
y = data['Lipophilicity']

# Hold out 20% of the rows as the test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Ridge regression model
model = Ridge(alpha=1.0)
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Residuals (measured minus predicted)
residuals = y_test - y_pred

# Distribution of the target
plt.figure(figsize=(8, 6))
sns.histplot(data['Lipophilicity'], kde=True)
plt.xlabel('Lipophilicity')
plt.ylabel('Frequency')
plt.title('Distribution of Lipophilicity')
plt.show()

# Regression result: predicted against measured values; the dashed red
# diagonal marks a perfect prediction.
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, color='blue')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--')
plt.xlabel('Actual Lipophilicity')
plt.ylabel('Predicted Lipophilicity')
plt.title('Actual vs Predicted Lipophilicity')
plt.show()

# Residual analysis
plt.figure(figsize=(8, 6))
sns.histplot(residuals, kde=True)
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.title('Residual Distribution')
plt.show()

# Feature importance, ranked by absolute Ridge coefficient.
# NOTE(review): the descriptors are not standardized in this cell, so the
# magnitudes depend on each descriptor's scale.
coefs = pd.Series(model.coef_, index=X.columns)
sorted_coefs = coefs.abs().sort_values(ascending=False)

plt.figure(figsize=(8, 6))
sns.barplot(x=sorted_coefs.index, y=sorted_coefs.values)
plt.xticks(rotation=90)
plt.xlabel('Features')
plt.ylabel('Coefficient Magnitude')
plt.title('Feature Importance')
plt.show()

# Multicollinearity check: pairwise correlation between the descriptors
corr_matrix = X.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Confidence intervals of the regression coefficients.
# The intervals come from an unregularized statsmodels OLS fit, so they are
# drawn around the OLS estimates themselves. Centring them on the Ridge
# coefficients would misplace the bars whenever a shrunken coefficient falls
# outside the OLS bounds.
X_train = sm.add_constant(X_train)
model_with_const = sm.OLS(y_train, X_train).fit()
coef_conf_int = model_with_const.conf_int()

# Drop the intercept (first row) so only the four descriptors are plotted.
ols_coefs = model_with_const.params.iloc[1:]
ols_bounds = coef_conf_int.iloc[1:, :].values
# errorbar expects the distances below and above each point, shape (2, n).
ols_yerr = np.vstack([ols_coefs.values - ols_bounds[:, 0],
                      ols_bounds[:, 1] - ols_coefs.values])

plt.figure(figsize=(8, 6))
plt.errorbar(range(len(ols_coefs)), ols_coefs.values, yerr=ols_yerr,
             fmt='o', color='blue', ecolor='gray', capsize=5)
plt.xticks(range(len(ols_coefs)), ols_coefs.index, rotation=90)
plt.xlabel('Features')
plt.ylabel('Coefficient')
plt.title('Confidence Interval of Coefficients')
plt.show()

# Compute the evaluation metrics on the test set
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R2 Score:", r2)


In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import numpy as np

# Lasso regression of lipophilicity on four RDKit descriptors, with
# diagnostic plots and test-set metrics.

# Load the data
data = pd.read_csv("Lipophilicity.csv")

# Descriptor extraction: parse each SMILES once and reuse the molecule for
# all four descriptors instead of re-parsing it for every descriptor.
mols = [Chem.MolFromSmiles(smiles) for smiles in data['SMILES']]
data['TPSA'] = [Descriptors.TPSA(mol) for mol in mols]
data['LogP'] = [Descriptors.MolLogP(mol) for mol in mols]
data['MolecularWeight'] = [Descriptors.MolWt(mol) for mol in mols]
data['NumRotatableBonds'] = [Descriptors.NumRotatableBonds(mol) for mol in mols]

# Save the table with the descriptor columns as Lipophilicity2.csv
data.to_csv("Lipophilicity2.csv", index=False)

# Split into input features and target
X = data[['TPSA', 'LogP', 'MolecularWeight', 'NumRotatableBonds']]
y = data['Lipophilicity']

# Hold out 20% of the rows as the test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Lasso regression model
model = Lasso(alpha=1.0)
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Residuals (measured minus predicted)
residuals = y_test - y_pred

# Distribution of the target
plt.figure(figsize=(8, 6))
sns.histplot(data['Lipophilicity'], kde=True)
plt.xlabel('Lipophilicity')
plt.ylabel('Frequency')
plt.title('Distribution of Lipophilicity')
plt.show()

# Regression result: predicted against measured values; the dashed red
# diagonal marks a perfect prediction.
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, color='blue')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--')
plt.xlabel('Actual Lipophilicity')
plt.ylabel('Predicted Lipophilicity')
plt.title('Actual vs Predicted Lipophilicity')
plt.show()

# Residual analysis
plt.figure(figsize=(8, 6))
sns.histplot(residuals, kde=True)
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.title('Residual Distribution')
plt.show()

# Feature importance, ranked by absolute Lasso coefficient.
# NOTE(review): the descriptors are not standardized in this cell, so the
# magnitudes depend on each descriptor's scale.
coefs = pd.Series(model.coef_, index=X.columns)
sorted_coefs = coefs.abs().sort_values(ascending=False)

plt.figure(figsize=(8, 6))
sns.barplot(x=sorted_coefs.index, y=sorted_coefs.values)
plt.xticks(rotation=90)
plt.xlabel('Features')
plt.ylabel('Coefficient Magnitude')
plt.title('Feature Importance')
plt.show()

# Multicollinearity check: pairwise correlation between the descriptors
corr_matrix = X.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Confidence intervals of the regression coefficients.
# The intervals come from an unregularized statsmodels OLS fit, so they are
# drawn around the OLS estimates themselves. Centring them on the Lasso
# coefficients would misplace the bars whenever a shrunken coefficient falls
# outside the OLS bounds.
X_train = sm.add_constant(X_train)
model_with_const = sm.OLS(y_train, X_train).fit()
coef_conf_int = model_with_const.conf_int()

# Drop the intercept (first row) so only the four descriptors are plotted.
ols_coefs = model_with_const.params.iloc[1:]
ols_bounds = coef_conf_int.iloc[1:, :].values
# errorbar expects the distances below and above each point, shape (2, n).
ols_yerr = np.vstack([ols_coefs.values - ols_bounds[:, 0],
                      ols_bounds[:, 1] - ols_coefs.values])

plt.figure(figsize=(8, 6))
plt.errorbar(range(len(ols_coefs)), ols_coefs.values, yerr=ols_yerr,
             fmt='o', color='blue', ecolor='gray', capsize=5)
plt.xticks(range(len(ols_coefs)), ols_coefs.index, rotation=90)
plt.xlabel('Features')
plt.ylabel('Coefficient')
plt.title('Confidence Interval of Coefficients')
plt.show()

# Compute the evaluation metrics on the test set
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R2 Score:", r2)


In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import numpy as np

# ElasticNet regression of lipophilicity on four RDKit descriptors, with
# diagnostic plots and test-set metrics.

# Load the data
data = pd.read_csv("Lipophilicity.csv")

# Descriptor extraction: parse each SMILES once and reuse the molecule for
# all four descriptors instead of re-parsing it for every descriptor.
mols = [Chem.MolFromSmiles(smiles) for smiles in data['SMILES']]
data['TPSA'] = [Descriptors.TPSA(mol) for mol in mols]
data['LogP'] = [Descriptors.MolLogP(mol) for mol in mols]
data['MolecularWeight'] = [Descriptors.MolWt(mol) for mol in mols]
data['NumRotatableBonds'] = [Descriptors.NumRotatableBonds(mol) for mol in mols]

# Save the table with the descriptor columns as Lipophilicity2.csv
data.to_csv("Lipophilicity2.csv", index=False)

# Split into input features and target
X = data[['TPSA', 'LogP', 'MolecularWeight', 'NumRotatableBonds']]
y = data['Lipophilicity']

# Hold out 20% of the rows as the test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the ElasticNet regression model (equal L1/L2 mix via l1_ratio=0.5)
model = ElasticNet(alpha=1.0, l1_ratio=0.5)
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Residuals (measured minus predicted)
residuals = y_test - y_pred

# Distribution of the target
plt.figure(figsize=(8, 6))
sns.histplot(data['Lipophilicity'], kde=True)
plt.xlabel('Lipophilicity')
plt.ylabel('Frequency')
plt.title('Distribution of Lipophilicity')
plt.show()

# Regression result: predicted against measured values; the dashed red
# diagonal marks a perfect prediction.
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, color='blue')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--')
plt.xlabel('Actual Lipophilicity')
plt.ylabel('Predicted Lipophilicity')
plt.title('Actual vs Predicted Lipophilicity')
plt.show()

# Residual analysis
plt.figure(figsize=(8, 6))
sns.histplot(residuals, kde=True)
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.title('Residual Distribution')
plt.show()

# Feature importance, ranked by absolute ElasticNet coefficient.
# NOTE(review): the descriptors are not standardized in this cell, so the
# magnitudes depend on each descriptor's scale.
coefs = pd.Series(model.coef_, index=X.columns)
sorted_coefs = coefs.abs().sort_values(ascending=False)

plt.figure(figsize=(8, 6))
sns.barplot(x=sorted_coefs.index, y=sorted_coefs.values)
plt.xticks(rotation=90)
plt.xlabel('Features')
plt.ylabel('Coefficient Magnitude')
plt.title('Feature Importance')
plt.show()

# Multicollinearity check: pairwise correlation between the descriptors
corr_matrix = X.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Confidence intervals of the regression coefficients.
# The intervals come from an unregularized statsmodels OLS fit, so they are
# drawn around the OLS estimates themselves. Centring them on the ElasticNet
# coefficients would misplace the bars whenever a shrunken coefficient falls
# outside the OLS bounds.
X_train = sm.add_constant(X_train)
model_with_const = sm.OLS(y_train, X_train).fit()
coef_conf_int = model_with_const.conf_int()

# Drop the intercept (first row) so only the four descriptors are plotted.
ols_coefs = model_with_const.params.iloc[1:]
ols_bounds = coef_conf_int.iloc[1:, :].values
# errorbar expects the distances below and above each point, shape (2, n).
ols_yerr = np.vstack([ols_coefs.values - ols_bounds[:, 0],
                      ols_bounds[:, 1] - ols_coefs.values])

plt.figure(figsize=(8, 6))
plt.errorbar(range(len(ols_coefs)), ols_coefs.values, yerr=ols_yerr,
             fmt='o', color='blue', ecolor='gray', capsize=5)
plt.xticks(range(len(ols_coefs)), ols_coefs.index, rotation=90)
plt.xlabel('Features')
plt.ylabel('Coefficient')
plt.title('Confidence Interval of Coefficients')
plt.show()

# Compute the evaluation metrics on the test set
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R2 Score:", r2)


In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import numpy as np

# Logistic regression on lipophilicity binned into five quantile classes.

# Load the data
data = pd.read_csv("Lipophilicity.csv")

# Descriptor extraction: parse each SMILES once and reuse the molecule for
# all four descriptors instead of re-parsing it for every descriptor.
mols = [Chem.MolFromSmiles(smiles) for smiles in data['SMILES']]
data['TPSA'] = [Descriptors.TPSA(mol) for mol in mols]
data['LogP'] = [Descriptors.MolLogP(mol) for mol in mols]
data['MolecularWeight'] = [Descriptors.MolWt(mol) for mol in mols]
data['NumRotatableBonds'] = [Descriptors.NumRotatableBonds(mol) for mol in mols]

# Save the table with the descriptor columns as Lipophilicity2.csv
data.to_csv("Lipophilicity2.csv", index=False)

# Input features, and the target binned into 5 quantile classes labelled 0-4
X = data[['TPSA', 'LogP', 'MolecularWeight', 'NumRotatableBonds']]
y = pd.qcut(data['Lipophilicity'], q=5, labels=np.arange(5))

# Hold out 20% of the rows as the test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Confusion matrix; fmt='d' prints the integer counts in full, since the
# default annotation format switches to scientific notation at 100 and above.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='coolwarm')
plt.show()

# Classification report
print(classification_report(y_test, y_pred))

# Class distribution (plots the binned target `y`, not the predictions)
sns.countplot(x=y, palette='coolwarm')
plt.show()

# Feature importance: coef_ holds one row of coefficients per class, so
# average the absolute coefficients over all classes rather than reporting
# the first class only.
coefs = pd.Series(np.abs(model.coef_).mean(axis=0), index=X.columns)
sorted_coefs = coefs.sort_values(ascending=False)

plt.figure(figsize=(8, 6))
sns.barplot(x=sorted_coefs.index, y=sorted_coefs.values, palette='coolwarm')
plt.xticks(rotation=90)
plt.xlabel('Features')
plt.ylabel('Coefficient Magnitude')
plt.title('Feature Importance')
plt.show()


In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import numpy as np

# Decision-tree regression of lipophilicity on four RDKit descriptors, with
# diagnostic plots and test-set metrics.

# Load the data
data = pd.read_csv("Lipophilicity.csv")

# Descriptor extraction: parse each SMILES once and reuse the molecule for
# all four descriptors instead of re-parsing it for every descriptor.
mols = [Chem.MolFromSmiles(smiles) for smiles in data['SMILES']]
data['TPSA'] = [Descriptors.TPSA(mol) for mol in mols]
data['LogP'] = [Descriptors.MolLogP(mol) for mol in mols]
data['MolecularWeight'] = [Descriptors.MolWt(mol) for mol in mols]
data['NumRotatableBonds'] = [Descriptors.NumRotatableBonds(mol) for mol in mols]

# Save the table with the descriptor columns as Lipophilicity2.csv
data.to_csv("Lipophilicity2.csv", index=False)

# Split into input features and target
X = data[['TPSA', 'LogP', 'MolecularWeight', 'NumRotatableBonds']]
y = data['Lipophilicity']

# Hold out 20% of the rows as the test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the decision-tree regression model; the seed makes the fitted tree
# (and therefore the reported metrics) repeatable, like the seeded split.
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Residuals (measured minus predicted)
residuals = y_test - y_pred

# Distribution of the target
plt.figure(figsize=(8, 6))
sns.histplot(data['Lipophilicity'], kde=True)
plt.xlabel('Lipophilicity')
plt.ylabel('Frequency')
plt.title('Distribution of Lipophilicity')
plt.show()

# Regression result: predicted against measured values; the dashed red
# diagonal marks a perfect prediction.
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, color='blue')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--')
plt.xlabel('Actual Lipophilicity')
plt.ylabel('Predicted Lipophilicity')
plt.title('Actual vs Predicted Lipophilicity')
plt.show()

# Residual analysis
plt.figure(figsize=(8, 6))
sns.histplot(residuals, kde=True)
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.title('Residual Distribution')
plt.show()

# Feature importance from the fitted tree (these are importances, not
# regression coefficients, hence the 'Importance' axis label).
coefs = pd.Series(model.feature_importances_, index=X.columns)
sorted_coefs = coefs.sort_values(ascending=False)

plt.figure(figsize=(8, 6))
sns.barplot(x=sorted_coefs.index, y=sorted_coefs.values)
plt.xticks(rotation=90)
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importance')
plt.show()

# Multicollinearity check: pairwise correlation between the descriptors
corr_matrix = X.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Compute the evaluation metrics on the test set
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R2 Score:", r2)


In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import numpy as np

# Random-forest regression of lipophilicity on four RDKit descriptors, with
# diagnostic plots and test-set metrics.

# Load the data
data = pd.read_csv("Lipophilicity.csv")

# Descriptor extraction: parse each SMILES once and reuse the molecule for
# all four descriptors instead of re-parsing it for every descriptor.
mols = [Chem.MolFromSmiles(smiles) for smiles in data['SMILES']]
data['TPSA'] = [Descriptors.TPSA(mol) for mol in mols]
data['LogP'] = [Descriptors.MolLogP(mol) for mol in mols]
data['MolecularWeight'] = [Descriptors.MolWt(mol) for mol in mols]
data['NumRotatableBonds'] = [Descriptors.NumRotatableBonds(mol) for mol in mols]

# Save the table with the descriptor columns as Lipophilicity2.csv
data.to_csv("Lipophilicity2.csv", index=False)

# Split into input features and target
X = data[['TPSA', 'LogP', 'MolecularWeight', 'NumRotatableBonds']]
y = data['Lipophilicity']

# Hold out 20% of the rows as the test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the random-forest regression model; the seed makes the fitted forest
# (and therefore the reported metrics) repeatable, like the seeded split.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Residuals (measured minus predicted)
residuals = y_test - y_pred

# Distribution of the target
plt.figure(figsize=(8, 6))
sns.histplot(data['Lipophilicity'], kde=True)
plt.xlabel('Lipophilicity')
plt.ylabel('Frequency')
plt.title('Distribution of Lipophilicity')
plt.show()

# Regression result: predicted against measured values; the dashed red
# diagonal marks a perfect prediction.
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, color='blue')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--')
plt.xlabel('Actual Lipophilicity')
plt.ylabel('Predicted Lipophilicity')
plt.title('Actual vs Predicted Lipophilicity')
plt.show()

# Residual analysis
plt.figure(figsize=(8, 6))
sns.histplot(residuals, kde=True)
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.title('Residual Distribution')
plt.show()

# Feature importance from the fitted forest (these are importances, not
# regression coefficients, hence the 'Importance' axis label).
coefs = pd.Series(model.feature_importances_, index=X.columns)
sorted_coefs = coefs.sort_values(ascending=False)

plt.figure(figsize=(8, 6))
sns.barplot(x=sorted_coefs.index, y=sorted_coefs.values)
plt.xticks(rotation=90)
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importance')
plt.show()

# Multicollinearity check: pairwise correlation between the descriptors
corr_matrix = X.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Compute the evaluation metrics on the test set
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R2 Score:", r2)


In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import numpy as np

# Support-vector regression of lipophilicity on four RDKit descriptors, with
# diagnostic plots and test-set metrics.

# Load the data
data = pd.read_csv("Lipophilicity.csv")

# Descriptor extraction: parse each SMILES once and reuse the molecule for
# all four descriptors instead of re-parsing it for every descriptor.
mols = [Chem.MolFromSmiles(smiles) for smiles in data['SMILES']]
data['TPSA'] = [Descriptors.TPSA(mol) for mol in mols]
data['LogP'] = [Descriptors.MolLogP(mol) for mol in mols]
data['MolecularWeight'] = [Descriptors.MolWt(mol) for mol in mols]
data['NumRotatableBonds'] = [Descriptors.NumRotatableBonds(mol) for mol in mols]

# Save the table with the descriptor columns as Lipophilicity2.csv
data.to_csv("Lipophilicity2.csv", index=False)

# Split into input features and target
X = data[['TPSA', 'LogP', 'MolecularWeight', 'NumRotatableBonds']]
y = data['Lipophilicity']

# Hold out 20% of the rows as the test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the SVM regression model with default settings.
# NOTE(review): the descriptors are passed in unscaled — confirm that is intended.
model = SVR()
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Residuals (measured minus predicted)
residuals = y_test - y_pred

# Distribution of the target
plt.figure(figsize=(8, 6))
sns.histplot(data['Lipophilicity'], kde=True)
plt.xlabel('Lipophilicity')
plt.ylabel('Frequency')
plt.title('Distribution of Lipophilicity')
plt.show()

# Regression result: predicted against measured values; the dashed red
# diagonal marks a perfect prediction.
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, color='blue')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--')
plt.xlabel('Actual Lipophilicity')
plt.ylabel('Predicted Lipophilicity')
plt.title('Actual vs Predicted Lipophilicity')
plt.show()

# Residual analysis
plt.figure(figsize=(8, 6))
sns.histplot(residuals, kde=True)
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.title('Residual Distribution')
plt.show()

# Multicollinearity check: pairwise correlation between the descriptors
corr_matrix = X.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Compute the evaluation metrics on the test set
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R2 Score:", r2)


In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import numpy as np

# K-nearest-neighbours regression of lipophilicity on four RDKit
# descriptors, with diagnostic plots and test-set metrics.

# Load the data
data = pd.read_csv("Lipophilicity.csv")

# Descriptor extraction: parse each SMILES once and reuse the molecule for
# all four descriptors instead of re-parsing it for every descriptor.
mols = [Chem.MolFromSmiles(smiles) for smiles in data['SMILES']]
data['TPSA'] = [Descriptors.TPSA(mol) for mol in mols]
data['LogP'] = [Descriptors.MolLogP(mol) for mol in mols]
data['MolecularWeight'] = [Descriptors.MolWt(mol) for mol in mols]
data['NumRotatableBonds'] = [Descriptors.NumRotatableBonds(mol) for mol in mols]

# Save the table with the descriptor columns as Lipophilicity2.csv
data.to_csv("Lipophilicity2.csv", index=False)

# Split into input features and target
X = data[['TPSA', 'LogP', 'MolecularWeight', 'NumRotatableBonds']]
y = data['Lipophilicity']

# Hold out 20% of the rows as the test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the k-nearest-neighbours regression model with default settings.
# NOTE(review): the descriptors are passed in unscaled — confirm that is intended.
model = KNeighborsRegressor()
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Residuals (measured minus predicted)
residuals = y_test - y_pred

# Distribution of the target
plt.figure(figsize=(8, 6))
sns.histplot(data['Lipophilicity'], kde=True)
plt.xlabel('Lipophilicity')
plt.ylabel('Frequency')
plt.title('Distribution of Lipophilicity')
plt.show()

# Regression result: predicted against measured values; the dashed red
# diagonal marks a perfect prediction.
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, color='blue')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--')
plt.xlabel('Actual Lipophilicity')
plt.ylabel('Predicted Lipophilicity')
plt.title('Actual vs Predicted Lipophilicity')
plt.show()

# Residual analysis
plt.figure(figsize=(8, 6))
sns.histplot(residuals, kde=True)
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.title('Residual Distribution')
plt.show()

# Multicollinearity check: pairwise correlation between the descriptors
corr_matrix = X.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Compute the evaluation metrics on the test set
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R2 Score:", r2)


In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import numpy as np

# XGBoost regression of lipophilicity on four RDKit descriptors, with
# diagnostic plots and test-set metrics.

# Load the data
data = pd.read_csv("Lipophilicity.csv")

# Descriptor extraction: parse each SMILES once and reuse the molecule for
# all four descriptors instead of re-parsing it for every descriptor.
mols = [Chem.MolFromSmiles(smiles) for smiles in data['SMILES']]
data['TPSA'] = [Descriptors.TPSA(mol) for mol in mols]
data['LogP'] = [Descriptors.MolLogP(mol) for mol in mols]
data['MolecularWeight'] = [Descriptors.MolWt(mol) for mol in mols]
data['NumRotatableBonds'] = [Descriptors.NumRotatableBonds(mol) for mol in mols]

# Save the table with the descriptor columns as Lipophilicity2.csv
data.to_csv("Lipophilicity2.csv", index=False)

# Split into input features and target
X = data[['TPSA', 'LogP', 'MolecularWeight', 'NumRotatableBonds']]
y = data['Lipophilicity']

# Hold out 20% of the rows as the test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the XGBoost regression model with default settings
model = XGBRegressor()
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Residuals (measured minus predicted)
residuals = y_test - y_pred

# Distribution of the target
plt.figure(figsize=(8, 6))
sns.histplot(data['Lipophilicity'], kde=True)
plt.xlabel('Lipophilicity')
plt.ylabel('Frequency')
plt.title('Distribution of Lipophilicity')
plt.show()

# Regression result: predicted against measured values; the dashed red
# diagonal marks a perfect prediction.
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, color='blue')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--')
plt.xlabel('Actual Lipophilicity')
plt.ylabel('Predicted Lipophilicity')
plt.title('Actual vs Predicted Lipophilicity')
plt.show()

# Residual analysis
plt.figure(figsize=(8, 6))
sns.histplot(residuals, kde=True)
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.title('Residual Distribution')
plt.show()

# Multicollinearity check: pairwise correlation between the descriptors
corr_matrix = X.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Compute the evaluation metrics on the test set
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R2 Score:", r2)


In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import numpy as np

# Support-vector classification of lipophilicity binned into five quantile
# classes.

# Load the data
data = pd.read_csv("Lipophilicity.csv")

# Descriptor extraction: parse each SMILES once and reuse the molecule for
# all four descriptors instead of re-parsing it for every descriptor.
mols = [Chem.MolFromSmiles(smiles) for smiles in data['SMILES']]
data['TPSA'] = [Descriptors.TPSA(mol) for mol in mols]
data['LogP'] = [Descriptors.MolLogP(mol) for mol in mols]
data['MolecularWeight'] = [Descriptors.MolWt(mol) for mol in mols]
data['NumRotatableBonds'] = [Descriptors.NumRotatableBonds(mol) for mol in mols]

# Save the table with the descriptor columns as Lipophilicity2.csv
data.to_csv("Lipophilicity2.csv", index=False)

# Input features, and the target binned into 5 quantile classes labelled 0-4
X = data[['TPSA', 'LogP', 'MolecularWeight', 'NumRotatableBonds']]
y = pd.qcut(data['Lipophilicity'], q=5, labels=np.arange(5))

# Hold out 20% of the rows as the test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the support-vector classifier with default settings
model = SVC()
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Confusion matrix; fmt='d' prints the integer counts in full, since the
# default annotation format switches to scientific notation at 100 and above.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='coolwarm')
plt.show()

# Classification report
print(classification_report(y_test, y_pred))

# Class distribution (plots the binned target `y`, not the predictions)
sns.countplot(x=y, palette='coolwarm')
plt.show()


In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Random-forest classification of lipophilicity binned into five quantile
# classes.

# Load the data
data = pd.read_csv("Lipophilicity.csv")

# Descriptor extraction: parse each SMILES once and reuse the molecule for
# all four descriptors instead of re-parsing it for every descriptor.
mols = [Chem.MolFromSmiles(smiles) for smiles in data['SMILES']]
data['TPSA'] = [Descriptors.TPSA(mol) for mol in mols]
data['LogP'] = [Descriptors.MolLogP(mol) for mol in mols]
data['MolecularWeight'] = [Descriptors.MolWt(mol) for mol in mols]
data['NumRotatableBonds'] = [Descriptors.NumRotatableBonds(mol) for mol in mols]

# Save the table with the descriptor columns as Lipophilicity2.csv
data.to_csv("Lipophilicity2.csv", index=False)

# Input features, and the target binned into 5 quantile classes labelled 0-4
X = data[['TPSA', 'LogP', 'MolecularWeight', 'NumRotatableBonds']]
y = pd.qcut(data['Lipophilicity'], q=5, labels=np.arange(5))

# Hold out 20% of the rows as the test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the random-forest classifier (seeded for repeatable results)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Confusion matrix; fmt='d' prints the integer counts in full, since the
# default annotation format switches to scientific notation at 100 and above.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='coolwarm')
plt.show()

# Classification report
print(classification_report(y_test, y_pred))

# Class distribution (plots the binned target `y`, not the predictions)
sns.countplot(x=y, palette='coolwarm')
plt.show()

# Feature importance, sorted from most to least important
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(8, 6))
sns.barplot(x=X.columns[indices], y=importances[indices])
plt.xticks(rotation=90)
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importance')
plt.show()


In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Gradient-boosting classification of lipophilicity binned into five
# quantile classes.

# Load the data
data = pd.read_csv("Lipophilicity.csv")

# Descriptor extraction: parse each SMILES once and reuse the molecule for
# all four descriptors instead of re-parsing it for every descriptor.
mols = [Chem.MolFromSmiles(smiles) for smiles in data['SMILES']]
data['TPSA'] = [Descriptors.TPSA(mol) for mol in mols]
data['LogP'] = [Descriptors.MolLogP(mol) for mol in mols]
data['MolecularWeight'] = [Descriptors.MolWt(mol) for mol in mols]
data['NumRotatableBonds'] = [Descriptors.NumRotatableBonds(mol) for mol in mols]

# Save the table with the descriptor columns as Lipophilicity2.csv
data.to_csv("Lipophilicity2.csv", index=False)

# Input features, and the target binned into 5 quantile classes labelled 0-4
X = data[['TPSA', 'LogP', 'MolecularWeight', 'NumRotatableBonds']]
y = pd.qcut(data['Lipophilicity'], q=5, labels=np.arange(5))

# Hold out 20% of the rows as the test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the gradient-boosting classifier (seeded for repeatable results)
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Confusion matrix; fmt='d' prints the integer counts in full, since the
# default annotation format switches to scientific notation at 100 and above.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='coolwarm')
plt.show()

# Classification report
print(classification_report(y_test, y_pred))

# Class distribution (plots the binned target `y`, not the predictions)
sns.countplot(x=y, palette='coolwarm')
plt.show()

# Feature importance, sorted from most to least important
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(8, 6))
sns.barplot(x=X.columns[indices], y=importances[indices])
plt.xticks(rotation=90)
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importance')
plt.show()


In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the dataset
data = pd.read_csv("Lipophilicity.csv")

# Extract descriptors. Each SMILES string is parsed exactly once and the
# resulting molecule is reused for all four descriptors (the parse is the
# expensive step, so re-parsing per descriptor is wasted work).
mols = [Chem.MolFromSmiles(smiles) for smiles in data['SMILES']]
data['TPSA'] = [Descriptors.TPSA(mol) for mol in mols]
data['LogP'] = [Descriptors.MolLogP(mol) for mol in mols]
data['MolecularWeight'] = [Descriptors.MolWt(mol) for mol in mols]
data['NumRotatableBonds'] = [Descriptors.NumRotatableBonds(mol) for mol in mols]

# Save the descriptor-augmented table as Lipophilicity2.csv
data.to_csv("Lipophilicity2.csv", index=False)

# Split into input features and target. The continuous Lipophilicity value is
# binned into five quantile classes encoded as integers 0-4.
X = data[['TPSA', 'LogP', 'MolecularWeight', 'NumRotatableBonds']]
y = pd.qcut(data['Lipophilicity'], q=5, labels=False)

# Split into training and test sets (80 % / 20 %, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the XGBoost classifier with its default settings.
# Alternative: pass eval_metric='mlogloss' to select the multiclass log-loss
# evaluation metric explicitly.
model = XGBClassifier()

model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Confusion matrix. fmt='d' prints the integer counts as-is; the default
# '.2g' format would show counts >= 100 in scientific notation.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='coolwarm')
plt.show()

# Classification report
print(classification_report(y_test, y_pred))

# Distribution of the class labels over the whole dataset
sns.countplot(x=y, palette='coolwarm')
plt.show()

# Feature importances, sorted from most to least important
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(8, 6))
sns.barplot(x=X.columns[indices], y=importances[indices])
plt.xticks(rotation=90)
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importance')
plt.show()


In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the dataset
data = pd.read_csv("Lipophilicity.csv")

# Extract descriptors. Each SMILES string is parsed exactly once and the
# resulting molecule is reused for all four descriptors (the parse is the
# expensive step, so re-parsing per descriptor is wasted work).
mols = [Chem.MolFromSmiles(smiles) for smiles in data['SMILES']]
data['TPSA'] = [Descriptors.TPSA(mol) for mol in mols]
data['LogP'] = [Descriptors.MolLogP(mol) for mol in mols]
data['MolecularWeight'] = [Descriptors.MolWt(mol) for mol in mols]
data['NumRotatableBonds'] = [Descriptors.NumRotatableBonds(mol) for mol in mols]

# Save the descriptor-augmented table as Lipophilicity2.csv
data.to_csv("Lipophilicity2.csv", index=False)

# Split into input features and target. The continuous Lipophilicity value is
# binned into five quantile classes encoded as integers 0-4.
X = data[['TPSA', 'LogP', 'MolecularWeight', 'NumRotatableBonds']]
y = pd.qcut(data['Lipophilicity'], q=5, labels=False)

# Split into training and test sets (80 % / 20 %, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the XGBoost classifier with an explicit multiclass log-loss metric.
# The former use_label_encoder=False argument is omitted: it is deprecated and
# ignored by current XGBoost releases, and the target is already encoded as
# consecutive integers starting at 0, so no label encoding is needed.
model = XGBClassifier(eval_metric='mlogloss')
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Confusion matrix. fmt='d' prints the integer counts as-is; the default
# '.2g' format would show counts >= 100 in scientific notation.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='coolwarm')
plt.show()

# Classification report
print(classification_report(y_test, y_pred))

# Distribution of the class labels over the whole dataset
sns.countplot(x=y, palette='coolwarm')
plt.show()

# Feature importances, sorted from most to least important
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(8, 6))
sns.barplot(x=X.columns[indices], y=importances[indices])
plt.xticks(rotation=90)
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importance')
plt.show()


In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the dataset
data = pd.read_csv("Lipophilicity.csv")

# Extract descriptors. Each SMILES string is parsed exactly once and the
# resulting molecule is reused for all four descriptors (the parse is the
# expensive step, so re-parsing per descriptor is wasted work).
mols = [Chem.MolFromSmiles(smiles) for smiles in data['SMILES']]
data['TPSA'] = [Descriptors.TPSA(mol) for mol in mols]
data['LogP'] = [Descriptors.MolLogP(mol) for mol in mols]
data['MolecularWeight'] = [Descriptors.MolWt(mol) for mol in mols]
data['NumRotatableBonds'] = [Descriptors.NumRotatableBonds(mol) for mol in mols]

# Save the descriptor-augmented table as Lipophilicity2.csv
data.to_csv("Lipophilicity2.csv", index=False)

# Split into input features and target. The continuous Lipophilicity value is
# binned into five quantile classes encoded as integers 0-4.
X = data[['TPSA', 'LogP', 'MolecularWeight', 'NumRotatableBonds']]
y = pd.qcut(data['Lipophilicity'], q=5, labels=False)

# Split into training and test sets (80 % / 20 %, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the LightGBM classifier
model = LGBMClassifier()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Confusion matrix. fmt='d' prints the integer counts as-is; the default
# '.2g' format would show counts >= 100 in scientific notation.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='coolwarm')
plt.show()

# Classification report
print(classification_report(y_test, y_pred))

# Distribution of the class labels over the whole dataset
sns.countplot(x=y, palette='coolwarm')
plt.show()

# Feature importances, sorted from most to least important
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(8, 6))
sns.barplot(x=X.columns[indices], y=importances[indices])
plt.xticks(rotation=90)
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importance')
plt.show()


In [None]:
# Install the catboost package into the notebook environment (IPython shell escape).
!pip install catboost

In [None]:
# Install catboost (IPython shell escape) before the import of CatBoostClassifier below.
!pip install catboost
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the dataset
data = pd.read_csv("Lipophilicity.csv")

# Extract descriptors. Each SMILES string is parsed exactly once and the
# resulting molecule is reused for all four descriptors (the parse is the
# expensive step, so re-parsing per descriptor is wasted work).
mols = [Chem.MolFromSmiles(smiles) for smiles in data['SMILES']]
data['TPSA'] = [Descriptors.TPSA(mol) for mol in mols]
data['LogP'] = [Descriptors.MolLogP(mol) for mol in mols]
data['MolecularWeight'] = [Descriptors.MolWt(mol) for mol in mols]
data['NumRotatableBonds'] = [Descriptors.NumRotatableBonds(mol) for mol in mols]

# Save the descriptor-augmented table as Lipophilicity2.csv
data.to_csv("Lipophilicity2.csv", index=False)

# Split into input features and target. The continuous Lipophilicity value is
# binned into five quantile classes encoded as integers 0-4.
X = data[['TPSA', 'LogP', 'MolecularWeight', 'NumRotatableBonds']]
y = pd.qcut(data['Lipophilicity'], q=5, labels=False)

# Split into training and test sets (80 % / 20 %, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the CatBoost classifier (verbose=0 silences the per-iteration log)
model = CatBoostClassifier(verbose=0)
model.fit(X_train, y_train)

# Predict on the test set. CatBoost may return multiclass labels as a 2-D
# column of shape (n_samples, 1); flatten to 1-D so the metrics below receive
# the same shape as y_test.
y_pred = np.ravel(model.predict(X_test))

# Confusion matrix. fmt='d' prints the integer counts as-is; the default
# '.2g' format would show counts >= 100 in scientific notation.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='coolwarm')
plt.show()

# Classification report
print(classification_report(y_test, y_pred))

# Distribution of the class labels over the whole dataset
sns.countplot(x=y, palette='coolwarm')
plt.show()

# Feature importances, sorted from most to least important
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(8, 6))
sns.barplot(x=X.columns[indices], y=importances[indices])
plt.xticks(rotation=90)
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importance')
plt.show()
