In [1]:
import numpy as np
from sklearn.metrics import mean_absolute_error
import pandas as pd
import os

import xgboost as xgb

FOLDER_DIR = "./preprocessed/"

In [6]:
train_df = pd.read_csv(os.path.join(FOLDER_DIR,'train_df.csv'))
test_df = pd.read_csv(os.path.join(FOLDER_DIR,'test_df.csv'))


train_angles_df = pd.read_csv(os.path.join(FOLDER_DIR,'train_angles_df.csv')).drop(["shortest_path_atoms"], axis=1)
test_angles_df = pd.read_csv(os.path.join(FOLDER_DIR,'test_angles_df.csv')).drop(["shortest_path_atoms"], axis=1)


train_bonds = pd.read_csv(os.path.join(FOLDER_DIR,'train_bonds.csv'))
test_bonds = pd.read_csv(os.path.join(FOLDER_DIR,'test_bonds.csv'))
train_bonds[['nbond_1', 'nbond_1.5', 'nbond_2', 'nbond_3']] = pd.get_dummies(train_bonds['nbond'])
test_bonds[['nbond_1', 'nbond_1.5', 'nbond_2', 'nbond_3']] = pd.get_dummies(test_bonds['nbond'])
train_bonds = train_bonds.drop(["nbond","L2dist","error", "bond_type"], axis=1)
test_bonds = test_bonds.drop(["nbond","L2dist","error", "bond_type"], axis=1)

train_bonds["atom_index_0"], train_bonds["atom_index_1"] = train_bonds["atom_index_1"], train_bonds["atom_index_0"]
test_bonds["atom_index_0"], test_bonds["atom_index_1"] = test_bonds["atom_index_1"], test_bonds["atom_index_0"]


train_structures_df = pd.read_csv(os.path.join(FOLDER_DIR,'train_structures_df.csv')).drop(["x", "y", "z"], axis=1)
test_structures_df = pd.read_csv(os.path.join(FOLDER_DIR,'test_structures_df.csv')).drop(["x", "y", "z"], axis=1)

In [7]:
len(train_df), len(train_angles_df), len(train_bonds), len(train_structures_df)

(3724011, 3724007, 1268468, 1226165)

In [38]:
merged_train_df = pd.merge(train_df, train_angles_df, on=['molecule_name', 'atom_index_0', 'atom_index_1'], how='left')
merged_train_df = pd.merge(merged_train_df, train_bonds, on=['molecule_name', 'atom_index_0', 'atom_index_1'], how='left')
merged_train_df = pd.merge(merged_train_df, train_structures_df, left_on=['molecule_name', 'atom_index_0'], right_on=['molecule_name', 'atom_index'])
merged_train_df = pd.merge(merged_train_df, train_structures_df, left_on=['molecule_name', 'atom_index_1'], right_on=['molecule_name', 'atom_index'], suffixes=["_atom_0", "_atom_1"])
Y_train = merged_train_df["scalar_coupling_constant"]
merged_train_df = merged_train_df.drop(['id', 'molecule_name', 'atom_index_0', 'atom_index_1', "x0", "x1", "y0", "y1", "z0", "z1", 'scalar_coupling_constant', 'dist_x', 'dist_y', 'dist_z', 'atom_index_atom_0', 'atom_index_atom_1'], axis=1)
X_train = merged_train_df.fillna(0.).replace({False: 0., True: 1.})

merged_test_df = pd.merge(test_df, test_angles_df, on=['molecule_name', 'atom_index_0', 'atom_index_1'], how='left')
merged_test_df = pd.merge(merged_test_df, test_bonds, on=['molecule_name', 'atom_index_0', 'atom_index_1'], how='left')
merged_test_df = pd.merge(merged_test_df, test_structures_df, left_on=['molecule_name', 'atom_index_0'], right_on=['molecule_name', 'atom_index'])
merged_test_df = pd.merge(merged_test_df, test_structures_df, left_on=['molecule_name', 'atom_index_1'], right_on=['molecule_name', 'atom_index'], suffixes=["_atom_0", "_atom_1"])
Y_test = merged_test_df["scalar_coupling_constant"]
merged_test_df = merged_test_df.drop(['id', 'molecule_name', 'atom_index_0', 'atom_index_1', "x0", "x1", "y0", "y1", "z0", "z1", 'scalar_coupling_constant', 'dist_x', 'dist_y', 'dist_z', 'atom_index_atom_0', 'atom_index_atom_1'], axis=1)
X_test = merged_test_df.fillna(0.).replace({False: 0., True: 1.})

In [49]:
X_train

Unnamed: 0,dist,1JHC,1JHN,2JHC,2JHH,2JHN,3JHC,3JHH,3JHN,shortest_path_n_bonds,...,C_atom_0,F_atom_0,H_atom_0,N_atom_0,O_atom_0,C_atom_1,F_atom_1,H_atom_1,N_atom_1,O_atom_1
0,1.091953,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1.091952,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1.091946,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.091948,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,1.783120,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3724006,2.129037,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3724007,2.743605,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3724008,2.742675,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3724009,2.743607,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


### XGBoost regressor

In [39]:
def log_mae(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    log_mae = np.log(mae)
    return log_mae

In [72]:
xg_reg = xgb.XGBRegressor(
    objective='reg:absoluteerror',
    n_estimators=200,
    max_depth=10,
    colsample_bytree=0.7,
    device="cuda",
)

xg_reg.fit(X_train, Y_train)

y_pred = xg_reg.predict(X_test)

score = log_mae(Y_test, y_pred)
print(f"log MAE: {score}")

log MAE: 0.675433760384279


We can try to find better hyperparameters with a grid search.

In [59]:
from sklearn.model_selection import GridSearchCV

# Create a dictionary of hyperparameters to search
param_grid = {
    'colsample_bytree': [0.3, 0.7],
    'n_estimators': [50, 100, 200],
    'max_depth': [2, 5, 10,],
}

xg_reg = xgb.XGBRegressor(
    objective='reg:absoluteerror',
    device="cuda",
)

# Setup GridSearchCV
grid_cv = GridSearchCV(xg_reg, param_grid, verbose=1, cv=3, n_jobs=-1)

grid_cv.fit(X_train, Y_train)

# Best parameters and best score
print(f"Best parameters: {grid_cv.best_params_}")

# Use the best estimator to make predictions
y_pred = grid_cv.best_estimator_.predict(X_test)

score = log_mae(Y_test, y_pred)
print(f"log MAE: {score}")

Fitting 3 folds for each of 18 candidates, totalling 54 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device or

Best parameters: {'colsample_bytree': 0.7, 'max_depth': 10, 'n_estimators': 200}
log MAE: 0.675433760384279
