In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the training table from the local parquet file into a DataFrame.
merged_df = pd.read_parquet('training_small_fingers_set.parquet')

In [None]:
# Keep the per-column dtypes of the loaded frame for inspection.
column_types = merged_df.dtypes


In [None]:
# Display the dtype recorded for every column.
column_types


molecule_smiles       object
binds                  int64
test_u                  bool
test1                   bool
test2                   bool
molecular_size         int64
ecfp                  object
maccs                 object
RDKFingerprint        object
Torsion               object
Avalon                object
protein_name_BRD4    float64
protein_name_HSA     float64
protein_name_sEH     float64
ecpf_unpacked         object
dtype: object

In [6]:
import swifter

# Step 3: Unpack the pickled fingerprint data (applied to the 'ecfp' column below, despite the "avalon" naming)
def unpack_avalon(encoded_avalon):
    """Deserialize one pickled payload and return the decoded object.

    Args:
        encoded_avalon: Bytes produced by ``pickle.dumps``.

    Returns:
        Whatever Python object the payload encodes.

    NOTE(review): ``pickle.loads`` can execute arbitrary code, so only feed
    this function bytes that come from a trusted source.
    """
    decoded = pickle.loads(encoded_avalon)
    return decoded

# Run unpack_avalon over every entry of the 'ecfp' column (via swifter's apply)
# and store the decoded objects in the new 'ecpf_unpacked' column.
merged_df['ecpf_unpacked'] = merged_df['ecfp'].swifter.apply(unpack_avalon)

Pandas Apply: 100%|██████████| 2020481/2020481 [01:45<00:00, 19091.81it/s]


In [11]:
# Spread each decoded 'ecpf_unpacked' entry across one column per position.
avalon_features = pd.DataFrame(merged_df['ecpf_unpacked'].to_list())
# Rename the positional columns to RDK_feature_0 .. RDK_feature_<n-1>.
n_feature_cols = avalon_features.shape[1]
avalon_features.columns = [f'RDK_feature_{col_idx}' for col_idx in tqdm(range(n_feature_cols))]

KeyboardInterrupt: 

In [3]:
import swifter

# Step 3: Unpack the pickled fingerprint data (applied to the 'ecfp' column below, despite the "avalon" naming)
def unpack_avalon(encoded_avalon):
    """Turn one pickled byte string back into the Python object it encodes.

    Args:
        encoded_avalon: Bytes produced by ``pickle.dumps``.

    Returns:
        The object reconstructed by ``pickle.loads``.

    NOTE(review): unpickling can run arbitrary code; use only on trusted data.
    """
    restored = pickle.loads(encoded_avalon)
    return restored

# Apply the unpack function to every entry of the 'ecfp' column (via swifter's
# apply) and store the decoded objects in the 'ecpf_unpacked' column.
merged_df['ecpf_unpacked'] = merged_df['ecfp'].swifter.apply(unpack_avalon)

# Expand every decoded 'ecpf_unpacked' entry into one column per position.
# The feature frame is built on merged_df's own index: pd.concat(axis=1) aligns
# rows by index label, so a fresh 0..n-1 index would silently misalign rows (and
# introduce NaNs) whenever merged_df is not itself indexed 0..n-1.
avalon_features = pd.DataFrame(merged_df['ecpf_unpacked'].tolist(), index=merged_df.index)
# NOTE(review): the columns are prefixed 'RDK_feature_' although they are derived
# from the 'ecfp' column; the prefix is kept unchanged so existing column names stay stable.
avalon_features.columns = ['RDK_feature_' + str(i) for i in tqdm(range(avalon_features.shape[1]))]
# Store the expanded features as int8 and drop the packed 'ecfp' source column.
merged_df = pd.concat([merged_df, avalon_features.astype('int8')], axis=1).drop('ecfp', axis=1)

# Row masks built from the three boolean split-flag columns.
outside_all_test_sets = (
    merged_df['test_u'].eq(False)
    & merged_df['test1'].eq(False)
    & merged_df['test2'].eq(False)
)
in_test_u = merged_df['test_u'].eq(True)
in_test1 = merged_df['test1'].eq(True)

# train: rows flagged for none of the test sets; test / test1: rows flagged
# in 'test_u' / 'test1' respectively.
train = merged_df.loc[outside_all_test_sets]
test = merged_df.loc[in_test_u]
test1 = merged_df.loc[in_test1]


Pandas Apply: 100%|██████████| 2020481/2020481 [01:58<00:00, 17045.28it/s]
100%|██████████| 2048/2048 [00:00<00:00, 2758488.95it/s]


In [4]:
# Step 4: Build the feature matrices and label vectors for XGBoost.
# 'binds' is taken as the label. The columns listed here are removed from the
# feature matrices: the label itself, the SMILES string, the split flags and
# the remaining fingerprint columns.
non_feature_columns = ['binds','molecule_smiles','test_u','test1','test2','ecpf_unpacked','maccs','Avalon','RDKFingerprint', 'Torsion']

# One (features, labels) pair per split.
X_train, y_train = train.drop(columns=non_feature_columns), train['binds']
X_test, y_test = test.drop(columns=non_feature_columns), test['binds']
X_test1, y_test1 = test1.drop(columns=non_feature_columns), test1['binds']

In [5]:
# Report the in-memory footprint of the X_train DataFrame.
bytes_per_unit = 1024 ** 3  # divisor used to scale the byte count for printing
total_memory_bytes = X_train.memory_usage(deep=True).sum()  # summed over all columns (and the index)
total_memory_gb = total_memory_bytes / bytes_per_unit

print(f"The size of X_train in memory is approximately {total_memory_gb:.3f} GB")

The size of X_train in memory is approximately 3.231 GB


In [6]:
# Display the training feature matrix (the rendered output is a truncated view).
X_train

Unnamed: 0,molecular_size,protein_name_BRD4,protein_name_HSA,protein_name_sEH,RDK_feature_0,RDK_feature_1,RDK_feature_2,RDK_feature_3,RDK_feature_4,RDK_feature_5,...,RDK_feature_2038,RDK_feature_2039,RDK_feature_2040,RDK_feature_2041,RDK_feature_2042,RDK_feature_2043,RDK_feature_2044,RDK_feature_2045,RDK_feature_2046,RDK_feature_2047
2,72,0.0,0.0,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,63,1.0,0.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,71,0.0,0.0,1.0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,69,0.0,0.0,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,78,1.0,0.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020475,61,0.0,1.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020477,58,0.0,1.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020478,65,0.0,1.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020479,73,0.0,0.0,1.0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.metrics import average_precision_score
import multiprocessing
def map_micro(preds, dtrain):
    """Custom evaluation metric: average precision of ``preds`` against the labels.

    Args:
        preds: Predictions for the rows held by ``dtrain``.
        dtrain: Evaluated dataset; its labels are read with ``get_label()``.

    Returns:
        The pair ``('map_micro', score)``, where ``score`` is the value returned
        by ``average_precision_score(labels, preds, average='micro')``.
    """
    metric_name = 'map_micro'
    y_true = dtrain.get_label()
    return metric_name, average_precision_score(y_true, preds, average='micro')

# Booster configuration passed to XGBoost.
worker_threads = multiprocessing.cpu_count() // 2  # half of the reported CPU count
params = dict(
    objective='binary:logistic',
    eval_metric='auc',
    # device='cuda',  # disabled: no GPU device is requested by this configuration
    tree_method='hist',  # histogram-based tree construction
    learning_rate=0.1,
    max_depth=6,
    n_jobs=worker_threads,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
)

# Wrap each split (features + labels) in an XGBoost DMatrix.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
dtest1 = xgb.DMatrix(X_test1, label=y_test1)

# The round budget is deliberately huge ("effectively infinite"); early
# stopping is what is expected to end training.
num_boost_round = 1000000
early_stopping_rounds = 500  # stop after 500 rounds without improvement
evals_result = {}  # passed to xgb.train; read afterwards for the per-round metric history

# Datasets scored after every round, in this order.
# NOTE(review): xgb.train is documented to base early stopping on the LAST
# entry of `evals`, so ('test1') should stay last — confirm for the installed version.
watchlist = [(dtrain, 'train'), (dtest, 'test'), (dtest1, 'test1')]

# Train model; maximize=True because a higher MAP score is better.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=watchlist,
    custom_metric=map_micro,
    early_stopping_rounds=early_stopping_rounds,
    evals_result=evals_result,
    maximize=True,
)

# Report the best round and its score as recorded on the trained model.
# NOTE(review): the value printed as "AUCPR" is model.best_score, i.e. whichever
# metric early stopping tracked — confirm the label matches that metric.
best_iteration, best_score = model.best_iteration, model.best_score
print(f"Best iteration: {best_iteration}, Best AUCPR: {best_score}")

# Persist the trained model to disk.
# NOTE(review): confirm the installed XGBoost version still supports saving to a
# '.bin' path; JSON/UBJSON are the documented model formats.
model.save_model('xgboost_model.bin')


# Learning curves: per-round 'map_micro' history of each evaluation set.
train_aucpr = evals_result['train']['map_micro']
test_aucpr = evals_result['test']['map_micro']
test_aucpr1 = evals_result['test1']['map_micro']

fig, ax = plt.subplots(figsize=(10, 5))
# Draw the three curves in the same order as the evaluation sets.
for curve, curve_label in (
    (train_aucpr, 'Train map_micro'),
    (test_aucpr, 'Test map_micro'),
    (test_aucpr1, 'Test1 map_micro'),
):
    ax.plot(curve, label=curve_label)
ax.set_title('XGBoost map_micro Learning Curve')
ax.set_xlabel('Number of Boosting Rounds')
ax.set_ylabel('map_micro')
ax.legend()
plt.show()

[0]	train-auc:0.73540	train-map_micro:0.26521	test-auc:0.64010	test-map_micro:0.03436	test1-auc:0.72010	test1-map_micro:0.18916
[1]	train-auc:0.73538	train-map_micro:0.27335	test-auc:0.63991	test-map_micro:0.03445	test1-auc:0.71874	test1-map_micro:0.19688
[2]	train-auc:0.73614	train-map_micro:0.28392	test-auc:0.63986	test-map_micro:0.03440	test1-auc:0.71883	test1-map_micro:0.19805
[3]	train-auc:0.73614	train-map_micro:0.28953	test-auc:0.63920	test-map_micro:0.03572	test1-auc:0.71806	test1-map_micro:0.20059
[4]	train-auc:0.75385	train-map_micro:0.29500	test-auc:0.64999	test-map_micro:0.09372	test1-auc:0.73418	test1-map_micro:0.24153
[5]	train-auc:0.75405	train-map_micro:0.29811	test-auc:0.64988	test-map_micro:0.09513	test1-auc:0.73413	test1-map_micro:0.24222
[6]	train-auc:0.76235	train-map_micro:0.30500	test-auc:0.65186	test-map_micro:0.09147	test1-auc:0.74070	test1-map_micro:0.24962
[7]	train-auc:0.76226	train-map_micro:0.30732	test-auc:0.65177	test-map_micro:0.08985	test1-auc:0.74050	

KeyboardInterrupt: 