Trying to predict if a pair of compounds is a MAIN pair

In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv('../data/pairs_final_RPAIRS_smiles.csv', index_col=0)

# Partition on the label column: rows whose 'RPAIR_main' is 2 form the set to be
# predicted, every other row is used for training/validation.
# NOTE(review): presumably 2 is a placeholder for "label unknown" - confirm against the data source.
# Explicit copies make both frames independent of `df`, so later cells can add or
# overwrite columns on them without pandas raising SettingWithCopyWarning.
df_train = df[df['RPAIR_main'] != 2].copy()
df_test = df[df['RPAIR_main'] == 2].copy()

print(f'Train: {df_train.shape}, Test: {df_test.shape} \n')

# relative frequency of each label in the training rows (rounded to 2 decimals)
print('RPAIR value counts:', df_train['RPAIR_main'].value_counts(normalize=True).round(2))

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the label; the label Series is copied so it
# does not share data with df_train.
X = df_train.drop(columns='RPAIR_main')
y = df_train['RPAIR_main'].copy()

# Hold out 20% of the labelled rows for validation, stratified on the label so
# both parts keep the same class proportions; the seed is fixed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(*(part.shape for part in (X_train, X_val, y_train, y_val)))

In [None]:
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score

def forest(X_train, y_train, X_val, y_val):
    """Tune a random forest with a randomized search and report validation results.

    Samples 10 hyper-parameter combinations from the lists below, scores each with
    3-fold cross-validation on the training split, and takes the search's
    ``best_estimator_``. That estimator is then evaluated on the validation split:
    the accuracy is printed and the confusion matrix is shown as a heatmap.

    Parameters
    ----------
    X_train, y_train
        Features and labels used for the hyper-parameter search.
    X_val, y_val
        Held-out features and labels, used only for the printed accuracy and the
        confusion-matrix plot.

    Returns
    -------
    RandomForestClassifier
        The best estimator found by the search.
    """
    from sklearn.metrics import confusion_matrix
    import matplotlib.pyplot as plt
    import seaborn as sns

    param_grid = {
        'n_estimators': [50, 100, 200, 300],
        'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False]
    }

    rf_clf = RandomForestClassifier(random_state=42)
    rf_random = RandomizedSearchCV(estimator=rf_clf, param_distributions=param_grid, n_iter=10, cv=3, verbose=1,
                                    random_state=42, n_jobs=-1)
    rf_random.fit(X_train, y_train)

    # evaluate the best model on the held-out validation split
    best_model = rf_random.best_estimator_
    y_pred = best_model.predict(X_val)

    # compute the accuracy once and report it
    accuracy = accuracy_score(y_val, y_pred)
    print(f'Accuracy score: {accuracy}')

    # Sorted union of observed and predicted labels: the same ordering
    # confusion_matrix uses by default, made explicit so the heatmap ticks show the
    # actual class labels instead of positional indices.
    class_labels = np.union1d(y_val, y_pred)
    matrix = confusion_matrix(y_val, y_pred, labels=class_labels)

    sns.set(font_scale=1.5)
    sns.set_style('whitegrid')
    plt.figure(figsize=(4, 4))
    sns.heatmap(matrix, annot=True, fmt='g', cmap='Blues', cbar=False,
                xticklabels=class_labels, yticklabels=class_labels)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    return best_model

def xgboost(X_train, y_train, X_val, y_val):
    """Tune an XGBoost classifier with a randomized search and report validation results.

    Samples 50 hyper-parameter combinations from the lists below, scores each with
    3-fold cross-validation on the training split, and takes the search's
    ``best_estimator_``. That estimator is then evaluated on the validation split:
    the accuracy is printed and the confusion matrix is shown as a heatmap.

    Parameters
    ----------
    X_train, y_train
        Features and labels used for the hyper-parameter search.
    X_val, y_val
        Held-out features and labels, used only for the printed accuracy and the
        confusion-matrix plot.

    Returns
    -------
    xgb.XGBClassifier
        The best estimator found by the search.
    """
    from sklearn.metrics import confusion_matrix
    import matplotlib.pyplot as plt
    import seaborn as sns

    param_grid = {
        'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
        'max_depth': [3, 4, 5, 6, 8, 10, 12, 15],
        'min_child_weight': [1, 3, 5, 7],
        'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
        'colsample_bytree': [0.3, 0.4, 0.5, 0.7]
    }

    # No objective / num_class is forced here: XGBClassifier picks a binary or
    # multi-class objective from the labels it is fitted on. The previous
    # hard-coded 'multi:softmax' with num_class=3 reserved a third class even
    # though rows labelled 2 are removed from the training data.
    # NOTE(review): assumes the remaining labels are 0..k-1 (e.g. 0/1) - confirm.
    xgb_clf = xgb.XGBClassifier(random_state=42)
    xgb_random = RandomizedSearchCV(estimator=xgb_clf, param_distributions=param_grid, n_iter=50, cv=3, verbose=1,
                                    random_state=42, n_jobs=-1)
    xgb_random.fit(X_train, y_train)

    # evaluate the best model on the held-out validation split
    best_model = xgb_random.best_estimator_
    y_pred = best_model.predict(X_val)

    # compute the accuracy once and report it
    accuracy = accuracy_score(y_val, y_pred)
    print(f'Accuracy score: {accuracy}')

    # Sorted union of observed and predicted labels: the same ordering
    # confusion_matrix uses by default, made explicit so the heatmap ticks show the
    # actual class labels instead of positional indices.
    class_labels = np.union1d(y_val, y_pred)
    matrix = confusion_matrix(y_val, y_pred, labels=class_labels)

    sns.set(font_scale=1.5)
    sns.set_style('whitegrid')
    plt.figure(figsize=(4, 4))
    sns.heatmap(matrix, annot=True, fmt='g', cmap='Blues', cbar=False,
                xticklabels=class_labels, yticklabels=class_labels)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    return best_model

# Train the model used by the following cells. To try gradient boosting instead, swap in:
# best_model = xgboost(X_train, y_train, X_val, y_val)
best_model = forest(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val)

In [None]:
# predict on test set: the features are every column except the label column
y_test = best_model.predict(df_test.drop(columns='RPAIR_main'))

# Replace the existing 'RPAIR_main' values with the predictions. assign() returns
# a new frame (the column keeps its position), which avoids writing into a
# filtered slice of `df` and the SettingWithCopyWarning that comes with it.
df_test = df_test.assign(RPAIR_main=y_test)
display(df_test.head())
print(df_test['RPAIR_main'].value_counts(normalize=True).round(2))

In [None]:
# Stack the training rows and the predicted rows back into one frame, ordered by index.
df_final = pd.concat((df_train, df_test), axis=0).sort_index(axis=0)

# shapes of the recombined frame and of the originally loaded frame, side by side
print(*(frame.shape for frame in (df_final, df)))

In [None]:
# read 'pairs_final_RPAIRS.csv', the table that receives the label column
pairs = pd.read_csv('../data/pairs_final_RPAIRS.csv', index_col=0)

# Add the labels from df_final as a new 'RPAIR_main_pred' column.
# `.values` makes this a positional assignment: row i of df_final goes to row i of pairs.
# NOTE(review): assumes both files list the pairs in the same (index-sorted) order - confirm.
pairs['RPAIR_main_pred'] = df_final['RPAIR_main'].values

# save the result to a new file, 'pairs_final_RPAIRS_pred.csv' (the input file is left untouched)
pairs.to_csv('../data/pairs_final_RPAIRS_pred.csv')

# last expression of the cell, so the notebook actually renders the preview
pairs.head()