In [1]:
import os

import dataframe_image as dfi
import joblib
import pandas as pd
from sklearn.base import clone
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, precision_score
from sklearn.model_selection import train_test_split


In [2]:
# Build the path to the training CSV: it lives in the `data` folder located
# in the parent of the current working directory.
path = 'data'
filename = 'cleaned_data_train.csv'

current = os.getcwd()
root = os.path.split(current)[0]  # parent directory of the working directory

file = os.path.join(root, path, filename)
file

'/Users/cynthiaxu/Documents/MIDS/05_2024 Summer/210 Capstone/210-capstone-clinicaltrials/data/cleaned_data_train.csv'

In [3]:
# Read the cleaned training data from the CSV path built earlier.
data = pd.read_csv(file)

# Restore the previously trained Random Forest model from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files that come from a trusted source.
model_path = 'model_rf.pkl'
rf_model = joblib.load(model_path)

# The label being predicted is 'study_eq_labels'.
target = 'study_eq_labels'

# Columns removed from the feature matrix: the trial identifier, the target
# itself, and (presumably) other outcome-derived columns -- TODO confirm.
exclude_columns = [
    'protocolSection_identificationModule_nctId',
    'study_eq_labels',
    'study_duration_days',
    'primary_eq_labels',
    'primary_study_duration_days',
    'primary_eq_bins',
    'study_eq_bins',
]

y = data[target]
X = data.drop(columns=exclude_columns)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Leave-one-feature-out ablation: for every feature, retrain the Random Forest
# without that column and record the resulting metrics on the held-out split.
result_columns = ['feature_dropped', 'accuracy', 'precision', 'mean_squared_error', 'mean_absolute_error']

# Get the list of features
features = X.columns.tolist()

# Collect one record per ablation run and build the DataFrame once at the end.
# Growing a DataFrame with pd.concat inside the loop is quadratic and triggers
# pandas' FutureWarning about concatenating empty entries.
records = []

for feature in features:
    # Drop the feature from both splits
    X_train_dropped = X_train.drop(columns=[feature])
    X_test_dropped = X_test.drop(columns=[feature])

    # Train an unfitted copy with the same hyperparameters so the loaded
    # rf_model is not overwritten by each ablation run.
    ablated_model = clone(rf_model)
    ablated_model.fit(X_train_dropped, y_train)

    # Make predictions on the held-out rows
    y_pred = ablated_model.predict(X_test_dropped)

    records.append({
        'feature_dropped': feature,
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average='weighted'),
        # NOTE(review): MSE/MAE treat the class labels as numbers; this assumes
        # the labels are numeric and ordered -- confirm.
        'mean_squared_error': mean_squared_error(y_test, y_pred),
        'mean_absolute_error': mean_absolute_error(y_test, y_pred),
    })

# Passing the column list keeps the expected layout even when there are no features.
results = pd.DataFrame(records, columns=result_columns)

results


  results = pd.concat([results, pd.DataFrame([row])], ignore_index=True)


Unnamed: 0,feature_dropped,accuracy,precision,mean_squared_error,mean_absolute_error
0,number_of_conditions,0.561404,0.567765,0.649123,0.508772
1,number_of_groups,0.574561,0.585279,0.622807,0.491228
2,age_group,0.548246,0.561289,0.622807,0.508772
3,num_locations,0.570175,0.565109,0.653509,0.504386
4,location,0.561404,0.566156,0.609649,0.495614
5,num_inclusion,0.54386,0.541778,0.679825,0.530702
6,num_exclusion,0.552632,0.553227,0.644737,0.513158
7,number_of_intervention_types,0.52193,0.528678,0.714912,0.557018
8,sponsor_type,0.574561,0.587887,0.635965,0.495614
9,intervention_model,0.557018,0.56063,0.640351,0.508772


In [5]:
# Order the ablation runs from highest to lowest test accuracy.
results = results.sort_values('accuracy', ascending=False)

# Apply a green background gradient and attach a caption to the table.
styled_df = (
    results.style
    .background_gradient(cmap='Greens')
    .set_caption('Feature Importance')
)
styled_df


Unnamed: 0,feature_dropped,accuracy,precision,mean_squared_error,mean_absolute_error
31,max_treatment_duration,0.587719,0.591503,0.570175,0.464912
18,prevention_purpose,0.583333,0.583845,0.627193,0.486842
24,radiation_intervention,0.583333,0.581912,0.640351,0.491228
25,biological_intervention,0.574561,0.573942,0.622807,0.491228
8,sponsor_type,0.574561,0.587887,0.635965,0.495614
1,number_of_groups,0.574561,0.585279,0.622807,0.491228
3,num_locations,0.570175,0.565109,0.653509,0.504386
17,diagnostic_purpose,0.565789,0.57458,0.592105,0.486842
22,behavioral_intervention,0.565789,0.566581,0.657895,0.508772
19,supportive_purpose,0.565789,0.562413,0.631579,0.5


In [6]:
# Destination for the exported table image: the `figures` folder in the
# parent of the current working directory.
current = os.getcwd()
parent = os.path.split(current)[0]  # parent directory of the working directory

figure_parts = ('figures', 'feature_ablation.png')
save_path = os.path.join(parent, *figure_parts)
save_path

'/Users/cynthiaxu/Documents/MIDS/05_2024 Summer/210 Capstone/210-capstone-clinicaltrials/figures/feature_ablation.png'

In [7]:
# Export the styled ablation table as an image at save_path.
# Create the destination folder first so the export does not fail when the
# directory does not exist yet (e.g. on a fresh checkout).
os.makedirs(os.path.dirname(save_path), exist_ok=True)
dfi.export(styled_df, save_path)