In [1]:
from train_model import *


Model training complete.


In [2]:
current = os.getcwd()
root = os.path.dirname(current)
path = 'data'
train_file = 'cleaned_data_train.csv'
test_file = 'cleaned_data_test.csv'

train_path = os.path.join(root,path,train_file)
test_path = os.path.join(root,path,test_file)

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

In [8]:
def make_baseline(train_data, test_data, features, target):

    # same model parameters as production model
    model =  RandomForestClassifier(max_depth=None,
                                        min_samples_leaf=1,
                                        min_samples_split=2,
                                        n_estimators=100)
    
    # train the model
    X = train_data[features]
    y = train_data[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    # Perform k-fold cross-validation
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')

    model.fit(X_train, y_train)

    # Predict probabilities for each class
    y_pred_prob = model.predict_proba(X_test)
    # Convert probabilities to class labels
    y_pred = np.argmax(y_pred_prob, axis=1) + 1

    mse = mean_squared_error(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    pre = precision_score(y_test, y_pred, average='weighted')
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    train_row = {
        'model': 'baseline',
        'features': features,
        'set': 'training',
        'accuracy': acc,
        'mean_accuracy_kfold': cv_results.mean(),
        'precision': pre,
        'r_squared': r2,
        'mean_squared_error': mse,
        'mean_absolute_error': mae,
    }

    # print(f"Training: {row}")

    test_X = test_data[features]
    test_y = test_data[target]

    y_pred_prob = model.predict_proba(test_X)
    # Convert probabilities to class labels
    y_pred = np.argmax(y_pred_prob, axis=1) + 1

    # y_pred = model.predict(X)


    mse = mean_squared_error(test_y, y_pred)
    acc = accuracy_score(test_y, y_pred)
    pre = precision_score(test_y, y_pred, average='weighted')
    mae = mean_absolute_error(test_y, y_pred)
    r2 = r2_score(test_y, y_pred)

    test_row = {
        'model': 'baseline',
        'features': features,
        'set': 'test',
        'accuracy': acc,
        'precision': pre,
        'r_squared': r2,
        'mean_squared_error': mse,
        'mean_absolute_error': mae
        }
    
    # print(f"Testing: {row}")
    return train_row, test_row



In [9]:
combined_df = pd.DataFrame()

dict1, dict2 = make_baseline(train_data=train_data, test_data=test_data, features=['enroll_count'], target='study_eq_labels')

df1 = pd.DataFrame([dict1])
df2 = pd.DataFrame([dict2])

# Concatenate the DataFrames
combined_df = pd.concat([combined_df, df1, df2], ignore_index=True)
combined_df

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,model,features,set,accuracy,mean_accuracy_kfold,precision,r_squared,mean_squared_error,mean_absolute_error
0,baseline,[enroll_count],training,0.304124,0.420182,0.299282,-1.826265,1.798969,1.015464
1,baseline,[enroll_count],test,0.245192,,0.235417,-2.32135,2.084135,1.137019


In [15]:
train_data.columns

Index(['protocolSection_identificationModule_nctId',
       'primary_study_duration_days', 'study_duration_days', 'primary_eq_bins',
       'study_eq_bins', 'study_eq_labels', 'primary_eq_labels',
       'number_of_conditions', 'number_of_groups', 'age_group',
       'num_locations', 'location', 'num_inclusion', 'num_exclusion',
       'number_of_intervention_types', 'sponsor_type', 'intervention_model',
       'resp_party', 'has_dmc', 'allocation', 'masking', 'enroll_count',
       'healthy_vol', 'treatment_purpose', 'diagnostic_purpose',
       'prevention_purpose', 'supportive_purpose', 'procedure_intervention',
       'device_intervention', 'behavioral_intervention', 'drug_intervention',
       'radiation_intervention', 'biological_intervention',
       'os_outcome_measure', 'dor_outcome_measure', 'ae_outcome_measure',
       'primary_max_days', 'secondary_max_days', 'max_treatment_duration',
       'min_treatment_duration', 'survival_5yr_relative',
       'phase_PHASE2_PHASE3', 'p

In [20]:
all = [i for i in train_data.columns]

exclude_columns = [
    "protocolSection_identificationModule_nctId",
    "study_eq_labels",
    "study_duration_days",
    "primary_eq_labels",
    "primary_study_duration_days",
    "primary_eq_bins",
    "study_eq_bins",
]

allfeatures = [i for i in all if i not in exclude_columns]

# aka ever feature engineered that was not just a simple mapping from string to int
engr_features = [
    "num_inclusion",
    "num_exclusion",
    "primary_max_days",
    "secondary_max_days",
    "survival_5yr_relative",
    "max_treatment_duration",
    "min_treatment_duration",
    "os_outcome_measure",
    "dor_outcome_measure",
    "ae_outcome_measure",
    "treatment_purpose",
    "diagnostic_purpose",
    "prevention_purpose",
    "supportive_purpose",
    "procedure_intervention",
    "device_intervention",
    "behavioral_intervention",
    "drug_intervention",
    "radiation_intervention",
    "biological_intervention",
    "number_of_conditions",
    "number_of_groups",
    "number_of_intervention_types",
    "num_locations"
]

In [21]:
basic_features = [i for i in allfeatures if i not in engr_features]
basic_features

['age_group',
 'location',
 'sponsor_type',
 'intervention_model',
 'resp_party',
 'has_dmc',
 'allocation',
 'masking',
 'enroll_count',
 'healthy_vol',
 'phase_PHASE2_PHASE3',
 'phase_PHASE3']

In [22]:
dict3, dict4 = make_baseline(train_data=train_data, test_data=test_data, features=basic_features, target='study_eq_labels')

df3 = pd.DataFrame([dict3])
df4 = pd.DataFrame([dict4])

# Concatenate the DataFrames
combined_df = pd.concat([combined_df, df3, df4], ignore_index=True)
combined_df

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,model,features,set,accuracy,mean_accuracy_kfold,precision,r_squared,mean_squared_error,mean_absolute_error
0,baseline,[enroll_count],training,0.304124,0.420182,0.299282,-1.826265,1.798969,1.015464
1,baseline,[enroll_count],test,0.245192,,0.235417,-2.32135,2.084135,1.137019
2,baseline,"[age_group, location, sponsor_type, interventi...",training,0.159794,0.429115,0.16461,-1.7598,1.835052,1.14433
3,baseline,"[age_group, location, sponsor_type, interventi...",test,0.211538,,0.202596,-1.892294,1.814904,1.103365
