In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv('/Users/adelinechin/210/cleaned_data_train.csv')

## Processing

In [6]:
df = df.drop(columns=['primary_study_duration_days', 'study_duration_days', 'phase_PHASE2_ PHASE3', 'phase_PHASE3'])

In [7]:
# one-hot encode nominal features (age_group, location, resp_party, intervention_model, masking)
df['youth'] = df['age_group'].apply(lambda x: 1 if x == 0 or x == 2 else 0)
df['adult'] = df['age_group'].apply(lambda x: 1 if x == 1 or x == 2 else 0)
df['us_loc'] = df['location'].apply(lambda x: 1 if x==0 or x==2 else 0)
df['non_us_loc'] = df['location'].apply(lambda x: 1 if x==1 or x==1 else 0)
df['pi_resp'] = df['resp_party'].apply(lambda x: 1 if x==0 or x==2 else 0)
df['sponsor_resp'] = df['resp_party'].apply(lambda x: 1 if x==1 or x==2 else 0)
df['single_intervention'] = df['intervention_model'].apply(lambda x: 1 if x==0 else 0)
df['parallel_intervention'] = df['intervention_model'].apply(lambda x: 1 if x==1 else 0)
df['other_intervention'] = df['intervention_model'].apply(lambda x: 1 if x==2 else 0)
df['no_mask'] = df['masking'].apply(lambda x: 1 if x==0 else 0)
df['single_mask'] = df['masking'].apply(lambda x: 1 if x==1 else 0)
df['double_mask'] = df['masking'].apply(lambda x: 1 if x==2 else 0)
df['triple_mask'] = df['masking'].apply(lambda x: 1 if x==3 else 0)
df['quadruple_mask'] = df['masking'].apply(lambda x: 1 if x==4 else 0)

In [8]:
# drop original columns
df = df.drop(columns=['age_group', 'location', 'resp_party', 'intervention_model', 'masking', 'protocolSection_identificationModule_nctId'])

In [12]:
df.columns

Index(['study_eq_labels', 'number_of_conditions', 'number_of_groups',
       'num_locations', 'num_inclusion', 'num_exclusion',
       'number_of_intervention_types', 'sponsor_type', 'has_dmc', 'allocation',
       'enroll_count', 'healthy_vol', 'treatment_purpose',
       'diagnostic_purpose', 'prevention_purpose', 'supportive_purpose',
       'procedure_intervention', 'device_intervention',
       'behavioral_intervention', 'drug_intervention',
       'radiation_intervention', 'biological_intervention',
       'os_outcome_measure', 'dor_outcome_measure', 'ae_outcome_measure',
       'primary_max_days', 'secondary_max_days', 'max_treatment_duration',
       'min_treatment_duration', 'survival_5yr_relative',
       'conditions_category_num', 'youth', 'adult', 'us_loc', 'non_us_loc',
       'pi_resp', 'sponsor_resp', 'single_intervention',
       'parallel_intervention', 'other_intervention', 'no_mask', 'single_mask',
       'double_mask', 'triple_mask', 'quadruple_mask'],
      dtype='

In [15]:
# replace NAs with 0
df['num_inclusion'] = df['num_inclusion'].apply(lambda x: 0 if x is None else x)
df['num_exclusion'] = df['num_exclusion'].apply(lambda x: 0 if x is None else x)
df['primary_max_days'] = df['primary_max_days'].apply(lambda x: 0 if x is None else x)
df['secondary_max_days'] = df['secondary_max_days'].apply(lambda x: 0 if x is None else x)

In [17]:
# process test file
test_df = pd.read_csv('/Users/adelinechin/210/cleaned_data_test.csv')

In [18]:
test_df = test_df.drop(columns=['primary_study_duration_days', 'study_duration_days', 'phase_PHASE2_ PHASE3', 'phase_PHASE3'])
test_df['youth'] = test_df['age_group'].apply(lambda x: 1 if x == 0 or x == 2 else 0)
test_df['adult'] = test_df['age_group'].apply(lambda x: 1 if x == 1 or x == 2 else 0)
test_df['us_loc'] = test_df['location'].apply(lambda x: 1 if x==0 or x==2 else 0)
test_df['non_us_loc'] = test_df['location'].apply(lambda x: 1 if x==1 or x==1 else 0)
test_df['pi_resp'] = test_df['resp_party'].apply(lambda x: 1 if x==0 or x==2 else 0)
test_df['sponsor_resp'] = test_df['resp_party'].apply(lambda x: 1 if x==1 or x==2 else 0)
test_df['single_intervention'] = test_df['intervention_model'].apply(lambda x: 1 if x==0 else 0)
test_df['parallel_intervention'] = test_df['intervention_model'].apply(lambda x: 1 if x==1 else 0)
test_df['other_intervention'] = test_df['intervention_model'].apply(lambda x: 1 if x==2 else 0)
test_df['no_mask'] = test_df['masking'].apply(lambda x: 1 if x==0 else 0)
test_df['single_mask'] = test_df['masking'].apply(lambda x: 1 if x==1 else 0)
test_df['double_mask'] = test_df['masking'].apply(lambda x: 1 if x==2 else 0)
test_df['triple_mask'] = test_df['masking'].apply(lambda x: 1 if x==3 else 0)
test_df['quadruple_mask'] = test_df['masking'].apply(lambda x: 1 if x==4 else 0)

test_df = test_df.drop(columns=['age_group', 'location', 'resp_party', 'intervention_model', 'masking', 'protocolSection_identificationModule_nctId'])

test_df['num_inclusion'] = test_df['num_inclusion'].apply(lambda x: 0 if x is None else x)
test_df['num_exclusion'] = test_df['num_exclusion'].apply(lambda x: 0 if x is None else x)
test_df['primary_max_days'] = test_df['primary_max_days'].apply(lambda x: 0 if x is None else x)
test_df['secondary_max_days'] = test_df['secondary_max_days'].apply(lambda x: 0 if x is None else x)

In [None]:
df = df.dropna()
test_df = test_df.dropna()

In [None]:
X_train = df[['number_of_conditions', 'number_of_groups',
       'num_locations', 'num_inclusion', 'num_exclusion',
       'number_of_intervention_types', 'sponsor_type', 'has_dmc', 'allocation',
       'enroll_count', 'healthy_vol', 'treatment_purpose',
       'diagnostic_purpose', 'prevention_purpose', 'supportive_purpose',
       'procedure_intervention', 'device_intervention',
       'behavioral_intervention', 'drug_intervention',
       'radiation_intervention', 'biological_intervention',
       'os_outcome_measure', 'dor_outcome_measure', 'ae_outcome_measure',
       'primary_max_days', 'secondary_max_days', 'max_treatment_duration',
       'min_treatment_duration', 'survival_5yr_relative',
       'conditions_category_num', 'youth', 'adult', 'us_loc', 'non_us_loc',
       'pi_resp', 'sponsor_resp', 'single_intervention',
       'parallel_intervention', 'other_intervention', 'no_mask', 'single_mask',
       'double_mask', 'triple_mask', 'quadruple_mask']]
y_train = df['study_eq_labels']
X_test = test_df[['number_of_conditions', 'number_of_groups',
       'num_locations', 'num_inclusion', 'num_exclusion',
       'number_of_intervention_types', 'sponsor_type', 'has_dmc', 'allocation',
       'enroll_count', 'healthy_vol', 'treatment_purpose',
       'diagnostic_purpose', 'prevention_purpose', 'supportive_purpose',
       'procedure_intervention', 'device_intervention',
       'behavioral_intervention', 'drug_intervention',
       'radiation_intervention', 'biological_intervention',
       'os_outcome_measure', 'dor_outcome_measure', 'ae_outcome_measure',
       'primary_max_days', 'secondary_max_days', 'max_treatment_duration',
       'min_treatment_duration', 'survival_5yr_relative',
       'conditions_category_num', 'youth', 'adult', 'us_loc', 'non_us_loc',
       'pi_resp', 'sponsor_resp', 'single_intervention',
       'parallel_intervention', 'other_intervention', 'no_mask', 'single_mask',
       'double_mask', 'triple_mask', 'quadruple_mask']]
y_train = df['study_eq_labels']
y_test = test_df['study_eq_labels']

## Random Forest

In [22]:
# check all params
clf = RandomForestClassifier(random_state=42)
pprint(clf.get_params())

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [26]:
# create grid for random search
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(1, 11, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [27]:
clf = RandomForestClassifier()
clf_random = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

In [44]:
clf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END bootstrap=True, max_depth=3, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   0.5s
[CV] END bootstrap=True, max_depth=3, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   0.5s
[CV] END bootstrap=True, max_depth=3, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   0.5s
[CV] END bootstrap=False, max_depth=1, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   0.9s
[CV] END bootstrap=False, max_depth=1, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   1.0s
[CV] END bootstrap=False, max_depth=1, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   1.0s
[CV] END bootstrap=True, max_depth=1, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=2000; tota

RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                      10, 11, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [45]:
# best params for random search
clf_random.best_params_

{'n_estimators': 200,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 8,
 'bootstrap': True}

In [58]:
# base model 
base_model = RandomForestClassifier(n_estimators = 200, random_state = 42)
base_model.fit(X_train, y_train)
y_base_pred = base_model.predict(X_test)
base_accuracy = accuracy_score(y_test, y_base_pred)
print(base_accuracy)


0.4666666666666667


In [62]:
base_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [59]:
# best random search
best_random = clf_random.best_estimator_
best_random.fit(X_train, y_train)
y_rand_pred = best_random.predict(X_test)
random_accuracy = accuracy_score(y_test, y_rand_pred)
print(random_accuracy)

0.43137254901960786


In [105]:
# manual param tuning
model = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=2, min_samples_leaf=1, max_features='auto', bootstrap=True, max_depth=None, max_leaf_nodes=None)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.4745098039215686


In [101]:
model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [53]:
confusion_matrix(y_test, y_base_pred)

array([[30,  8,  0,  0,  1],
       [14, 16,  9,  5,  4],
       [ 6,  7, 18, 18,  8],
       [ 9,  4,  9, 19, 15],
       [ 3,  3,  4,  9, 36]])