# Setup stuff

In [None]:
#=====[ Setup - don't modify ]=====
%matplotlib inline
%load_ext autoreload
%autoreload 2
import sys
import csv
import string
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Read dataset from disk

In [None]:
dataFilePath = #Point this to your own training sets
f = open(dataFilePath, 'rU')
dictReader = csv.DictReader(f)
data = []
for row in dictReader:
    data.append(row)

dataset = pd.DataFrame(data)

dataset.age_years = dataset.age_years.apply(float)

dataset.head()

print 'sample source: '
print dataset['sample_source'].value_counts()

In [None]:
# Non-null value counts per column for every (age, outcome) combination.
dataset.groupby(by=['age_years', 'outcome']).count()

In [None]:
# Non-null value counts per column for every (sample source, outcome) combination.
dataset.groupby(by=['sample_source', 'outcome']).count()

# Feature preparation

In [None]:
#we encode different features differently - this is the list features for each encoding type

scalar_encoding_features = [ #INSERT LIST OF FEATURES THAT YOU WANT SCALAR-ENCODED
]


production_one_hot_encoding_features = [ #INSERT LIST OF FEATURES THAT YOU WANT ONE-HOT-ENCODED
                            ]


production_discrete_encoding_features = [ # INSERT LIST OF FEATURES THAT YOU WANT DISCRETE-ENCODED
]


if run_encoding == 'default':
    one_hot_encoding_features = production_one_hot_encoding_features + production_discrete_encoding_features
    discrete_encoding_features = []
elif run_encoding == 'production':
    one_hot_encoding_features = production_one_hot_encoding_features
    discrete_encoding_features = production_discrete_encoding_features
elif run_encoding == 'scalar':
    one_hot_encoding_features = []
    discrete_encoding_features = []
    scalar_encoding_features += production_one_hot_encoding_features
    scalar_encoding_features += production_discrete_encoding_features
else:
    raise ValueError('Error, run_encoding: '+run_encoding+' not understood')

feature_encoding_map = {}
for feature in scalar_encoding_features:
    print 'add feature ', feature, ' to feature encoding map'
    feature_encoding_map[feature] = 'scalar'
for feature in one_hot_encoding_features:
    feature_encoding_map[feature] = 'one_hot'
for feature in discrete_encoding_features:
    feature_encoding_map[feature] = 'discrete'
print 'feature_encoding_map: ', feature_encoding_map

# --------------------------------------------------------------------------------------------

# Training Guardian Questionnaire Model 

# --------------------------------------------------------------------------------------------

We filter the dataset on age. The 2-3 year old range is too sparse on its own, so runs restricted to young children also include 4 year olds.

In [None]:
max_years = 4 if only_young_children else 100
min_years = 4 if only_old_children else 0
dataset = dataset[(dataset.age_years>=min_years) & (dataset.age_years<=max_years)].reset_index(drop=True)

import time
date_string = time.strftime("%-m.%-d.%y")

output_directory = 'output/'
filename_prefix = run_desc+'_'+date_string

print 'max_years: ', max_years
print 'run_desc: ', run_desc

## Adding FUSION features (young kids version)

In [None]:

if (do_feature_engineering and only_young_children) or restricted_features=='young_official':

    #HALIM: EXPERIMENTING WITH FUSION FEATURES HERE

    #fusion creation function
    def max_severity(data_row):
        return_value = 'missing'
        for data_element in data_row:
            try:
                value = int(data_element)
                if value > 4:
                    continue
                if return_value == 'missing':
                    return_value = value
                elif value>return_value:
                    return_value = value
            except ValueError:
                continue
        return str(return_value)

    #list of fusion features
    fusion_features = [ #INSERT YOUR OWN LIST OF FUSION FEATURES HERE 
    ]
        
    #fusion feature defintion
    for feature in fusion_features:
        dataset[feature] = dataset[[feature+'_current', feature+'_ever']].apply(max_severity, axis=1)

    #define their feature encoding as 'discrete'
    for feature in fusion_features:
        if run_encoding == 'default':
            feature_encoding_map[feature] = 'one_hot'
        elif run_encoding == 'production':
            feature_encoding_map[feature] = 'discrete'
        elif run_encoding == 'scalar':
            feature_encoding_map[feature] = 'scalar'
        else:
            raise ValueError('Error, run_encoding: '+run_encoding+' not understood')

    #include them in our features
    print 'adding in fusion_features: ', fusion_features
    feature_columns += fusion_features

## Build a model with all data as a warm-up, and do default feature selection

In [None]:
n_estimators = 200
criterion = "entropy"
max_features = "log2"
max_depth = 5

outcome_column = 'outcome'
outcome_classes = ['autism','not']

outcome_class_priors =  [(1.0/2.0), (1.0/2.0)]       # IN CLINICAL CENTRES

dunno_range = (0.2,0.9)

number_of_features_to_keep = 25

print 'balance with balance_dimensions: ', balance_dimensions
sample_weights = balance_dataset_on_dimensions(dataset, balance_dimensions, verbose=False)
print 'age_as_feature: ', age_as_feature
if only_young_children==True:
    for index in range(len(dataset)):
        if dataset['age_years'][index]==3:
            sample_weights[index] *= 3
        if dataset['age_years'][index]==2:
            sample_weights[index] *= 2
            
all_results_list = []
feature_selection_iterations = 50 if run_choices=='default' else 1
cross_validation_age_weights = balance_dataset_on_dimensions(dataset, ['age_years'], verbose=False)

for pei in range(feature_selection_iterations):
    print 'On pseudo experiment ', pei, ' of ', feature_selection_iterations 

    model, features, y_predicted_without_dunno, y_predicted_with_dunno, y_predicted_probs =\
        all_data_model(dataset, feature_columns, feature_encoding_map,
        outcome_column, sample_weights, dunno_range, RandomForestClassifier,  n_estimators = n_estimators,
        criterion = criterion, max_features = max_features, max_depth=max_depth)
  
    important_features = get_important_features(model, features, 0.001)
    
    metrics = get_classifier_performance_metrics(outcome_classes, outcome_class_priors, dataset[outcome_column], y_predicted_without_dunno, y_predicted_with_dunno, y_predicted_probs)
    for feature in important_features:
        print(feature)
        print("\n") 
    print_classifier_performance_metrics(outcome_classes, metrics)

    top_feature_columns =  get_best_features(important_features, number_of_features_to_keep, ['='], [])
    default_top_N_features = cp.deepcopy(top_feature_columns[:N_for_top_N])
    print 'default_top_N_features: ', default_top_N_features, ', len: ', len(default_top_N_features)
    
    iteration_results_dict = {}
    for feature in features:
        if feature in default_top_N_features:
            iteration_results_dict[feature+'_present'] = 1
        else:
            iteration_results_dict[feature+'_present'] = 0

    ### Now compare each iteration of performance metrics 
    n_folds = 20
    if only_young_children==False:
        output = cross_validate_model(dataset, sample_weights, default_top_N_features,
            feature_encoding_map, outcome_column, dunno_range, n_folds, outcome_classes, outcome_class_priors,
            RandomForestClassifier,  validation_weights=cross_validation_age_weights, n_estimators = n_estimators,
            criterion = criterion, max_features = max_features, max_depth=max_depth)
    else:
        output = cross_validate_model_with_addon(dataset, sample_weights, default_top_N_features,
           feature_encoding_map, outcome_column, dunno_range, n_folds, outcome_classes, outcome_class_priors,
           RandomForestClassifier,  validation_weights=cross_validation_age_weights, n_estimators = n_estimators,
           criterion = criterion, max_features = max_features, max_depth=max_depth)
    iteration_results_dict['AUC'] = output['overall_metrics']['without_dunno']['auc']
    iteration_results_dict['autism recall'] = output['overall_metrics']['without_dunno']['dataset_recall_per_class']['autism']
    iteration_results_dict['not recall'] = output['overall_metrics']['without_dunno']['dataset_recall_per_class']['not']
    iteration_results_dict['autism precision [Dataset]'] = output['overall_metrics']['without_dunno']['dataset_precision_per_class']['autism']
    iteration_results_dict['not precision [Dataset]'] = output['overall_metrics']['without_dunno']['dataset_precision_per_class']['not']
    iteration_results_dict['autism precision [Reallife]'] = output['overall_metrics']['without_dunno']['reallife_precision_per_class']['autism']
    iteration_results_dict['not precision [Reallife]'] = output['overall_metrics']['without_dunno']['reallife_precision_per_class']['not']
    all_results_list.append(iteration_results_dict)
    print_classifier_performance_metrics(outcome_classes, output['overall_metrics'])

if run_choices=='default':
    all_results_df = pd.DataFrame(all_results_list)
    print 'all_results_df'
    all_results_df.to_csv(output_directory+"/"+filename_prefix+'_selection_variations.csv')

In [None]:
for pair in sorted(zip(features, model.feature_importances_), key=lambda x: x[1], reverse=True):
    print pair

## Build a model repeatedly (on 2-3 year olds only when restricting to young children) and keep tabs on which features get used most often

In [None]:
from sklearn.ensemble import RandomForestClassifier

n_estimators = 200
criterion = "entropy"
max_features = "log2"
max_depth = 5

outcome_column = 'outcome'
outcome_classes = ['autism','not']

dunno_range = (0.2,0.9)


feature_tally = {}
number_of_features_to_keep = 15

print 'allowed features: '
for feature in feature_columns:
    print 'feature: ', feature, ', encoding: ', feature_encoding_map[feature]
    #if 'q46' in feature or 'q60' in feature:
    if True:
        print 'deatils: ', dataset[feature].value_counts()

#do the following many times
number_of_tries = 50 if restricted_features == 'tally_top_N' else 1
for i in range(0,number_of_tries):
    print 'On try number ', i, ' of ', number_of_tries
    
    if only_young_children:
        #grab a random subsample of 2-3 year olds only
        tally_dataset = dataset[dataset.age_years<4].reset_index(drop=True)
    else:
        tally_dataset = cp.deepcopy(dataset)

    dataset_for_this_try = subsample_per_class(tally_dataset, outcome_column, {'autism':0.9, 'not':0.9} )
    
     #sprinkle some random features
     dataset_for_this_try['random1'] = np.random.choice(3, len(dataset_for_this_try), p=[0.1, 0.6, 0.3])
     dataset_for_this_try['random2'] = np.random.choice(4, len(dataset_for_this_try), p=[0.25, 0.25, 0.25, 0.25])
     dataset_for_this_try['random3'] = np.random.choice(2, len(dataset_for_this_try), p=[0.6, 0.4])

    if i==0:
        print 'balancing dimensions: ', balance_dimensions
        print 'features and encodings: '
        for feature in feature_columns:
            print 'feature: ', feature, ', encoding: ', feature_encoding_map[feature]
        print 'dataset_for_this_try has len: ', len(dataset_for_this_try.index), ', ages: ', dataset_for_this_try['age_years'].value_counts() 
        
    sample_weights_for_this_try = balance_dataset_on_dimensions(dataset_for_this_try, balance_dimensions,
                                                verbose=False)

    #here we bump 3 years olds sample weights by a factor of 2 relative to the 2 year olds
    if only_young_children==True:
        for index in range(len(dataset_for_this_try)):
            if dataset_for_this_try['age_years'][index]==3:
                sample_weights_for_this_try[index] *= 2

       
    model, features, y_predicted_without_dunno, y_predicted_with_dunno,\
        y_predicted_probs = all_data_model(dataset_for_this_try,
        feature_columns, feature_encoding_map, outcome_column,
        sample_weights_for_this_try, dunno_range, RandomForestClassifier,  n_estimators = n_estimators,
        criterion = criterion, max_features = max_features, max_depth=max_depth)
    
    important_features = get_important_features(model, features, 0.01)
    
    top_feature_columns = get_best_features(important_features, number_of_features_to_keep, ['='], [])
    for feature in top_feature_columns:
        if feature in feature_tally:
            feature_tally[feature]+=1
        else:
            feature_tally[feature]=1

tally = sorted(feature_tally.items(), key=lambda pair: pair[1], reverse=True)
tally

print 'tally: ', tally
top_N_tally_features_superset = [ele[0] for ele in tally][:N_for_top_N+10]
print 'top_N_tallly_features_superset: ', top_N_tally_features_superset

### compare feature selection methods

In [None]:
 if restricted_features == 'tally_top_N':
     base_feature_sel_tuple = [(ele, default_top_N_features.index(ele), None if ele not in top_N_tally_features else top_N_tally_features.index(ele)) for ele in default_top_N_features]
     tally_feature_sel_tuple = [(ele, top_N_tally_features.index(ele), None if ele not in default_top_N_features else default_top_N_features.index(ele)) for ele in top_N_tally_features]
     print 'base_feature_sel_tuple: ', base_feature_sel_tuple
     print '\n\n'
     print 'tally_feature_sel_tuple: ', tally_feature_sel_tuple

### Now we restrict the feature set to the ones chosen often in the experiment above, and we build an all-data model to find the feature importance order

In [None]:
feature_columns = [x[0] for x in tally if (x[1]>=25 and 'random' not in x[0]) ]

for feature in feature_columns:
    print 'feature: ', feature
    print 'in df? ', (feature in dataset.columns)
    
print 'After possible restrictions,  features: ', feature_columns
n_estimators = 200
criterion = "entropy"
max_features = "log2"
max_depth = 5

outcome_column = 'outcome'
outcome_classes = ['autism','not']

dunno_range = (0.2,0.9)


number_of_features_to_keep = N_for_top_N

sample_weights = balance_dataset_on_dimensions(dataset, balance_dimensions, verbose=False)

if only_young_children==True:
    for index in range(len(dataset)):
        if dataset['age_years'][index]==3:
            sample_weights[index] *= 3
        if dataset['age_years'][index]==2:
            sample_weights[index] *= 2


print 'feature_encoding_map[age_years]: ', feature_encoding_map['age_years']
print 'age years value counts: ', dataset['age_years'].value_counts()
model, features, y_predicted_without_dunno, y_predicted_with_dunno, y_predicted_probs =\
    all_data_model(dataset, feature_columns, feature_encoding_map, outcome_column, sample_weights,
    dunno_range, RandomForestClassifier,  n_estimators = n_estimators, criterion = criterion,
    max_features = max_features, max_depth=max_depth)

important_features = get_important_features(model, features, 0.001)
top_feature_columns = get_best_features(important_features, number_of_features_to_keep, ['='], [])
print 'top_feature_columns end up being: ', top_feature_columns


metrics = get_classifier_performance_metrics(outcome_classes, outcome_class_priors, dataset[outcome_column], y_predicted_without_dunno, y_predicted_with_dunno, y_predicted_probs)

print_classifier_performance_metrics(outcome_classes, metrics)

ordered_features = sorted(zip(features, model.feature_importances_), key=lambda x: x[1], reverse=True)
print "\nEncoded features by importance:"
for feature in ordered_features:
    print feature

    
print 'feature_encoding_map: ', feature_encoding_map

print 'Further refine this superset of features: ', sorted(feature_columns)
print 'To this set: ', sorted(top_feature_columns)
if restricted_features == 'tally_top_N':
    feature_columns = top_feature_columns
    base_feature_sel_tuple = [(ele, default_top_N_features.index(ele), None if ele not in feature_columns else feature_columns.index(ele)) for ele in default_top_N_features]
    tally_feature_sel_tuple = [(ele, feature_columns.index(ele), None if ele not in default_top_N_features else default_top_N_features.index(ele)) for ele in feature_columns]
    print 'base_feature_sel_tuple: ', base_feature_sel_tuple
    print '\n\n'
    print 'tally_feature_sel_tuple: ', tally_feature_sel_tuple

### Now we cross validate to gauge model performance

In [None]:
from sklearn.ensemble import RandomForestClassifier

n_estimators = 200
criterion = "entropy"
max_features = "log2"
max_depth = 5

class_weight = None#"balanced" #balanced_subsample"
outcome_column = 'outcome'
outcome_classes = ['autism','not']

dunno_range = (0.2,0.9)
n_folds = 20

print 'Validate autism vs non-autism weights'
dataset['weights'] = sample_weights
autism_df = dataset[dataset['outcome']=='autism']
not_df = dataset[dataset['outcome']=='not']
assert len(autism_df.index) + len(not_df.index) == len(dataset.index)
print 'young_dataset autism sum weights: ', np.sum(autism_df['weights'].values)
print 'young_dataset not sum weights: ', np.sum(not_df['weights'].values)



cross_validation_age_weights = balance_dataset_on_dimensions(dataset, ['age_years'], verbose=False)

#cross validate
print 'Cross validate with dataset of length: ', len(dataset.index)
print 'len young_sample_weights: ', len(sample_weights)
if only_young_children==False:
    output = cross_validate_model(dataset, sample_weights, feature_columns,
        feature_encoding_map, outcome_column, dunno_range, n_folds, outcome_classes, outcome_class_priors,
        RandomForestClassifier,  validation_weights=cross_validation_age_weights, n_estimators = n_estimators,
        criterion = criterion, max_features = max_features, max_depth=max_depth)
else:
    output = cross_validate_model_with_addon(dataset, sample_weights, feature_columns,
        feature_encoding_map, outcome_column, dunno_range, n_folds, outcome_classes, outcome_class_priors,
        RandomForestClassifier,  validation_weights=cross_validation_age_weights, n_estimators = n_estimators,
        criterion = criterion, max_features = max_features, max_depth=max_depth)
#print 'output has len: ', len(young_output), ', vals: ', young_output

print_classifier_performance_metrics(outcome_classes, output['overall_metrics'])


## Adding aggregate severity features

In [None]:
if do_feature_engineering:


    #aggregate severity creation functions
    def max_severity(data_row):
        return_value = 'missing'
        for data_element in data_row:
            try:
                value = int(data_element)
                if value<0 or value>4:
                    continue
                if return_value == 'missing':
                    return_value = value
                elif value>return_value:
                    return_value = value
            except ValueError:
                continue
        return str(return_value)
    def min_severity(data_row):
        return_value = 'missing'
        for data_element in data_row:
            try:
                value = int(data_element)
                if value<0 or value>4:
                    continue
                if return_value == 'missing':
                    return_value = value
                elif value<return_value:
                    return_value = value
            except ValueError:
                continue
        return str(return_value)

    def count_severity_level(data_row, severity_level):
        return_value = 0
        for data_element in data_row:
            if data_element==severity_level:
                return_value += 1
        return return_value


    #list of aggregate severity features
    aggregate_severity_features = ['max_severity',
                                   'min_severity', 
                                   'severity_0_count',
                                   'severity_1_count',
                                   'severity_2_count',
                                   'severity_3_count',
                                   'severity_4_count',
                                  ]


    #list of features to aggregate over
    features_to_aggregate_over = feature_columns


    #aggregate severity feature defintion
    dataset['max_severity'] = dataset[features_to_aggregate_over].apply(max_severity, axis=1)
    dataset['min_severity'] = dataset[features_to_aggregate_over].apply(min_severity, axis=1)
    dataset['severity_0_count'] = dataset[features_to_aggregate_over].apply(lambda row: count_severity_level(row, "0"), axis=1)
    dataset['severity_1_count'] = dataset[features_to_aggregate_over].apply(lambda row: count_severity_level(row, "1"), axis=1)
    dataset['severity_2_count'] = dataset[features_to_aggregate_over].apply(lambda row: count_severity_level(row, "2"), axis=1)
    dataset['severity_3_count'] = dataset[features_to_aggregate_over].apply(lambda row: count_severity_level(row, "3"), axis=1)
    dataset['severity_4_count'] = dataset[features_to_aggregate_over].apply(lambda row: count_severity_level(row, "4"), axis=1)


    #define their feature encoding as 'discrete'
    for feature in aggregate_severity_features:
        if run_encoding == 'default':
            feature_encoding_map[feature] = 'one_hot'
        elif run_encoding == 'production':
            if 'count' in feature:
                feature_encoding_map[feature] = 'scalar'
            else:
                feature_encoding_map[feature] = 'discrete'
        elif run_encoding == 'scalar':
            feature_encoding_map[feature] = 'scalar'      
        else:
            raise ValueError('Error, run_encoding: '+run_encoding+' not understood')

    #include them in our features
    print 'add aggregate_severity_features: ', aggregate_severity_features
    feature_columns += aggregate_severity_features

    dataset[aggregate_severity_features].head(20)

### Rebuild the model with all data to see if aggregate features were useful

In [None]:
n_estimators = 200
criterion = "entropy"
max_features = "log2"
max_depth = 5

outcome_column = 'outcome'
outcome_classes = ['autism','not']

dunno_range = (0.2,0.9)


number_of_features_to_keep = 20

sample_weights = balance_dataset_on_dimensions(dataset, balance_dimensions, verbose=False)

if only_young_children==True:
    for index in range(len(dataset)):
        if dataset['age_years'][index]==3:
            sample_weights[index] *= 3
        if dataset['age_years'][index]==2:
            sample_weights[index] *= 2


model, features, y_predicted_without_dunno, y_predicted_with_dunno, y_predicted_probs =\
    all_data_model(dataset, feature_columns, feature_encoding_map, outcome_column, sample_weights,
    dunno_range, RandomForestClassifier,  n_estimators = n_estimators, criterion = criterion,
    max_features = max_features, max_depth=max_depth, class_weight = class_weight)

important_features = get_important_features(model, features, 0.001)
    
metrics = get_classifier_performance_metrics(outcome_classes, outcome_class_priors, dataset[outcome_column], y_predicted_without_dunno, y_predicted_with_dunno, y_predicted_probs)

print_classifier_performance_metrics(outcome_classes, metrics)

ordered_features = sorted(zip(features, model.feature_importances_), key=lambda x: x[1], reverse=True)
print "\nFeatures by importance:"
for feature in ordered_features:
    print feature

print 'feature_encoding_map: ', feature_encoding_map



### Now we cross validate again to gauge model performance with aggregate features added

In [None]:
from sklearn.ensemble import RandomForestClassifier

n_estimators = 200
criterion = "entropy"
max_features = "log2"
max_depth = 5

class_weight = None#"balanced" #balanced_subsample"
outcome_column = 'outcome'
outcome_classes = ['autism','not']

dunno_range = (0.2,0.9)
n_folds = 20

#cross_validation_age_weights = None
cross_validation_age_weights = balance_dataset_on_dimensions(dataset, ['age_years'], verbose=False)
print 'feature columns to do and encoding: '
for column in feature_columns:
    print column, ', ', feature_encoding_map[column]
if only_young_children==False:
    print 'Do cross validation with age as a feature.'
    output = cross_validate_model(dataset, sample_weights, feature_columns,
        feature_encoding_map, outcome_column, dunno_range, n_folds, outcome_classes, outcome_class_priors,
        RandomForestClassifier,  validation_weights=cross_validation_age_weights,  n_estimators = n_estimators, criterion = criterion, max_features = max_features,
        max_depth=max_depth)
else:
    print 'Do cross validation without age as a feature.'
    output = cross_validate_model_with_addon(dataset, sample_weights, feature_columns,
        feature_encoding_map, outcome_column, dunno_range, n_folds, outcome_classes, outcome_class_priors,
        RandomForestClassifier,  validation_weights=cross_validation_age_weights, n_estimators = n_estimators, criterion = criterion, max_features = max_features,
        max_depth=max_depth)

print_classifier_performance_metrics(outcome_classes, output['overall_metrics'])

print 'feature_encoding_map: ', feature_encoding_map



## Grid search for best decision forest model parameters

In [None]:
#### Grid search modeling function
print 'balance dimensions: ', balance_dimensions

def modeling_function(param_combination):
    def sampling_function_per_try(dataset):
        sample = subsample_per_class(dataset, outcome_column, {'autism':bootstrapping_sample_percent, 'not':bootstrapping_sample_percent})
        return sample
    def ml_function_per_try(dataset_per_try):
        sample_weights_per_try = balance_dataset_on_dimensions(dataset_per_try, balance_dimensions)
        sample_cross_validation_age_weights = balance_dataset_on_dimensions(dataset_per_try, ['age_years'], verbose=False)

        dataset_per_try['weights'] = sample_weights_per_try
        autism_df = dataset_per_try[dataset_per_try['outcome']=='autism']
        not_df = dataset_per_try[dataset_per_try['outcome']=='not']
        assert len(autism_df.index) + len(not_df.index) == len(dataset_per_try.index)
        if only_young_children==True:
            for index in range(len(dataset_per_try)):
                if dataset_per_try['age_years'][index]==3:
                    sample_weights_per_try[index] *= 3
                if dataset_per_try['age_years'][index]==2:
                    sample_weights_per_try[index] *= 2

        if only_young_children==False:
            metrics = cross_validate_model(dataset_per_try, sample_weights_per_try, feature_columns,
                        feature_encoding_map, outcome_column, dunno_range, n_folds, outcome_classes, 
                        outcome_class_priors, RandomForestClassifier, validation_weights=sample_cross_validation_age_weights,                         criterion = param_combination[0], 
                        max_features = param_combination[1], max_depth=param_combination[2], 
                        n_estimators = param_combination[3])
        else:
            metrics = cross_validate_model_with_addon(dataset_per_try, sample_weights_per_try, feature_columns,
                        feature_encoding_map, outcome_column, dunno_range, n_folds, outcome_classes, 
                        outcome_class_priors, RandomForestClassifier, validation_weights=sample_cross_validation_age_weights,  
                        criterion = param_combination[0], 
                        max_features = param_combination[1], max_depth=param_combination[2], 
                        n_estimators = param_combination[3])
        return metrics['overall_metrics']


    averaged_metrics, averaged_metrics_err =  bootstrap(dataset, bootstrapping_number_of_tries, sampling_function_per_try,
                                  ml_function_per_try, return_errs=True, verbose=False)
    print 'For param_combination: ', param_combination, ', AUC: ', averaged_metrics['without_dunno']['auc'], ' +/- ',\
                 averaged_metrics_err['without_dunno']['auc']
    sys.stdout.flush()
    return averaged_metrics, averaged_metrics_err

In [None]:
if do_grid_search:
    print 'feature_encoding_map: ', feature_encoding_map

    n_autism = len(dataset[dataset['outcome']=='autism'].index)
    n_not = len(dataset[dataset['outcome']=='not'].index)
    print 'n_autism: ', n_autism
    print 'n_not: ', n_not
    print 'n_total: ', len(dataset.index)
    assert n_autism + n_not == len(dataset.index)
    print 'About to run grid search for ', run_desc
    print 'ages in dataset: ', dataset['age_years'].value_counts()
    print 'balance on: ', balance_dimensions
    print 'Features list: ', feature_columns
    print 'Details of features to use in grid search:'
    for feature in feature_columns:
        print feature, ', encoding: ', feature_encoding_map[feature]
        print 'values: ', dataset[feature].value_counts()
    sys.stdout.flush()

    
    #this is where we output to
    output_filename = output_directory+"/"+filename_prefix+".gridSearch.modelParams.csv"

    #these are the ML model params that we will grid search over
    criterion = ["entropy"]
    max_features = ['log2', 'sqrt', 0.3]
    max_depth = [4, 5, 6, 7, 8]
    n_estimators = [100, 200]
    param_combinations = get_combinations([criterion, max_features, max_depth, n_estimators])

    #these are bootstrapping parameters
    bootstrapping_number_of_tries = 10     #run every node in the grid search this many times and average out the resulting metrics
    bootstrapping_sample_percent = 0.9    #for every run, random-sample this percentage of the dataset (stratified by target class) 


    #these static params are problem specific
    n_folds = 20

    #this dunno range param is outside of the ML model so let's fix it to some convenient values for now
    dunno_range = (0.2,0.9)


    reporting_function = lambda param_combination, (averaged_metrics, averaged_metrics_err): [ averaged_metrics['without_dunno']['auc'],
                                                                  averaged_metrics_err['without_dunno']['auc'],
                                                                  averaged_metrics['without_dunno']['dataset_precision_per_class']['autism'],
                                                                  averaged_metrics['without_dunno']['reallife_precision_per_class']['autism'],
                                                                  averaged_metrics['without_dunno']['dataset_recall_per_class']['autism'],
                                                                  averaged_metrics_err['without_dunno']['dataset_recall_per_class']['autism'],
                                                                  averaged_metrics['without_dunno']['dataset_precision_per_class']['not'],
                                                                  averaged_metrics['without_dunno']['reallife_precision_per_class']['not'],
                                                                  averaged_metrics['without_dunno']['dataset_recall_per_class']['not'], 
                                                                  averaged_metrics_err['without_dunno']['dataset_recall_per_class']['not'],
                                                                  n_autism,
                                                                  n_not,
                                                                 ]


    #run grid search
    report = grid_search(modeling_function, param_combinations, reporting_function)

    #write outputs to file
    output_file = open(output_filename,'w')
    header = ','.join(['criterion','max_features', 'max_depth', 'n_trees','AUC','AUC err', 'autism precision [Dataset]', 
                   'autism precision [Reallife]', 'autism recall', 'autism recall err', 'not precision [Dataset]',
                   'not precision [Reallife]', 'not recall', 'not recall err', 'n_autism', 'n_not'])
    output_file.write(header+"\n")
    for line in report:
        output_file.write(','.join([str(x) for x in line]))
        output_file.write("\n")
    output_file.close()

    print 'Grid search done for run_desc: ', run_desc

else:
    print 'Skip grid search'


## Grid search for best dunno range

In [None]:
#do_dunno_grid_search = True
if do_dunno_grid_search:
    #this is where we output to
    output_filename = output_directory+"/"+filename_prefix+".dunnoGridSearch.modelParams.csv"



    #these are bootstrapping parameters
    bootstrapping_number_of_tries = 10     #run every node in the grid search this many times and average out the resulting metrics
    bootstrapping_sample_percent = 0.9    #for every run, random-sample this percentage of the dataset (stratified by target class) 


    #these static params are problem specific
    n_folds = 20

    #these params are outside of the ML model so let's fix them to some convenient values for now
    dunno_range_min = [0.15, 0.2, 0.25, 0.3, 0.35]
    dunno_range_max = [0.65, 0.7, 0.75, 0.8, 0.85]
    param_combinations = get_combinations([dunno_range_min, dunno_range_max])



    def modeling_function(param_combination):
        def sampling_function_per_try(dataset):
            sample = subsample_per_class(dataset, outcome_column, {'autism':bootstrapping_sample_percent, 'not':bootstrapping_sample_percent})
            return sample
        def ml_function_per_try(dataset_per_try):
            sample_weights_per_try = balance_dataset_on_dimensions(dataset_per_try, balance_dimensions)
            sample_cross_validation_age_weights = balance_dataset_on_dimensions(dataset_per_try, ['age_years'], verbose=False)
            #AS A HACK, here we bump 3 years olds sample weights by a factor of 3 relative to the 4 year olds and 2 year olds
            if only_young_children==True:
                for index in range(len(dataset_per_try)):
                    if dataset_per_try['age_years'][index]==3:
                        sample_weights_per_try[index] *= 3
                    if dataset_per_try['age_years'][index]==2:
                        sample_weights_per_try[index] *= 2
            if only_young_children==False:
                metrics = cross_validate_model(dataset_per_try, sample_weights_per_try, feature_columns, feature_encoding_map, outcome_column, (param_combination[0], param_combination[1]), n_folds, outcome_classes, outcome_class_priors, RandomForestClassifier,   validation_weights=sample_cross_validation_age_weights,  criterion = criterion, max_features = max_features,  max_depth = max_depth, n_estimators = n_estimators)

            else:
                metrics = cross_validate_model_with_addon(dataset_per_try, sample_weights_per_try, feature_columns, feature_encoding_map, outcome_column, (param_combination[0], param_combination[1]), n_folds, outcome_classes, outcome_class_priors, RandomForestClassifier,  validation_weights=sample_cross_validation_age_weights, criterion = criterion, max_features = max_features,  max_depth = max_depth, n_estimators = n_estimators)

            return metrics['overall_metrics']

        averaged_metrics =  bootstrap(dataset, bootstrapping_number_of_tries, sampling_function_per_try, ml_function_per_try)
        print 'For param_combination: ', param_combination, ', coverage: ', averaged_metrics['with_dunno']['reallife_classification_rate'],\
                 ', autism recall: ', averaged_metrics['with_dunno']['dataset_recall_per_class_where_classified']['autism'], ', not recall: ',\
                  averaged_metrics['with_dunno']['dataset_recall_per_class_where_classified']['not'], ', auc: ', averaged_metrics['with_dunno']['auc']
        return averaged_metrics



    reporting_function = lambda param_combination, averaged_metrics: [ averaged_metrics['with_dunno']['dataset_classification_rate'],
                                                              averaged_metrics['with_dunno']['reallife_classification_rate'],
                                                              averaged_metrics['with_dunno']['dataset_precision_per_class']['autism'],
                                                              averaged_metrics['with_dunno']['reallife_precision_per_class']['autism'],
                                                              averaged_metrics['with_dunno']['dataset_recall_per_class']['autism'],
                                                              averaged_metrics['with_dunno']['dataset_precision_per_class']['not'],
                                                              averaged_metrics['with_dunno']['reallife_precision_per_class']['not'],
                                                              averaged_metrics['with_dunno']['dataset_recall_per_class']['not'],
                                                              averaged_metrics['with_dunno']['auc']
                                                            ]




    #run grid search
    report = grid_search(modeling_function, param_combinations, reporting_function)

    #write outputs to file
    output_file = open(output_filename,'w')
    header = '\t'.join(['dunno_range_min','dunno_range_max', 
                    'classification rate [Dataset]',
                    'classification rate [Reallife]',
                    'autism precision [Dataset]', 
                    'autism precision [Reallife]', 
                    'autism recall', 
                    'not precision [Dataset]', 
                    'not precision [Reallife]', 
                    'not recall',
                    'auc'])
    output_file.write(header+"\n")
    report = grid_search(modeling_function, param_combinations, reporting_function)
    for line in report:
        output_file.write('\t'.join([str(x) for x in line]))
        output_file.write("\n")
    output_file.close()




## Use optimal model parameters and optimal dunno range to cross validate model

In [None]:
print 'max_features: ', max_features
print 'criterion: ', criterion
print 'n_estimators: ', n_estimators

cross_validation_age_weights = balance_dataset_on_dimensions(dataset, ['age_years'], verbose=False)

if only_young_children==False:
    output = cross_validate_model(dataset, sample_weights, feature_columns,
        feature_encoding_map, outcome_column, dunno_range, n_folds, outcome_classes, outcome_class_priors,
        RandomForestClassifier,  validation_weights=cross_validation_age_weights,  n_estimators = n_estimators, criterion = criterion, max_features = max_features,
        max_depth=max_depth, class_weight = class_weight)
else:
    output = cross_validate_model_with_addon(dataset, sample_weights, feature_columns,
        feature_encoding_map, outcome_column, dunno_range, n_folds, outcome_classes, outcome_class_priors,
        RandomForestClassifier, validation_weights=cross_validation_age_weights, n_estimators = n_estimators, criterion = criterion, max_features = max_features,
        max_depth=max_depth, class_weight = class_weight)

print_classifier_performance_metrics(outcome_classes, output['overall_metrics'])



## Now build the final model using all corresponding samples

In [None]:
#this is where we output to
#NOTE(review): this .model path is overwritten below before anything in this cell uses it,
#so the model itself is not written to disk here - confirm whether it is saved elsewhere
output_filename = output_directory+"/"+filename_prefix+".model"

#build model on the full dataset with the chosen hyper-parameters and dunno range
model, features, y_predicted_without_dunno, y_predicted_with_dunno, y_predicted_probs =\
    all_data_model(dataset, feature_columns, feature_encoding_map, outcome_column, sample_weights,
    dunno_range, RandomForestClassifier,  n_estimators = n_estimators, criterion = criterion,
    max_features = max_features, max_depth=max_depth, class_weight = class_weight)

#save features into a separate file
#(the "/" separator matches how the grid search output paths are built, so the file lands
#inside output_directory even when that setting has no trailing slash)
output_filename = output_directory+"/"+filename_prefix+".features.txt"

#pair every feature with its importance and rank them from most to least important
ordered_features = sorted(zip(features, model.feature_importances_), key=lambda x: x[1], reverse=True)

#the with-block closes the file even if one of the writes raises
with open(output_filename,'w') as output_file:
    #first section: the part of each feature name before '=', listed once, at the rank of its first occurrence
    output_file.write("QUESTIONS BY IMPORTANCE:\n\n")
    written_already = []
    for feature in [x[0].split('=')[0] for x in ordered_features]:
        if feature not in written_already:
            written_already += [feature]
            output_file.write(feature)
            output_file.write("\n")

    #second section: every feature with its importance, tab separated
    output_file.write("\n\nFEATURES BY IMPORTANCE:\n\n")
    for pair in ordered_features:
        output_file.write(str(pair[0])+"\t"+str(pair[1]))
        output_file.write("\n")


