Project: Forecasting Patient Enrolment for Clinical Trials

Supervisor: Niklas Frühauf, Sovanta

Authors:
*   Luka Biedebach
*   Weiyi Chen
*   Giang Hoang
*   Carolin Holtermann
*   Stefan Sousa

# Setup Google Drive

In [2]:
# if you change something or use other transformed input, please change version!!
__VERSION__ = "1.7.1"

""" HISTORY
1.0: 25.8.20
1.1: 27.08.20: Save original space_search['model'] into result
1.1.1: 01.09.20: Refactor classifier. The result should be comparable with version 1.1
1.2: 01.09.20: Change Timeoutlierremover
1.2.1: 01.09.20: Refactor classifier
1.3: 04.09.2020: Change splitting method to time series split
1.3.1: 05.09.2020: Use TimeOutlierRemover before splitting data
1.4: Add Regression for Target Encoding
1.5: Add new feature selector & remove useless column (07.09.20)
1.6.: Fixed type problem
1.7: Fixed holdout set bug in classification
1.7.1: change oder of scaling and feature selection. Scaling before feature selection
1.7.2: fix bugs in space params
"""

" HISTORY\n1.0: 25.8.20\n1.1: 27.08.20: Save original space_search['model'] into result\n1.1.1: 01.09.20: Refactor classifier. The result should be comparable with version 1.1\n1.2: 01.09.20: Change Timeoutlierremover\n1.2.1: 01.09.20: Refactor classifier\n1.3: 04.09.2020: Change splitting method to time series split\n1.3.1: 05.09.2020: Use TimeOutlierRemover before splitting data\n1.4: Add Regression for Target Encoding\n1.5: Add new feature selector & remove useless column (07.09.20)\n1.6.: Fixed type problem\n1.7: Fixed holdout set bug in classification\n1.7.1: change oder of scaling and feature selection. Scaling before feature selection\n1.7.2: fix bugs in space params\n"

In [3]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/My\ Drive/Team\ Project\ (Sovanta)/Google\ Collab/

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
4/4AEHLIk_DtKaxfW1jiVgp38TdDP1MBcn0i2h6rXvS06P8slQypyEXt4
Mounted at /content/drive
/content/drive/My Drive/Team Project (Sovanta)/Google Collab


# Imports

In [4]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import mixture
import xgboost as xgb
import gc
from hyperopt import hp, tpe, Trials, STATUS_OK, fmin
from hyperopt.pyll.stochastic import sample
from hyperopt.mongoexp import MongoTrials
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler, KBinsDiscretizer
from sklearn.metrics import mean_absolute_error
from sklearn.base import BaseEstimator, TransformerMixin
import sklearn.decomposition as decomposition
import statsmodels.api as sm
from sklearn.pipeline import Pipeline
import time
from pymongo import MongoClient

from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from itertools import *
# from sklearn.mixture import GaussianMixture

#optional but advised
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)  # show all columns of dataframe

  import pandas.util.testing as tm


# Optional transformers

## TimeOutlierRemover Transformer

In [5]:
#Custom transformer that transforms data set to only contain data after the timeperiod defined
class TimeOutlierRemover( BaseEstimator, TransformerMixin ):
    #Class constructor method that takes in a list of values as its argument
    def __init__(self, startYear = 1995, endYear = 2018):
        # startDate = datetime(startYear, 1, 1)
        # endDate = datetime(endYear, 12, 31)
        self._startYear = startYear
        self._endYear = endYear

    def fit( self, X, y = None ):
        return self
    
    def transform(self, X , y = None ):
        X_new = X.copy()
        X_new = X_new[(X_new['StartYear'] >= self._startYear) & (X_new['StartYear'] <= self._endYear)]
        X_new = X_new.reset_index(drop=True)
        return X_new

## EnrollmentOutlierRemover Transformer

In [6]:
#Custom transformer that transforms data set to remove outliers in the EnrollmentCount
# !!! Must be called after OHE
class EnrollmentOutlierRemover( BaseEstimator, TransformerMixin ):
    #Class constructor method that takes in a list of values as its argument
    def __init__(self, strategy = "IQR"):
        self._strategy = strategy
        self._range = {}
        self._phases = ['phase1', 'phase1_2', 'phase2', 'phase2_3', 'phase3']
        self._phase_series = {}

    def fit(self, X, y = None ):
        # Get data of the different phases
        
        self._phase_series['phase1'] = X.loc[(X['Phase=Phase 1'] == 1) & (X["Phase=Phase 2"] == 0)]['EnrollmentCount_new']
        self._phase_series['phase1_2'] = X.loc[(X['Phase=Phase 1'] == 1) & (X["Phase=Phase 2"] == 1)]['EnrollmentCount_new']
        self._phase_series['phase2'] = X.loc[(X['Phase=Phase 1'] == 0) & (X["Phase=Phase 2"] == 1) & (X["Phase=Phase 3"] == 0)]['EnrollmentCount_new']
        self._phase_series['phase2_3'] = X.loc[ (X["Phase=Phase 2"] == 1) & (X["Phase=Phase 3"] == 1)]['EnrollmentCount_new']
        self._phase_series['phase3'] = X.loc[ (X["Phase=Phase 2"] == 0) & (X["Phase=Phase 3"] == 1)]['EnrollmentCount_new']
        
        # Calculcate outlier ranges
        if self._strategy == "IQR":
            for i in self._phases:
                IQR = self._phase_series[i].quantile(0.75) - self._phase_series[i].quantile(0.25)
                self._range[i] = [self._phase_series[i].median() - (1.5 * IQR), self._phase_series[i].median() + (1.5 * IQR)]
                
                
        if self._strategy == "MAD":
            for i in self._phases:
                MAD = self._phase_series[i].mad()
                self._range[i] = [self._phase_series[i].median() - (2 * MAD), self._phase_series[i].median() + (2 * MAD)]
        return self

    
    def transform(self, X , y = None ):
        X_new = X.copy()
        for index, row in X_new.iterrows():
            if row['Phase=Phase 1'] == 1 and row['Phase=Phase 2'] == 0:
                if row['EnrollmentCount_new'] < self._range['phase1'][0] or row['EnrollmentCount_new'] > self._range['phase1'][1]:
                    X_new.drop([index], inplace = True)
            if row['Phase=Phase 1'] == 1 and row['Phase=Phase 2'] == 1:
                if row['EnrollmentCount_new'] < self._range['phase1_2'][0] or row['EnrollmentCount_new'] > self._range['phase1_2'][1]:
                    X_new.drop([index], inplace = True)
            if row['Phase=Phase 1'] == 0 and row['Phase=Phase 2'] == 1 and row['Phase=Phase 3'] == 0:
                if row['EnrollmentCount_new'] < self._range['phase2'][0] or row['EnrollmentCount_new'] > self._range['phase2'][1]:
                    X_new.drop([index], inplace = True)
            if row['Phase=Phase 2'] == 1 and row['Phase=Phase 3'] == 1:
                if row['EnrollmentCount_new'] < self._range['phase2_3'][0] or row['EnrollmentCount_new'] > self._range['phase2_3'][1]:
                    X_new.drop([index], inplace = True)
            if row['Phase=Phase 2'] == 0 and row['Phase=Phase 3'] == 1:
                if row['EnrollmentCount_new'] < self._range['phase3'][0] or row['EnrollmentCount_new'] > self._range['phase3'][1]:
                    X_new.drop([index], inplace = True)
        return X_new

## Transformers for feature selection

In [7]:
from sklearn.model_selection import train_test_split
# memory management
import gc
# utilities
from itertools import chain

from sklearn.ensemble import RandomForestRegressor

class FeatureSelectorTransformer( BaseEstimator, TransformerMixin ):
    """
    Class for performing feature selection for machine learning or data preprocessing.

    Implements five different methods to identify features for removal 
    
        1. Find columns with a single unique value
        2. Find collinear variables with a correlation greater than a specified correlation coefficient
        3. Find features with 0.0 feature importance from a gradient boosting machine (lgbm)
        4. Find low importance features that do not contribute to a specified cumulative feature importance from the gbm
        
    Parameters
    --------
        target : array or series, default = None
            Array of target labels for training the machine learning model to find feature importances. 
            These can be either binary labels (if ml_task is 'classification') or 
            continuous targets (if ml_task is 'regression').
            If no labels are provided, then the feature importance based methods are not available.
        correlation_threshold : float between 0 and 1
            Value of the Pearson correlation cofficient for identifying correlation features
            Percentage of feature correlation above which a feature is eligible for exclusion.
            Default value = 0.98
        ml_task : string
            The machine learning task, either 'classification' or 'regression'
            Default value = "regression"
        cumulative_importance : float between 0 and 1
            The fraction of cumulative importance to account for
            
        
    Attributes
    --------
    
    ops : dict
        Dictionary of operations run and features identified for removal
        
    unique_stats : dataframe
        Number of unique values for all features
    
    record_single_unique : dataframe
        Records the features that have a single unique value
        
    corr_matrix : dataframe
        All correlations between all features in the data
    
    record_collinear : dataframe
        Records the pairs of collinear variables with a correlation coefficient above the threshold
        
    feature_importances : dataframe
        All feature importances from the gradient boosting machine
    
    record_zero_importance : dataframe
        Records the zero importance features in the data according to the gbm
    
    record_low_importance : dataframe
        Records the lowest importance features not needed to reach the threshold of cumulative importance according to the gbm
    
    
    Notes
    --------
    
        - All 5 operations can be run with the `identify_all` method.
    
    """
    
    #Class constructor method that takes in a list of values as its argument
    def __init__(self, 
                 target, 
                 correlation_threshold = 0.98,
                 ml_task = "regression",
                 cumulative_importance = 0.99
                ):
        
        self.one_hot_features = None
        self.ml_task = ml_task
        
        # Dataframes recording information about features to remove
        self.record_single_unique = None
        self.record_collinear = None
        self.record_zero_importance = None
        self.record_low_importance = None
        
        self.unique_stats = None
        self.corr_matrix = None
        self.feature_importances = None
        
        # Dictionary to hold removal operations
        self.ops = {}
        
        self.one_hot_correlated = False
        
        self.target = target
        
        # Thresholds
        self.correlation_threshold = correlation_threshold
        self.cumulative_importance = cumulative_importance

        
    def fit( self, X, y = None ):
        self.base_features = list(X.columns)
        return self


    def identify_single_unique(self, X):
        """Finds features with only a single unique value. NaNs do not count as a unique value. """

        # Calculate the unique counts in each column
        unique_counts = X.nunique()
        self.unique_stats = pd.DataFrame(unique_counts).rename(columns = {'index': 'feature', 0: 'nunique'})
        self.unique_stats = self.unique_stats.sort_values('nunique', ascending = True)
        
        # Find the columns with only one unique count
        record_single_unique = pd.DataFrame(unique_counts[unique_counts == 1]).reset_index().rename(columns = {'index': 'feature', 
                                                                                                                0: 'nunique'})

        to_drop = list(record_single_unique['feature'])
    
        self.record_single_unique = record_single_unique
        self.ops['single_unique'] = to_drop
        
        print('%d features with a single unique value.\n' % len(self.ops['single_unique']))
        
    
    def identify_collinear(self, X):
        """
        Finds collinear features based on the correlation coefficient between features. 
        For each pair of features with a correlation coefficient greather than `correlation_threshold`,
        only one of the pair is identified for removal. 
        Using code adapted from: https://chrisalbon.com/machine_learning/feature_selection/drop_highly_correlated_features/
        
        """
        
        # Calculate the correlations between every column
        self.corr_matrix = X.corr()
    
        # Extract the upper triangle of the correlation matrix
        upper = self.corr_matrix.where(np.triu(np.ones(self.corr_matrix.shape), k = 1).astype(np.bool))
        
        # Select the features with correlations above the threshold
        # Need to use the absolute value
        to_drop = [column for column in upper.columns if any(upper[column].abs() > self.correlation_threshold)]

        # Dataframe to hold correlated pairs
        record_collinear = pd.DataFrame(columns = ['drop_feature', 'corr_feature', 'corr_value'])

        # Iterate through the columns to drop to record pairs of correlated features
        for column in to_drop:

            # Find the correlated features
            corr_features = list(upper.index[upper[column].abs() > self.correlation_threshold])

            # Find the correlated values
            corr_values = list(upper[column][upper[column].abs() > self.correlation_threshold])
            drop_features = [column for _ in range(len(corr_features))]    

            # Record the information (need a temp df for now)
            temp_df = pd.DataFrame.from_dict({'drop_feature': drop_features,
                                             'corr_feature': corr_features,
                                             'corr_value': corr_values})

            # Add to dataframe
            record_collinear = record_collinear.append(temp_df, ignore_index = True)

        self.record_collinear = record_collinear
        self.ops['collinear'] = to_drop
        
        print('%d features with a correlation magnitude greater than %0.2f.\n' % (len(self.ops['collinear']), self.correlation_threshold))

        
    def identify_zero_importance(self, X, eval_metric=None, 
                                 n_iterations=10, early_stopping = True):
        """
        
        Identify the features with zero importance according to a gradient boosting machine.
        The gbm can be trained with early stopping using a validation set to prevent overfitting. 
        The feature importances are averaged over `n_iterations` to reduce variance. 
        
        Uses the LightGBM implementation (http://lightgbm.readthedocs.io/en/latest/index.html)
        Parameters 
        --------
        eval_metric : string
            Evaluation metric to use for the gradient boosting machine for early stopping. Must be
            provided if `early_stopping` is True
        n_iterations : int, default = 10
            Number of iterations to train the gradient boosting machine
        early_stopping : boolean, default = True
            Whether or not to use early stopping with a validation set when training
        
        
        Notes
        --------
        - The gbm is not optimized for any particular task and might need some hyperparameter tuning
        - Feature importances, including zero importance features, can change across runs
        """

        if early_stopping and eval_metric is None:
            raise ValueError("""eval metric must be provided with early stopping. Examples include "auc" for classification or
                             "l2" for regression.""")
            
        if self.target is None:
            raise ValueError("No training labels provided.")
        
        feature_names = list(X.columns)

        # Convert to np array
        features = np.array(X)
        target = np.array(self.target).reshape((-1, ))

        # Empty array for feature importances
        feature_importance_values = np.zeros(len(feature_names))
        
        print('Training Gradient Boosting Model\n')
            
        # Iterate through each fold
        for _ in range(n_iterations):

            if self.ml_task == 'classification':
                model = lgb.LGBMClassifier(n_estimators = 1000, 
                                           learning_rate = 0.05, 
                                           verbose = -1)

            elif self.ml_task == 'regression':
                model = lgb.LGBMRegressor(boosting_type = 'goss',
                                          colsample_bytree = 0.79,
                                          max_bin = 103,
                                          num_leaves = 40,
                                          n_estimators = 1000, 
                                          learning_rate = 0.13, 
                                          max_depth = 56,
                                          min_child_weight = 1.85,
                                          min_split_gain = 4.3,
                                          objective = "tweedie",
                                          subsample = 0.8,
                                          verbose = -1)              
                
                

            else:
                raise ValueError('Task must be either "classification" or "regression"')

            # If training using early stopping need a validation set
            if early_stopping:

                train_features, valid_features, train_labels, valid_labels = train_test_split(features, 
                                                                                              target, 
                                                                                              test_size = 0.15, 
                                                                                              random_state=42)

                # Train the model with early stopping
                model.fit(train_features, train_labels, eval_metric = eval_metric,
                          eval_set = [(valid_features, valid_labels)],
                          early_stopping_rounds = 100, verbose = -1)

                # Clean up memory
                gc.enable()
                del train_features, train_labels, valid_features, valid_labels
                gc.collect()

            else:
                print("Fit LGBM Model with best params")
                model.fit(features, target)
                print("Done fitting")

            # Record the feature importances
            feature_importance_values += model.feature_importances_ / n_iterations
                
    

        feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})
                      

        # Sort features according to importance
        feature_importances = feature_importances.sort_values('importance', ascending = False).reset_index(drop = True)

        # Normalize the feature importances to add up to one
        feature_importances['normalized_importance'] = feature_importances['importance'] / feature_importances['importance'].sum()
        feature_importances['cumulative_importance'] = np.cumsum(feature_importances['normalized_importance'])

        # Extract the features with zero importance
        record_zero_importance = feature_importances[feature_importances['importance'] == 0.0]
        
        to_drop = list(record_zero_importance['feature'])

        self.feature_importances = feature_importances
        self.record_zero_importance = record_zero_importance
        self.ops['zero_importance'] = to_drop
                      
        print("Feature importances:")
        print(self.feature_importances)
        
        print('\n%d features with zero importance.\n' % len(self.ops['zero_importance']))
                     
            
    def identify_low_importance(self, X):
        """
        Finds the lowest importance features not needed to account for `cumulative_importance` fraction
        of the total feature importance from the gradient boosting machine. As an example, if cumulative
        importance is set to 0.95, this will retain only the most important features needed to 
        reach 95% of the total feature importance. The identified features are those not needed.
        """
        
        # The feature importances need to be calculated before running
        if self.feature_importances is None:
            raise NotImplementedError("""Feature importances have not yet been determined. 
                                         Call the `identify_zero_importance` method first.""")
            
        # Make sure most important features are on top
        self.feature_importances = self.feature_importances.sort_values('cumulative_importance')

        # Identify the features not needed to reach the cumulative_importance
        record_low_importance = self.feature_importances[self.feature_importances['cumulative_importance'] > 
                                                         self.cumulative_importance]

        to_drop = list(record_low_importance['feature'])

        self.record_low_importance = record_low_importance
        self.ops['low_importance'] = to_drop
    
        print('%d features required for cumulative importance of %0.2f.' % 
              (len(self.feature_importances) - len(self.record_low_importance), self.cumulative_importance))
        print('%d features do not contribute to cumulative importance of %0.2f.\n' % (len(self.ops['low_importance']),
                                                                                               self.cumulative_importance))
        
    def remove(self, X, methods):
        """
        Remove the features from the data according to the specified methods.
        
        Parameters
        --------
            methods : 'all' or list of methods
                If methods == 'all', any methods that have identified features will be used
                Otherwise, only the specified methods will be used.
                Can be one of ['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance']
                
        Return
        --------
            data : dataframe
                Dataframe with identified features removed
        
        """
        
        
        features_to_drop = []
      
        if methods == 'all':
                                                      
            print('{} methods have been run\n'.format(list(self.ops.keys())))
            
            # Find the unique features to drop
            features_to_drop = set(list(chain(*list(self.ops.values()))))
            
        else:                
            # Iterate through the specified methods
            for method in methods:
                
                # Check to make sure the method has been run
                if method not in self.ops.keys():
                    raise NotImplementedError('%s method has not been run' % method)
                    
                # Append the features identified for removal
                else:
                    features_to_drop.append(self.ops[method])
        
            # Find the unique features to drop
            features_to_drop = set(list(chain(*features_to_drop)))
            
        features_to_drop = list(features_to_drop)
            
       
        # Remove the features and return the data
        X.drop(columns = features_to_drop, inplace = True)
        self.removed_features = features_to_drop
        
        print('Removed %d features.' % len(features_to_drop))
        
        return X

    
    def transform(self, X , y = None ):
        
        X_new = X.copy()

        #1. Find columns with a single unique value
        self.identify_single_unique(X_new)
        
        #2. Find collinear variables with a correlation greater than a specified correlation coefficient
        # Identify collinear features
        self.identify_collinear(X = X_new)        
        # list of collinear features to remove
        collinear_features = self.ops['collinear']
        # dataframe of collinear features
        self.record_collinear.head()        
        
        #3. Find features with 0.0 feature importance from a gradient boosting machine (gbm)
        # Pass in the appropriate parameters
        self.identify_zero_importance(X = X_new,
                                      eval_metric = 'auc',
                                      n_iterations = 10,
                                      early_stopping = False)
        # list of zero importance features
        zero_importance_features = self.ops['zero_importance']
        
        #4. Find low importance features that do not contribute to a specified cumulative feature importance from the gbm
        self.identify_low_importance(X = X_new)
        self.feature_importances.head(10)
        
        # Remove the features from all methods 
        # To also remove the one-hot encoded features that are created during machine learning, set "keep_one_hot" to True
        X_new = self.remove(X = X_new, methods = 'all')
                          

        return X_new

# Read csv file

In [8]:
new_data = pd.read_csv('pipeline_output.csv', sep=";")
new_data.drop(columns = 'Unnamed: 0', inplace = True)
new_data.head()

Unnamed: 0,EnrollmentDuration,DesignAllocation=Non-Randomized,DesignAllocation=Randomized,DesignInterventionModel=Crossover Assignment,DesignInterventionModel=Factorial Assignment,DesignInterventionModel=Parallel Assignment,DesignInterventionModel=Sequential Assignment,DesignInterventionModel=Single Group Assignment,DesignPrimaryPurpose=Basic Science,DesignPrimaryPurpose=Device Feasibility,DesignPrimaryPurpose=Diagnostic,DesignPrimaryPurpose=Educational/Counseling/Training,DesignPrimaryPurpose=Health Services Research,DesignPrimaryPurpose=Other,DesignPrimaryPurpose=Prevention,DesignPrimaryPurpose=Screening,DesignPrimaryPurpose=Supportive Care,DesignPrimaryPurpose=Treatment,EnrollmentType=Actual,EnrollmentType=Anticipated,Gender=All,Gender=Female,Gender=Male,HealthyVolunteers=Accepts Healthy Volunteers,HealthyVolunteers=No,IsFDARegulatedDevice=No,IsFDARegulatedDevice=Yes,IsFDARegulatedDrug=No,IsFDARegulatedDrug=Yes,OrgClass=FED,OrgClass=INDIV,OrgClass=INDUSTRY,OrgClass=NETWORK,OrgClass=NIH,OrgClass=OTHER,OrgClass=OTHER_GOV,OrgClass=UNKNOWN,Phase=Phase 2,Phase=Phase 1,Phase=Phase 4,Phase=Phase 3,Phase=Early Phase 1,StdAge=Child,StdAge=Adult,StdAge=Older Adult,CollaboratorClass=FED,CollaboratorClass=OTHER,CollaboratorClass=UNKNOWN,CollaboratorClass=AMBIG,CollaboratorClass=INDIV,CollaboratorClass=NETWORK,CollaboratorClass=INDUSTRY,CollaboratorClass=OTHER_GOV,CollaboratorClass=NIH,LeadSponsorClass=FED,LeadSponsorClass=OTHER,LeadSponsorClass=UNKNOWN,LeadSponsorClass=INDIV,LeadSponsorClass=NETWORK,LeadSponsorClass=INDUSTRY,LeadSponsorClass=OTHER_GOV,LeadSponsorClass=NIH,ConditionBrowseBranchAbbrev=BC16,ConditionBrowseBranchAbbrev=BC18,ConditionBrowseBranchAbbrev=BC24,ConditionBrowseBranchAbbrev=BC05,ConditionBrowseBranchAbbrev=BC03,ConditionBrowseBranchAbbrev=BC15,ConditionBrowseBranchAbbrev=BXS,ConditionBrowseBranchAbbrev=BC06,ConditionBrowseBranchAbbrev=BC19,ConditionBrowseBranchAbbrev=All,ConditionBrowseBranchAbbrev=BC17,ConditionBrowseBranchAbbrev=BC20,ConditionBrowseBranchAbbrev=BC01,ConditionBrowseBranchAbbrev=BC14,ConditionBrowseBranchAbbrev=BC26,ConditionBrowseBranchAbbrev=BC23,ConditionBrowseBranchAbbrev=BC04,ConditionBrowseBranchAbbrev=BC11,ConditionBrowseBranchAbbrev=BC25,ConditionBrowseBranchAbbrev=BXM,ConditionBrowseBranchAbbrev=BC08,ConditionBrowseBranchAbbrev=BC21,ConditionBrowseBranchAbbrev=BC10,ConditionBrowseBranchAbbrev=BC07,ConditionBrowseBranchAbbrev=BC09,ConditionBrowseBranchAbbrev=Rare,ConditionBrowseBranchAbbrev=BC02,InterventionType=Behavioral,InterventionType=Radiation,InterventionType=Device,InterventionType=Genetic,InterventionType=Combination Product,InterventionType=Diagnostic Test,InterventionType=Biological,InterventionType=Drug,InterventionType=Procedure,InterventionType=Other,InterventionType=Dietary Supplement,ArmGroupType=Placebo Comparator,ArmGroupType=No Intervention,ArmGroupType=Sham Comparator,ArmGroupType=Active Comparator,ArmGroupType=Other,ArmGroupType=Experimental,BaselineDenomCountGroupId=BG029,BaselineDenomCountGroupId=BG018,BaselineDenomCountGroupId=BG012,BaselineDenomCountGroupId=BG028,BaselineDenomCountGroupId=BG013,BaselineDenomCountGroupId=BG020,BaselineDenomCountGroupId=BG022,BaselineDenomCountGroupId=BG009,BaselineDenomCountGroupId=BG031,BaselineDenomCountGroupId=BG006,BaselineDenomCountGroupId=BG000,BaselineDenomCountGroupId=BG014,BaselineDenomCountGroupId=BG007,BaselineDenomCountGroupId=BG017,BaselineDenomCountGroupId=BG005,BaselineDenomCountGroupId=BG008,BaselineDenomCountGroupId=BG016,BaselineDenomCountGroupId=BG030,BaselineDenomCountGroupId=BG026,BaselineDenomCountGroupId=BG021,BaselineDenomCountGroupId=BG025,BaselineDenomCountGroupId=BG019,BaselineDenomCountGroupId=BG003,BaselineDenomCountGroupId=BG027,BaselineDenomCountGroupId=BG024,BaselineDenomCountGroupId=BG001,BaselineDenomCountGroupId=BG010,BaselineDenomCountGroupId=BG004,BaselineDenomCountGroupId=BG002,BaselineDenomCountGroupId=BG032,BaselineDenomCountGroupId=BG023,BaselineDenomCountGroupId=BG015,BaselineDenomCountGroupId=BG011,BaselineMeasureDispersionType=Full Range,BaselineMeasureDispersionType=Inter-Quartile Range,BaselineMeasureDispersionType=Standard Deviation,LocationCountry_top=Finland,LocationCountry_top=Norway,LocationCountry_top=Japan,"LocationCountry_top=Korea, Republic of",LocationCountry_top=Peru,LocationCountry_top=Bulgaria,LocationCountry_top=Philippines,LocationCountry_top=Slovakia,LocationCountry_top=Austria,LocationCountry_top=Argentina,LocationCountry_top=Russian Federation,LocationCountry_top=South Africa,LocationCountry_top=Czech Republic,LocationCountry_top=United Kingdom,LocationCountry_top=Hungary,LocationCountry_top=Serbia,LocationCountry_top=New Zealand,LocationCountry_top=Greece,LocationCountry_top=China,LocationCountry_top=Portugal,LocationCountry_top=Sweden,LocationCountry_top=Latvia,LocationCountry_top=Egypt,LocationCountry_top=Puerto Rico,LocationCountry_top=Thailand,LocationCountry_top=Netherlands,LocationCountry_top=India,LocationCountry_top=Australia,LocationCountry_top=Brazil,LocationCountry_top=Mexico,LocationCountry_top=Ukraine,LocationCountry_top=Romania,LocationCountry_top=Estonia,LocationCountry_top=Hong Kong,LocationCountry_top=Belgium,LocationCountry_top=Canada,LocationCountry_top=Spain,LocationCountry_top=Turkey,LocationCountry_top=Italy,LocationCountry_top=Chile,LocationCountry_top=Colombia,LocationCountry_top=United States,LocationCountry_top=France,LocationCountry_top=Singapore,LocationCountry_top=Israel,LocationCountry_top=Poland,LocationCountry_top=Switzerland,LocationCountry_top=Denmark,LocationCountry_top=Taiwan,LocationCountry_top=Germany,LocationCity_top=Richmond,LocationCity_top=Portland,LocationCity_top=Saint Louis,LocationCity_top=Rochester,LocationCity_top=Cincinnati,LocationCity_top=Durham,LocationCity_top=Kansas City,LocationCity_top=San Antonio,LocationCity_top=Ann Arbor,LocationCity_top=Jacksonville,LocationCity_top=Minneapolis,LocationCity_top=Madrid,LocationCity_top=Houston,LocationCity_top=Seattle,LocationCity_top=San Diego,LocationCity_top=Barcelona,LocationCity_top=Paris,LocationCity_top=Cleveland,LocationCity_top=Miami,LocationCity_top=Detroit,LocationCity_top=Charleston,LocationCity_top=Omaha,LocationCity_top=Indianapolis,LocationCity_top=Boston,LocationCity_top=Dallas,LocationCity_top=Toronto,LocationCity_top=Berlin,LocationCity_top=San Francisco,LocationCity_top=Las Vegas,LocationCity_top=Montreal,LocationCity_top=Columbus,LocationCity_top=London,LocationCity_top=Denver,LocationCity_top=Seoul,LocationCity_top=Birmingham,LocationCity_top=Tampa,LocationCity_top=Philadelphia,LocationCity_top=Chicago,LocationCity_top=Nashville,LocationCity_top=Pittsburgh,LocationCity_top=Atlanta,LocationCity_top=Austin,LocationCity_top=Moscow,LocationCity_top=Phoenix,LocationCity_top=Los Angeles,LocationCity_top=Salt Lake City,LocationCity_top=Oklahoma City,LocationCity_top=New York,LocationCity_top=Baltimore,LocationCity_top=Orlando,LocationFacility_top=Sanofi-Aventis Administrative Office,LocationFacility_top=Rush University Medical Center,LocationFacility_top=Memorial Sloan-Kettering Cancer Center,LocationFacility_top=University of Kansas Medical Center,LocationFacility_top=Brigham and Women's Hospital,LocationFacility_top=University of Michigan,LocationFacility_top=University of Chicago,LocationFacility_top=University of Pennsylvania,LocationFacility_top=Memorial Sloan Kettering Cancer Center,LocationFacility_top=University of Mississippi Medical Center,LocationFacility_top=Roswell Park Cancer Institute,LocationFacility_top=Samsung Medical Center,LocationFacility_top=Novartis Investigative Site,LocationFacility_top=University of Michigan Comprehensive Cancer Center,LocationFacility_top=University of Minnesota,LocationFacility_top=Cleveland Clinic,LocationFacility_top=University of Alabama at Birmingham,LocationFacility_top=Mayo Clinic Cancer Center,LocationFacility_top=University of Nebraska Medical Center,LocationFacility_top=Research Site,LocationFacility_top=Local Institution,LocationFacility_top=University of Texas MD Anderson Cancer Center,LocationFacility_top=Fox Chase Cancer Center,LocationFacility_top=CCOP - Wichita,LocationFacility_top=Northwestern University,LocationFacility_top=Novo Nordisk Investigational Site,LocationFacility_top=Children's Hospital of Philadelphia,LocationFacility_top=Columbia University Medical Center,LocationFacility_top=Mayo Clinic,"LocationFacility_top=National Institutes of Health Clinical Center, 9000 Rockville Pike",LocationFacility_top=Vanderbilt University Medical Center,LocationFacility_top=Emory University,LocationFacility_top=Duke University Medical Center,LocationFacility_top=University of Florida,LocationFacility_top=CCOP - Montana Cancer Consortium,LocationFacility_top=Asan Medical Center,LocationFacility_top=GSK Investigational Site,LocationFacility_top=Princess Margaret Hospital,LocationFacility_top=Henry Ford Hospital,LocationFacility_top=Dana-Farber Cancer Institute,LocationFacility_top=Hurley Medical Center,LocationFacility_top=Pfizer Investigational Site,LocationFacility_top=Baylor College of Medicine,LocationFacility_top=Beth Israel Deaconess Medical Center,LocationFacility_top=Fred Hutchinson Cancer Research Center,LocationFacility_top=Medical University of South Carolina,LocationFacility_top=Johns Hopkins University,LocationFacility_top=Seoul National University Hospital,LocationFacility_top=Washington University School of Medicine,LocationFacility_top=Massachusetts General Hospital,Condition_top=Myocardial Infarction,Condition_top=Type 2 Diabetes Mellitus,Condition_top=Acne Vulgaris,Condition_top=Gastric Cancer,Condition_top=Pain,Condition_top=Multiple Sclerosis,Condition_top=Breast Neoplasms,Condition_top=Type 1 Diabetes,Condition_top=Alzheimer Disease,"Condition_top=Diabetes Mellitus, Type 2",Condition_top=Obesity,"Condition_top=Unspecified Adult Solid Tumor, Protocol Specific",Condition_top=Head and Neck Cancer,"Condition_top=Diabetes Mellitus, Type 1",Condition_top=HIV Infection,Condition_top=Infertility,Condition_top=Lymphoma,Condition_top=Colorectal Cancer,Condition_top=Cystic Fibrosis,Condition_top=Anemia,Condition_top=Breast Cancer,Condition_top=Brain and Central Nervous System Tumors,Condition_top=Hepatocellular Carcinoma,Condition_top=Alcohol Dependence,Condition_top=Parkinson Disease,Condition_top=HIV-1 Infection,Condition_top=Actinic Keratosis,Condition_top=Heart Failure,Condition_top=Prostate Cancer,Condition_top=Acute Lymphoblastic Leukemia,Condition_top=Bipolar Disorder,Condition_top=HIV Infections,Condition_top=Amyotrophic Lateral Sclerosis,Condition_top=Parkinson's Disease,Condition_top=Chronic Obstructive Pulmonary Disease,Condition_top=Acute Myeloid Leukemia,Condition_top=Osteoporosis,Condition_top=Chronic Hepatitis C,Condition_top=Glioblastoma,Condition_top=Hepatitis C,Condition_top=Schizophrenia,Condition_top=Coronary Artery Disease,Condition_top=Diabetes Mellitus,"Condition_top=Carcinoma, Non-Small-Cell Lung",Condition_top=Ulcerative Colitis,Condition_top=Diabetic Macular Edema,Condition_top=Type 2 Diabetes,Condition_top=Cocaine Dependence,Condition_top=Chronic Kidney Disease,Condition_top=Glioblastoma Multiforme,Condition_top=Chronic Obstructive Pulmonary Disease (COPD),Condition_top=Smoking Cessation,Condition_top=Metastatic Colorectal Cancer,Condition_top=Epilepsy,Condition_top=Postoperative Pain,Condition_top=Melanoma,Condition_top=Stroke,Condition_top=Ocular Hypertension,Condition_top=Atopic Dermatitis,Condition_top=Non-Hodgkin's Lymphoma,Condition_top=Metastatic Breast Cancer,Condition_top=Multiple Myeloma and Plasma Cell Neoplasm,Condition_top=Alcoholism,Condition_top=Osteoarthritis,Condition_top=Healthy Volunteers,Condition_top=Leukemia,Condition_top=Multiple Myeloma,Condition_top=Crohn's Disease,Condition_top=Myelodysplastic Syndromes,Condition_top=Solid Tumors,Condition_top=Sarcoma,Condition_top=Alzheimer's Disease,Condition_top=Tuberculosis,Condition_top=Non-Small Cell Lung Cancer,Condition_top=Pancreatic Cancer,Condition_top=Fallopian Tube Cancer,Condition_top=Depression,Condition_top=Non Small Cell Lung Cancer,Condition_top=Bladder Cancer,Condition_top=Asthma,Condition_top=Non-small Cell Lung Cancer,Condition_top=Hypertension,Condition_top=Esophageal Cancer,Condition_top=Ovarian Cancer,"Condition_top=Pulmonary Disease, Chronic Obstructive",Condition_top=Rheumatoid Arthritis,Condition_top=Chronic Lymphocytic Leukemia,Condition_top=Malaria,Condition_top=Hypercholesterolemia,Condition_top=Lung Cancer,Condition_top=Healthy,Condition_top=Major Depressive Disorder,Condition_top=Kidney Cancer,Condition_top=Liver Cancer,Condition_top=Psoriasis,Condition_top=HIV,Condition_top=Cancer,Condition_top=Atrial Fibrillation,Condition_top=Diabetes,Condition_top=Myelodysplastic Syndrome,ConditionAncestorTerm_top=Neurodegenerative Diseases,ConditionAncestorTerm_top=Respiratory Tract Neoplasms,"ConditionAncestorTerm_top=Hepatitis, Viral, Human",ConditionAncestorTerm_top=Glucose Metabolism Disorders,ConditionAncestorTerm_top=Kidney Diseases,ConditionAncestorTerm_top=Intestinal Diseases,ConditionAncestorTerm_top=Pathologic Processes,ConditionAncestorTerm_top=Pain,ConditionAncestorTerm_top=Chemically-Induced Disorders,ConditionAncestorTerm_top=Colonic Diseases,ConditionAncestorTerm_top=Hematologic Diseases,ConditionAncestorTerm_top=Picornaviridae Infections,ConditionAncestorTerm_top=Digestive System Neoplasms,ConditionAncestorTerm_top=RNA Virus Infections,"ConditionAncestorTerm_top=Carcinoma, Bronchogenic",ConditionAncestorTerm_top=Gastrointestinal Diseases,ConditionAncestorTerm_top=Lentivirus Infections,"ConditionAncestorTerm_top=Hypersensitivity, Immediate",ConditionAncestorTerm_top=Rectal Diseases,ConditionAncestorTerm_top=Virus Diseases,ConditionAncestorTerm_top=Bone Marrow Diseases,"ConditionAncestorTerm_top=Genital Diseases, Female",ConditionAncestorTerm_top=Endocrine Gland Neoplasms,ConditionAncestorTerm_top=Immunologic Deficiency Syndromes,ConditionAncestorTerm_top=Hemorrhagic Disorders,ConditionAncestorTerm_top=Immunoproliferative Disorders,ConditionAncestorTerm_top=Lung Diseases,ConditionAncestorTerm_top=Gastrointestinal Neoplasms,ConditionAncestorTerm_top=Prostatic Diseases,ConditionAncestorTerm_top=Liver Diseases,ConditionAncestorTerm_top=Sexually Transmitted Diseases,"ConditionAncestorTerm_top=Sexually Transmitted Diseases, Viral",ConditionAncestorTerm_top=Intestinal Neoplasms,ConditionAncestorTerm_top=Flaviviridae Infections,ConditionAncestorTerm_top=Disease,ConditionAncestorTerm_top=Neurologic Manifestations,ConditionAncestorTerm_top=Cardiovascular Diseases,"ConditionAncestorTerm_top=Lung Diseases, Obstructive",ConditionAncestorTerm_top=Breast Diseases,"ConditionAncestorTerm_top=Neoplasms, Neuroepithelial",ConditionAncestorTerm_top=Retroviridae Infections,ConditionAncestorTerm_top=Neuromuscular Diseases,"ConditionAncestorTerm_top=Signs and Symptoms, Digestive",ConditionAncestorTerm_top=Diabetes Mellitus,ConditionAncestorTerm_top=Metabolic Diseases,ConditionAncestorTerm_top=Neoplasms by Histologic Type,"ConditionAncestorTerm_top=Neoplasms, Glandular and Epithelial",ConditionAncestorTerm_top=Urologic Diseases,ConditionAncestorTerm_top=Carcinoma,ConditionAncestorTerm_top=Otorhinolaryngologic Diseases,"ConditionAncestorTerm_top=Genital Diseases, Male",ConditionAncestorTerm_top=Connective Tissue Diseases,"ConditionAncestorTerm_top=Neoplasms, Germ Cell and Embryonal",ConditionAncestorTerm_top=Precancerous Conditions,ConditionAncestorTerm_top=Respiratory Hypersensitivity,ConditionAncestorTerm_top=Central Nervous System Diseases,ConditionAncestorTerm_top=Adenocarcinoma,ConditionAncestorTerm_top=Gonadal Disorders,ConditionAncestorTerm_top=Signs and Symptoms,ConditionAncestorTerm_top=Substance-Related Disorders,ConditionAncestorTerm_top=Nervous System Diseases,ConditionAncestorTerm_top=Eye Diseases,ConditionAncestorTerm_top=Endocrine System Diseases,ConditionAncestorTerm_top=Urogenital Neoplasms,ConditionAncestorTerm_top=Brain Diseases,"ConditionAncestorTerm_top=Genital Neoplasms, Female",ConditionAncestorTerm_top=Neuroectodermal Tumors,ConditionAncestorTerm_top=Lymphoproliferative Disorders,ConditionAncestorTerm_top=Respiratory Tract Diseases,ConditionAncestorTerm_top=Paraproteinemias,"ConditionAncestorTerm_top=Genital Neoplasms, Male",ConditionAncestorTerm_top=Pancreatic Diseases,ConditionAncestorTerm_top=Mental Disorders,ConditionAncestorTerm_top=Hypersensitivity,ConditionAncestorTerm_top=Bronchial Diseases,ConditionAncestorTerm_top=Blood Protein Disorders,ConditionAncestorTerm_top=Bone Diseases,ConditionAncestorTerm_top=Musculoskeletal Diseases,ConditionAncestorTerm_top=Thoracic Neoplasms,ConditionAncestorTerm_top=Vascular Diseases,ConditionAncestorTerm_top=Joint Diseases,ConditionAncestorTerm_top=Gastroenteritis,ConditionAncestorTerm_top=Behavioral Symptoms,ConditionAncestorTerm_top=Autoimmune Diseases,ConditionAncestorTerm_top=Rheumatic Diseases,ConditionAncestorTerm_top=Heart Diseases,ConditionAncestorTerm_top=Neoplasms,ConditionAncestorTerm_top=Neurocognitive Disorders,"ConditionAncestorTerm_top=Neoplasms, Nerve Tissue",ConditionAncestorTerm_top=Bronchial Neoplasms,ConditionAncestorTerm_top=Lymphatic Diseases,"ConditionAncestorTerm_top=Genetic Diseases, Inborn",ConditionAncestorTerm_top=Neoplasms by Site,ConditionAncestorTerm_top=Stomatognathic Diseases,ConditionAncestorTerm_top=Mood Disorders,ConditionAncestorTerm_top=Digestive System Diseases,ConditionAncestorTerm_top=Skin Diseases,ConditionAncestorTerm_top=Respiratory Tract Infections,ConditionAncestorTerm_top=Hemostatic Disorders,ConditionAncestorTerm_top=Immune System Diseases,Keyword_top=stage IV prostate cancer,Keyword_top=Treatment Naive,Keyword_top=stage III chronic lymphocytic leukemia,Keyword_top=efficacy,Keyword_top=Pain,Keyword_top=recurrent grade 1 follicular lymphoma,Keyword_top=recurrent marginal zone lymphoma,Keyword_top=pharmacokinetics,Keyword_top=stage IIIA breast cancer,Keyword_top=recurrent breast cancer,Keyword_top=depression,Keyword_top=Treatment Experienced,Keyword_top=stage IV chronic lymphocytic leukemia,Keyword_top=Obesity,Keyword_top=adult giant cell glioblastoma,Keyword_top=stage IIIB non-small cell lung cancer,Keyword_top=diabetes,Keyword_top=recurrent grade 2 follicular lymphoma,Keyword_top=Lymphoma,Keyword_top=secondary myelodysplastic syndromes,Keyword_top=Safety,Keyword_top=Breast Cancer,Keyword_top=adult gliosarcoma,Keyword_top=stage III adult diffuse large cell lymphoma,Keyword_top=stage IV non-small cell lung cancer,Keyword_top=children,Keyword_top=Prostate Cancer,Keyword_top=recurrent non-small cell lung cancer,Keyword_top=metastatic,Keyword_top=refractory chronic lymphocytic leukemia,Keyword_top=Avastin,Keyword_top=Cisplatin,Keyword_top=stage II multiple myeloma,Keyword_top=secondary acute myeloid leukemia,Keyword_top=Efficacy,Keyword_top=cancer,Keyword_top=CLL,Keyword_top=MDS,Keyword_top=recurrent small lymphocytic lymphoma,Keyword_top=asthma,Keyword_top=chemotherapy,Keyword_top=Hepatitis C,Keyword_top=Schizophrenia,Keyword_top=splenic marginal zone lymphoma,Keyword_top=stage IV breast cancer,Keyword_top=Dexamethasone,Keyword_top=Inflammation,Keyword_top=Phase II,Keyword_top=stage IIIC breast cancer,Keyword_top=Breast cancer,Keyword_top=Cyclophosphamide,Keyword_top=recurrent adult acute myeloid leukemia,Keyword_top=Epilepsy,Keyword_top=adult glioblastoma,Keyword_top=Pharmacokinetics,Keyword_top=recurrent adult diffuse small cleaved cell lymphoma,Keyword_top=refractory multiple myeloma,Keyword_top=NSCLC,Keyword_top=previously treated myelodysplastic syndromes,Keyword_top=Leukemia,Keyword_top=Multiple Myeloma,Keyword_top=Immunotherapy,Keyword_top=AML,Keyword_top=stage III multiple myeloma,Keyword_top=recurrent adult immunoblastic large cell lymphoma,Keyword_top=recurrent adult diffuse large cell lymphoma,Keyword_top=Rituximab,Keyword_top=COPD,Keyword_top=recurrent mantle cell lymphoma,Keyword_top=Chemotherapy,Keyword_top=Children,Keyword_top=stage II breast cancer,Keyword_top=recurrent adult diffuse mixed cell lymphoma,Keyword_top=de novo myelodysplastic syndromes,Keyword_top=extranodal marginal zone B-cell lymphoma of mucosa-associated lymphoid tissue,Keyword_top=adenocarcinoma of the prostate,Keyword_top=Refractory,Keyword_top=Depression,Keyword_top=stage IIIB breast cancer,Keyword_top=Treatment,Keyword_top=Asthma,Keyword_top=pain,Keyword_top=Hypertension,Keyword_top=breast cancer,Keyword_top=stage IV colon cancer,Keyword_top=Rheumatoid Arthritis,Keyword_top=treatment,Keyword_top=recurrent grade 3 follicular lymphoma,Keyword_top=nodal marginal zone B-cell lymphoma,Keyword_top=Major Depressive Disorder,Keyword_top=safety,Keyword_top=stage IV adult diffuse large cell lymphoma,Keyword_top=Psoriasis,Keyword_top=Metastatic,Keyword_top=HIV,Keyword_top=Cancer,"Keyword_top=unspecified adult solid tumor, protocol specific",Keyword_top=Bevacizumab,Keyword_top=recurrent prostate cancer,Keyword_top=Diabetes,StartMonth,StartYear,MaximumAgeValue,MinimumAgeValue,#DiffCondition,#DiffConditionAncestorTerm,#DiffCollaboratorClass,#EligiCriteria,#DiffLocationFacility,#DiffLocationCity,#DiffLocationCountry,#DiffArmGroupLabel,InterventionName_placebo,InterventionName_therapy,InterventionName_hydrochloride,InterventionName_dose,InterventionName_cyclophosphamide,InterventionName_paclitaxel,InterventionName_cisplatin,InterventionName_insulin,InterventionName_acid,InterventionName_docetaxel,InterventionName_gemcitabine,InterventionName_analysis,InterventionName_bevacizumab,InterventionName_oral,InterventionName_solution,InterventionName_carboplatin,InterventionName_sodium,InterventionName_radiation,InterventionName_gel,InterventionName_laboratory,OrgFullName_university,OrgFullName_center,OrgFullName_cancer,OrgFullName_inc,OrgFullName_hospital,OrgFullName_institute,OrgFullName_national,OrgFullName_medical,OrgFullName_health,OrgFullName_pharmaceuticals,OrgFullName_research,OrgFullName_clinical,OrgFullName_novartis,OrgFullName_ltd,OrgFullName_group,OrgFullName_glaxosmithkline,OrgFullName_pharma,OrgFullName_pfizer,OrgFullName_de,OrgFullName_astrazeneca,LeadSponsorName_university,LeadSponsorName_inc,LeadSponsorName_cancer,LeadSponsorName_institute,LeadSponsorName_pharmaceuticals,LeadSponsorName_center,LeadSponsorName_hospital,LeadSponsorName_national,LeadSponsorName_medical,LeadSponsorName_research,LeadSponsorName_health,LeadSponsorName_novartis,LeadSponsorName_ltd,LeadSponsorName_group,LeadSponsorName_pfizer,LeadSponsorName_de,LeadSponsorName_pharma,LeadSponsorName_glaxosmithkline,LeadSponsorName_therapeutics,LeadSponsorName_astrazeneca,CollaboratorName_national,CollaboratorName_institute,CollaboratorName_university,CollaboratorName_cancer,CollaboratorName_hospital,CollaboratorName_nci,CollaboratorName_research,CollaboratorName_health,CollaboratorName_inc,CollaboratorName_medical,CollaboratorName_center,CollaboratorName_pharmaceuticals,CollaboratorName_foundation,CollaboratorName_de,CollaboratorName_group,CollaboratorName_genentech,CollaboratorName_ltd,CollaboratorName_abuse,CollaboratorName_novartis,CollaboratorName_company,EligibilityCriteria_study,EligibilityCriteria_prior,EligibilityCriteria_criteria,EligibilityCriteria_patients,EligibilityCriteria_within,EligibilityCriteria_disease,EligibilityCriteria_history,EligibilityCriteria_must,EligibilityCriteria_treatment,EligibilityCriteria_months,EligibilityCriteria_therapy,EligibilityCriteria_least,EligibilityCriteria_inclusion,EligibilityCriteria_screening,EligibilityCriteria_use,EligibilityCriteria_days,EligibilityCriteria_exclusion,EligibilityCriteria_weeks,EligibilityCriteria_patient,ArmGroupDescription_day,ArmGroupDescription_dose,ArmGroupDescription_days,ArmGroupDescription_weeks,ArmGroupDescription_daily,ArmGroupDescription_placebo,ArmGroupDescription_patients,ArmGroupDescription_receive,ArmGroupDescription_treatment,ArmGroupDescription_week,ArmGroupDescription_participants,ArmGroupDescription_iv,ArmGroupDescription_every,ArmGroupDescription_administered,ArmGroupDescription_study,ArmGroupDescription_subjects,ArmGroupDescription_oral,ArmGroupDescription_twice,ArmGroupDescription_period,ArmGroupDescription_followed,ArmGroupInterventionName_drug,ArmGroupInterventionName_placebo,ArmGroupInterventionName_biological,ArmGroupInterventionName_procedure,ArmGroupInterventionName_radiation,ArmGroupInterventionName_therapy,ArmGroupInterventionName_dose,ArmGroupInterventionName_hydrochloride,ArmGroupInterventionName_cyclophosphamide,ArmGroupInterventionName_paclitaxel,ArmGroupInterventionName_behavioral,ArmGroupInterventionName_acid,ArmGroupInterventionName_analysis,ArmGroupInterventionName_insulin,ArmGroupInterventionName_oral,ArmGroupInterventionName_cisplatin,ArmGroupInterventionName_laboratory,ArmGroupInterventionName_biomarker,ArmGroupInterventionName_docetaxel,ArmGroupInterventionName_device,ArmGroupLabel_placebo,ArmGroupLabel_arm,ArmGroupLabel_group,ArmGroupLabel_dose,ArmGroupLabel_treatment,ArmGroupLabel_cohort,ArmGroupLabel_part,ArmGroupLabel_plus,ArmGroupLabel_c,ArmGroupLabel_control,ArmGroupLabel_high,ArmGroupLabel_phase,ArmGroupLabel_low,ArmGroupLabel_mcg,ArmGroupLabel_day,ArmGroupLabel_ii,ArmGroupLabel_bid,ArmGroupLabel_therapy,ArmGroupLabel_vehicle,ArmGroupLabel_daily,EventsTimeFrame_study,EventsTimeFrame_days,EventsTimeFrame_adverse,EventsTimeFrame_events,EventsTimeFrame_treatment,EventsTimeFrame_weeks,EventsTimeFrame_collected,EventsTimeFrame_dose,EventsTimeFrame_last,EventsTimeFrame_day,EventsTimeFrame_months,EventsTimeFrame_first,EventsTimeFrame_week,EventsTimeFrame_period,EventsTimeFrame_drug,EventsTimeFrame_visit,EventsTimeFrame_aes,EventsTimeFrame_time,EventsTimeFrame_end,FlowDropWithdrawType_subject,FlowDropWithdrawType_withdrawal,FlowDropWithdrawType_event,FlowDropWithdrawType_adverse,FlowDropWithdrawType_follow,FlowDropWithdrawType_lost,FlowDropWithdrawType_protocol,FlowDropWithdrawType_violation,FlowDropWithdrawType_death,FlowDropWithdrawType_lack,FlowDropWithdrawType_decision,FlowDropWithdrawType_efficacy,FlowDropWithdrawType_physician,FlowDropWithdrawType_study,FlowDropWithdrawType_disease,FlowDropWithdrawType_criteria,FlowDropWithdrawType_non,FlowDropWithdrawType_compliance,FlowDropWithdrawType_progression,FlowDropWithdrawType_treatment,FlowGroupDescription_day,FlowGroupDescription_dose,FlowGroupDescription_daily,FlowGroupDescription_days,FlowGroupDescription_weeks,FlowGroupDescription_placebo,FlowGroupDescription_participants,FlowGroupDescription_treatment,FlowGroupDescription_received,FlowGroupDescription_week,FlowGroupDescription_iv,FlowGroupDescription_patients,FlowGroupDescription_administered,FlowGroupDescription_period,FlowGroupDescription_study,FlowGroupDescription_every,FlowGroupDescription_receive,FlowGroupDescription_orally,FlowGroupDescription_oral,FlowGroupDescription_twice,FlowGroupTitle_placebo,FlowGroupTitle_dose,FlowGroupTitle_arm,FlowGroupTitle_group,FlowGroupTitle_cohort,FlowGroupTitle_phase,FlowGroupTitle_treatment,FlowGroupTitle_part,FlowGroupTitle_bid,FlowGroupTitle_day,FlowGroupTitle_first,FlowGroupTitle_plus,FlowGroupTitle_mcg,FlowGroupTitle_qd,FlowGroupTitle_sequence,FlowGroupTitle_ii,FlowGroupTitle_c,FlowGroupTitle_daily,FlowGroupTitle_level,FlowGroupTitle_control,FlowMilestoneType_completed,FlowMilestoneType_started,FlowMilestoneType_received,FlowMilestoneType_treated,FlowMilestoneType_population,FlowMilestoneType_treatment,FlowMilestoneType_set,FlowMilestoneType_safety,FlowMilestoneType_analysis,FlowMilestoneType_study,FlowMilestoneType_drug,FlowMilestoneType_week,FlowMilestoneType_treat,FlowMilestoneType_dose,FlowMilestoneType_eligible,FlowMilestoneType_full,FlowMilestoneType_intent,FlowMilestoneType_itt,FlowMilestoneType_protocol,FlowMilestoneType_least,FlowPeriodTitle_study,FlowPeriodTitle_overall,FlowPeriodTitle_period,FlowPeriodTitle_phase,FlowPeriodTitle_treatment,FlowPeriodTitle_weeks,FlowPeriodTitle_week,FlowPeriodTitle_intervention,FlowPeriodTitle_part,FlowPeriodTitle_days,FlowPeriodTitle_washout,FlowPeriodTitle_blind,FlowPeriodTitle_double,FlowPeriodTitle_follow,FlowPeriodTitle_open,FlowPeriodTitle_label,FlowPeriodTitle_extension,FlowPeriodTitle_day,FlowPeriodTitle_first,FlowPeriodTitle_second,FlowRecruitmentDetails_study,FlowRecruitmentDetails_participants,FlowRecruitmentDetails_patients,FlowRecruitmentDetails_enrolled,FlowRecruitmentDetails_sites,FlowRecruitmentDetails_recruited,FlowRecruitmentDetails_subjects,FlowRecruitmentDetails_conducted,FlowRecruitmentDetails_centers,FlowRecruitmentDetails_recruitment,FlowRecruitmentDetails_united,FlowRecruitmentDetails_randomized,FlowRecruitmentDetails_last,FlowRecruitmentDetails_period,FlowRecruitmentDetails_first,FlowRecruitmentDetails_treatment,FlowRecruitmentDetails_states,FlowRecruitmentDetails_total,FlowRecruitmentDetails_center,FlowRecruitmentDetails_patient,BaselineCategoryTitle_male,BaselineCategoryTitle_female,BaselineCategoryTitle_american,BaselineCategoryTitle_native,BaselineCategoryTitle_unknown,BaselineCategoryTitle_reported,BaselineCategoryTitle_hispanic,BaselineCategoryTitle_latino,BaselineCategoryTitle_white,BaselineCategoryTitle_asian,BaselineCategoryTitle_black,BaselineCategoryTitle_african,BaselineCategoryTitle_indian,BaselineCategoryTitle_alaska,BaselineCategoryTitle_pacific,BaselineCategoryTitle_islander,BaselineCategoryTitle_hawaiian,BaselineCategoryTitle_race,BaselineCategoryTitle_one,BaselineClassTitle_united,BaselineClassTitle_states,BaselineClassTitle_asian,BaselineClassTitle_white,BaselineClassTitle_american,BaselineClassTitle_african,BaselineClassTitle_black,BaselineClassTitle_hispanic,BaselineClassTitle_native,BaselineClassTitle_heritage,BaselineClassTitle_indian,BaselineClassTitle_canada,BaselineClassTitle_caucasian,BaselineClassTitle_latino,BaselineClassTitle_pacific,BaselineClassTitle_south,BaselineClassTitle_islander,BaselineClassTitle_race,BaselineGroupDescription_day,BaselineGroupDescription_dose,BaselineGroupDescription_daily,BaselineGroupDescription_weeks,BaselineGroupDescription_days,BaselineGroupDescription_total,BaselineGroupDescription_placebo,BaselineGroupDescription_participants,BaselineGroupDescription_groups,BaselineGroupDescription_reporting,BaselineGroupDescription_treatment,BaselineGroupDescription_received,BaselineGroupDescription_week,BaselineGroupDescription_iv,BaselineGroupDescription_patients,BaselineGroupDescription_administered,BaselineGroupDescription_receive,BaselineGroupDescription_every,BaselineGroupDescription_study,BaselineGroupDescription_orally,BaselineGroupTitle_total,BaselineGroupTitle_placebo,BaselineGroupTitle_arm,BaselineGroupTitle_dose,BaselineGroupTitle_group,BaselineGroupTitle_cohort,BaselineGroupTitle_phase,BaselineGroupTitle_treatment,BaselineGroupTitle_part,BaselineGroupTitle_bid,BaselineGroupTitle_day,BaselineGroupTitle_plus,BaselineGroupTitle_ii,BaselineGroupTitle_qd,BaselineGroupTitle_study,BaselineGroupTitle_c,BaselineGroupTitle_daily,BaselineGroupTitle_participants,BaselineGroupTitle_control,BaselineGroupTitle_therapy,BaselineMeasureTitle_age,BaselineMeasureTitle_sex,BaselineMeasureTitle_female,BaselineMeasureTitle_male,BaselineMeasureTitle_continuous,BaselineMeasureTitle_enrollment,BaselineMeasureTitle_region,BaselineMeasureTitle_race,BaselineMeasureTitle_nih,BaselineMeasureTitle_omb,BaselineMeasureTitle_ethnicity,BaselineMeasureTitle_customized,BaselineMeasureTitle_categorical,BaselineMeasureTitle_score,BaselineMeasureTitle_status,BaselineMeasureTitle_baseline,BaselineMeasureTitle_index,BaselineMeasureTitle_body,BaselineMeasureTitle_weight,BaselineMeasureTitle_gender,BaselineMeasureUnitOfMeasure_participants,BaselineMeasureUnitOfMeasure_scale,BaselineMeasureUnitOfMeasure_units,BaselineMeasureUnitOfMeasure_l,BaselineMeasureUnitOfMeasure_dl,BaselineMeasureUnitOfMeasure_ml,BaselineMeasureUnitOfMeasure_per,BaselineMeasureUnitOfMeasure_cm,BaselineMeasureUnitOfMeasure_percentage,BaselineMeasureUnitOfMeasure_score,BaselineMeasureUnitOfMeasure_mmol,BaselineMeasureUnitOfMeasure_mm,BaselineMeasureUnitOfMeasure_cells,BaselineMeasureUnitOfMeasure_months,BaselineMeasureUnitOfMeasure_days,BaselineMeasureUnitOfMeasure_kilograms,BaselineMeasureUnitOfMeasure_percent,BaselineMeasureUnitOfMeasure_kilogram,BaselineMeasureUnitOfMeasure_log,EnrollmentCount_new,MeshID1,MeshID2,MeshID3,AvgFacilityRank,AvgLocalAge,#Pts/#DiffLocationFacility,#Pts/#DiffLocationCountry,avg_population,min_population,max_population,main_country_population,avg_lifeExpectancy,min_lifeExpectancy,max_lifeExpectancy,main_country_lifeExpectancy,avg_GDP,min_GDP,max_GDP,main_country_GDP,avg_unemploymentRate,min_unemploymentRate,max_unemploymentRate,main_country_unemploymentRate,avg_hospitalBed,min_hospitalBed,max_hospitalBed,main_country_hospitalBed,avg_healthExpenditure,min_healthExpenditure,max_healthExpenditure,main_country_healthExpenditure,avg_density,min_density,max_density,main_country_density,avg_fertilityRate,min_fertilityRate,max_fertilityRate,main_country_fertilityRate,avg_medianAge,min_medianAge,max_medianAge,main_country_medianAge,avg_migrantsNet,min_migrantsNet,max_migrantsNet,main_country_migrantsNet,avg_sizeInKm2,min_sizeInKm2,max_sizeInKm2,main_country_sizeInKm2,avg_urbanPopulation,min_urbanPopulation,max_urbanPopulation,main_country_urbanPopulation,avg_worldshare,min_worldshare,max_worldshare,main_country_worldshare,avg_city_population,min_city_population,max_city_population
0,37,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,1997.0,65.0,13.0,2,8,0,1213,19,19,19,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0,1,1,0,1,1,0,0,1,1,0,0,0,1,0,0,1,0,0,0,0,1,0,1,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,1,1,0,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,61.0,17726,12173,0,26,335482643,3.210526,61.0,331002651.0,331002651.0,331002651.0,331002651.0,78.0,78.0,78.0,78.0,20544340000000.0,20544340000000.0,20544340000000.0,20544340000000.0,3.0,3.0,3.0,3.0,2.0,2.0,2.0,2.0,17.0,17.0,17.0,17.0,36.0,36.0,36.0,36.0,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,954806.0,954806.0,954806.0,954806.0,9147420.0,9147420.0,9147420.0,9147420.0,83.0,83.0,83.0,83.0,4.25,4.25,4.25,4.25,3588.069664,158.630005,11377.091797
1,172,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1999.0,6.0,18.0,1,8,1,400,1,1,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,419.0,550,0,0,0,1378800,419.0,419.0,331002651.0,331002651.0,331002651.0,331002651.0,78.0,78.0,78.0,78.0,20544340000000.0,20544340000000.0,20544340000000.0,20544340000000.0,3.0,3.0,3.0,3.0,2.0,2.0,2.0,2.0,17.0,17.0,17.0,17.0,36.0,36.0,36.0,36.0,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,954806.0,954806.0,954806.0,954806.0,9147420.0,9147420.0,9147420.0,9147420.0,83.0,83.0,83.0,83.0,4.25,4.25,4.25,4.25,1717.821167,1717.821167,1717.821167
2,39,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,1995.0,65.0,60.0,1,8,0,318,25,25,25,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,120.0,544,0,0,945,162165319,4.8,120.0,331002651.0,331002651.0,331002651.0,331002651.0,78.0,78.0,78.0,78.0,20544340000000.0,20544340000000.0,20544340000000.0,20544340000000.0,3.0,3.0,3.0,3.0,2.0,2.0,2.0,2.0,17.0,17.0,17.0,17.0,36.0,36.0,36.0,36.0,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,954806.0,954806.0,954806.0,954806.0,9147420.0,9147420.0,9147420.0,9147420.0,83.0,83.0,83.0,83.0,4.25,4.25,4.25,4.25,2541.599437,61.538631,11377.091797
3,93,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1995.0,60.0,18.0,2,3,1,1477,1,1,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1,1,1,1,0,0,1,0,1,1,0,1,0,1,1,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,111.0,4194,19966,19970,0,11720000,111.0,111.0,331002651.0,331002651.0,331002651.0,331002651.0,78.0,78.0,78.0,78.0,20544340000000.0,20544340000000.0,20544340000000.0,20544340000000.0,3.0,3.0,3.0,3.0,2.0,2.0,2.0,2.0,17.0,17.0,17.0,17.0,36.0,36.0,36.0,36.0,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,954806.0,954806.0,954806.0,954806.0,9147420.0,9147420.0,9147420.0,9147420.0,83.0,83.0,83.0,83.0,4.25,4.25,4.25,4.25,241.777069,241.777069,241.777069
4,123,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,1995.0,45.0,18.0,3,3,1,903,2,2,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,1,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8.0,0,0,0,0,23440000,4.0,8.0,331002651.0,331002651.0,331002651.0,331002651.0,78.0,78.0,78.0,78.0,20544340000000.0,20544340000000.0,20544340000000.0,20544340000000.0,3.0,3.0,3.0,3.0,2.0,2.0,2.0,2.0,17.0,17.0,17.0,17.0,36.0,36.0,36.0,36.0,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,954806.0,954806.0,954806.0,954806.0,9147420.0,9147420.0,9147420.0,9147420.0,83.0,83.0,83.0,83.0,4.25,4.25,4.25,4.25,7901.219238,7901.219238,7901.219238


# Common functions for preprocessing

Including:


*   Removing time outliers
*   Removing Enrollment Count Outliers
*   Add scalers
*   Feature selector / PCA



In [9]:
def removeTimeAndEnrollmentOutliers(model, df_train_X, df_train_y):
    """
    In this function, the outliers are removed by according numbers,
    the hypeopt will choose which one return the least loss.
    
    In time series split, it doesn't have impact on the end StartYear, since 
    the total df_train takes the StartYear until 2015.
    In random split method, the timeoutlier remover makes sense for the end StartYear.

    """
    
    train = pd.concat([df_train_X, df_train_y],axis=1)
    
    # Should we analyze some plot between year and mean/median of EnrollmentDuration in each year, for each Phase => Decide the time outlier?
    if model['timeoutlierremover'] == 1:
        instance = TimeOutlierRemover(2000, 2018)
        print('time1')        
    elif model['timeoutlierremover'] == 2:
        instance = TimeOutlierRemover(2005, 2018)
        print('time2')     
    elif model['timeoutlierremover'] == 3:
        instance = TimeOutlierRemover(2000, 2016)
        print('time3')
    elif model['timeoutlierremover'] == 4:
        instance = TimeOutlierRemover(2005, 2016)
        print('time4')
    else:
      instance = TimeOutlierRemover(1995, 2018)
      print('time0')

    
    train = instance.transform(train)

    if model['enrollmentoutlierremover'] != False:
        strategy = model['enrollmentoutlierremover']
        instance = EnrollmentOutlierRemover(strategy = strategy)
        train = instance.fit_transform(train)  
        print('count')

    train_y = train['EnrollmentDuration']
    train_X = train.drop(columns=['EnrollmentDuration'])

    del model['timeoutlierremover']
    del model['enrollmentoutlierremover']
    return train_X, train_y

def get_steps(model, val_X, train_X, train_y):
  steps = []
    

  """
  Scale has to be done before cutting down the dimensions using 'preproc_algo'. 
  The nested function allows putting another hp function inside.
  """
  if model['scale'] == 1:
      steps.append(('StandardScaler', StandardScaler()))
  elif model['scale'] == 2:
      steps.append(('MinMaxScaler', MinMaxScaler()))
  elif model['scale'] == 3:
      steps.append(('Normalizer', Normalizer()))
      
  if model['preproc_algo']['preproc_algo'] == 'PCA':
      whiten = model['preproc_algo']['whiten']
      steps.append(('PCA',  decomposition.PCA(whiten = whiten)))

    
  elif model['preproc_algo']['preproc_algo'] == 'feature_selector':
      instance = FeatureSelectorTransformer(target = train_y)
      train_X = instance.fit_transform(X = train_X)
      val_X = val_X[train_X.columns]
      print('fs')

  del model['preproc_algo']
  del model['scale']
  return steps, val_X, train_X

# For categorical encoding 
def feature_selector_cat(model, val_X, train_X, train_y):



  # Option steps: scale
  if model['scale'] == 1:
      sc = StandardScaler()
      scaled_train_X = sc.fit_transform(train_X)
      scaled_val_X = sc.transform (val_X)
      train_X = pd.DataFrame(scaled_train_X,index = train_X.index, columns = train_X.columns)
      val_X = pd.DataFrame(scaled_val_X,index = val_X.index, columns = val_X.columns)
      print('cat_scaler')

  elif model['scale'] == 2:
      sc = MinMaxScaler()
      scaled_train_X = sc.fit_transform(train_X)
      scaled_val_X = sc.transform (val_X)
      train_X = pd.DataFrame(scaled_train_X,index = train_X.index, columns = train_X.columns)
      val_X = pd.DataFrame(scaled_val_X,index = val_X.index, columns = val_X.columns)
      print('cat_minmaxscaler')

  elif model['scale'] == 3:
      sc = Normalizer()
      scaled_train_X = sc.fit_transform(train_X)
      scaled_val_X = sc.transform (val_X)
      train_X = pd.DataFrame(scaled_train_X,index = train_X.index, columns = train_X.columns)
      val_X = pd.DataFrame(scaled_val_X,index = val_X.index, columns = val_X.columns)
      print('cat_normalizer')

  if model['preproc_algo']['preproc_algo'] == 'feature_selector':
    instance = FeatureSelectorTransformer(target = train_y)
    train_X = instance.fit_transform(X = train_X)
    val_X = val_X[train_X.columns]
    print('fs_cat')
  del model['preproc_algo']
  del model['scale']
  return val_X, train_X

# Regression

## Create holdout set

In [10]:
#holdout set
#X = new_data.copy().drop(columns=['EnrollmentDuration'])
#Y = new_data.copy()['EnrollmentDuration']

#df_train_X, df_test_X, df_train_y, df_test_y = train_test_split(X, Y, test_size=0.1, random_state=42)
#import re
#df_train_X = df_train_X.rename(columns = lambda x:re.sub(':', '', x))

In [11]:
#create time series hold out set

#Use Time Series Split
X = new_data.sort_values(by="StartYear").copy()

X=X[X["StartYear"]>1995]
X=X[X["StartYear"]<2018]

print(f"Size of dataset: {len(X)} records")

Y= X["EnrollmentDuration"]
X = X.drop("EnrollmentDuration", axis =1)

#Time series split
train_size = int(len(X) * 0.9)
df_train_X, df_test_X = X[0:train_size], X[train_size:len(X)]
df_train_y, df_test_y = Y[0:train_size], Y[train_size:len(Y)]

#Info about the train and test set
print("Train Set Time Range: from ", df_train_X["StartYear"].min(), " to ", df_train_X["StartYear"].max())
print("Test Set Time Range: from ", df_test_X["StartYear"].min(), " to ", df_test_X["StartYear"].max(), "\n")
print("Median Duration Training set: ", df_train_y.median())
print("Median Duration Test set: ", df_test_y.median())




Size of dataset: 29258 records
Train Set Time Range: from  1996.0  to  2015.0
Test Set Time Range: from  2015.0  to  2017.0 

Median Duration Training set:  33.0
Median Duration Test set:  18.0


## Some parameters for searching space

In [None]:
# Available parameters for Random Forest: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
rf_criterion = ["mse", "mae"]
rf_max_features = ["auto"]
# preprocessing options
whitens = [False, True]


# Available parameters for XGB: https://xgboost.readthedocs.io/en/latest/parameter.html
boosting_list_xgb = ['gbtree', 'dart', 'gblinear']
verbosity_xgb = ["1"] # For debug only
sampling_method_xgb = ["uniform", "gradient_based"]
tree_method_xgb = ["auto", "exact", "approx", "hist"] #"gpu_hist" => Check failed: gpu_predictor_: 
grow_policy_xgb = ["depthwise", "lossguide"]
objective_list_xgb = ['reg:linear', 'reg:gamma', 'reg:tweedie'] # for linear only
tree_method = [{'tree_method' : 'exact'},
               {'tree_method' : 'approx'},
               {'tree_method' : 'hist',
                'max_bin': hp.quniform('max_bin', 2**3, 2**7, 1),
                'grow_policy' : {'grow_policy': {'grow_policy':'depthwise'},
                'grow_policy' : {'grow_policy':'lossguide' ,
                                  'max_leaves': hp.quniform('max_leaves', 32, 100, 1)}}}]
                  
# Available parameter for LightGBM
boosting_list = ['dart', 'goss', 'gbdt'] 
objective_list = ['huber', 'gamma', 'fair', 'tweedie']
LGBM_MAX_DEPTH = 25
EVAL_METRIC_LGBM_REG = 'mae'
EVAL_METRIC_LGBM_CLASS = 'auc'

## Set Searching Spaces

In [None]:
# set searching space
space_params = {
    'model': hp.choice('model_type', [

    { # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html <=> Random Forest
        'type': 'rf',
        'n_estimators': hp.choice('n_estimators_rf', range(25, 150)),
        'max_depth': hp.choice('max_depth_rf', range(5, 100)),
        'min_samples_split': hp.choice('min_samples_split', range(2, 100)),
        'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 100)),
        'max_leaf_nodes': hp.choice('max_leaf_nodes', range(10, 100)),
        'min_impurity_decrease': hp.quniform('min_impurity_decrease', 0, 0.4, 0.1),
        'n_jobs': -1 ,  #-1 means using all processors,
        'random_state': 42,

        # Custom parameters
        'timeoutlierremover' : hp.choice('timeoutlierremover_rf', range(0,5)),
        'scale': hp.choice('scale_rf', range(0,4)),
        'enrollmentoutlierremover' : hp.choice ('enrollmentoutlierremover_rf', [False, "IQR", "MAD"]),
        'preproc_algo' :  hp.choice('preproc_algo_rf', [{'preproc_algo' : 'PCA', 'whiten':hp.choice('pca_whiten_rf', whitens) },
                                                     {'preproc_algo':'feature_selector'},
                                                     {'preproc_algo':'no_preproc'}])
    }
    ,

    {   # https://xgboost.readthedocs.io/en/latest/python/python_api.html <=> XGBregressor
        'type': 'xgb',
        'n_estimators': hp.choice('n_estimators_xgb', range(1, 200)),
        'booster' : hp.choice('boosting_xgb', boosting_list_xgb),
        'learning_rate' : hp.loguniform('learning_rate_xgb', np.log(0.005), np.log(0.2)),
        'tree_method' : hp.choice('tree_method_xgb', tree_method),
        'gamma' : hp.uniform('gamma', 0.1, 3),
        'max_depth': hp.choice('max_depth_xgb', range(1, 40)),
        'min_child_weight' : hp.uniform('min_child_weight', 0, 5),
        'max_delta_step': hp.choice('max_delta_step', range(1, 10)),
        'subsample' : hp.quniform('subsample_xgb', 0.4, 0.8, 0.05), # must be set to a value less than 1 to enable random selection of training cases (rows). Typically set >= 0.5 with sampling_method = uniform for good results
        'sampling_method': hp.choice('sampling_method', sampling_method_xgb),

        # One of colsample_by* parameters must be set to a value less than 1 to enable random selection of columns
        'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1, 0.01),
        'colsample_bynode' : hp.quniform('colsample_bynode', 0.1, 1, 0.01),
        'colsample_bylevel' : hp.quniform('colsample_bylevel', 0.1, 1, 0.01),
        
        'reg_alpha' : hp.uniform('reg_alpha', 0, 5),
        'reg_lambda' : hp.uniform('reg_lambda', 0, 5),
        'random_state': 42,

        'objective' : hp.choice('objective', objective_list_xgb),
        'n_jobs': -1,
        'eval_metric' : 'mae',
     
        # Custom parameters
        'timeoutlierremover' : hp.choice('timeoutlierremover_xgb',range(0,5)),
        'enrollmentoutlierremover' : hp.choice ('enrollmentoutlierremover_xgb', [False, "IQR", "MAD"]),         
        'preproc_algo': hp.choice('preproc_algo_xgb', [{'preproc_algo' : 'PCA','whiten':hp.choice('pca_whiten_xgb', whitens) },
                                                     {'preproc_algo':'feature_selector'},
                                                     {'preproc_algo':'no_preproc'}]),
        'scale': hp.choice('scale_xgb',  range(0,4))
    }
    ,
    {   # https://xgboost.readthedocs.io/en/latest/python/python_api.html <=> XGBRFregressor
        'type': 'xgbRF',
        'max_depth': hp.choice('max_depth_xgbrf', range(1, 100)),
        'learning_rate' : hp.loguniform('learning_rate_xgbrf', np.log(0.005), np.log(0.2)),
        #'booster' : hp.choice('boosting_xgbrf', boosting_list_xgb),
        'tree_method' : hp.choice('tree_method_xgbrf', tree_method_xgb),
        'gamma' : hp.uniform('gamma_xgbrf', 0.1, 3),
        'min_child_weight' : hp.uniform('min_child_weight_xgbrf', 0, 5),
        'max_delta_step': hp.choice('max_delta_step_xgbrf', range(1, 10)),
        'subsample' : hp.quniform('subsample_xgbrf', 0.4, 0.8, 0.05), # must be set to a value less than 1 to enable random selection of training cases (rows). Typically set >= 0.5 with sampling_method = uniform for good results

        # One of colsample_by* parameters must be set to a value less than 1 to enable random selection of columns
        'colsample_bytree' : hp.quniform('colsample_bytree_xgbrf', 0.1, 1, 0.01),
        'colsample_bynode' : hp.quniform('colsample_bynode_xgbrf', 0.1, 1, 0.01),
        'colsample_bylevel' : hp.quniform('colsample_bylevel_xgbrf', 0.1, 1, 0.01),

        'reg_alpha' : hp.uniform('reg_alpha_xgbrf', 0, 5),
        'reg_lambda' : hp.uniform('reg_lambda_xgbrf', 0, 5),
        'random_state': 42,
        'objective' : hp.choice('objective_xgbrf', objective_list_xgb),
        'n_jobs': -1,
        'eval_metric' : 'mae',
        # Custom parameters
        'timeoutlierremover' : hp.choice('timeoutlierremover_xgbrf',range(0,5)),
        'scale': hp.choice('scale_xgbrf',  range(0,4)),
        'enrollmentoutlierremover' : hp.choice ('enrollmentoutlierremover_xgbrf', [False, "IQR", "MAD"]),
        'preproc_algo': hp.choice('preproc_algo_xgbrf', [{'preproc_algo' : 'PCA', 'whiten':hp.choice('pca_whiten_xgbrf', whitens) },
                                                     {'preproc_algo':'feature_selector'},
                                                     {'preproc_algo':'no_preproc'}]),
    }
    ,   
    { # https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html?highlight=LGBMRegressor <=> LightGBM
        'type': 'lgbm',
        'boosting_type' : hp.choice('boosting_type_lgbm', boosting_list),
        'num_leaves' : hp.choice('num_leaves', range(2, 50)),
        'max_depth': hp.choice('max_depth_lgbm', range(1, 100)),
        'learning_rate' : hp.loguniform('learning_rate_lgbm', np.log(0.005), np.log(0.2)),
        'n_estimators': hp.choice('n_estimators_lgbm', range(30, 150)),
        'subsample_for_bin': hp.choice('subsample_for_bin', range(10000, 30000)),
        'objective' : hp.choice('objective_lgb', objective_list), # default = regression for LGBMRegressor
        'min_split_gain': hp.uniform('min_split_gain_lgbm', 0.1, 10),
        'min_child_weight' : hp.uniform('min_child_weight_lgbm', 0, 5),
        'subsample' : hp.quniform('subsample_lgbm', 0.4, 1, 0.05),
        'colsample_bytree' : hp.quniform('colsample_bytree_lgbm', 0.1, 1, 0.01),
        'reg_alpha' : hp.uniform('reg_alpha_lgbm', 0, 5),
        'reg_lambda' : hp.uniform('reg_lambda_lgbm', 0, 5),
        'max_bin': hp.choice('max_bin_lgb', range(20,255)),
        'top_rate': hp.uniform('top_rate', 0, 0.5),
        'other_rate': hp.uniform('other_rate', 0, 0.5),
        'random_state': 42,
     

        # Custom parameters
        'timeoutlierremover' : hp.choice('timeoutlierremover_lgb',range(0,5)),
        'scale': hp.choice('scale_lgb', range(0,4)),
        'enrollmentoutlierremover': hp.choice('enrollmentoutlierremover', [False, "IQR", "MAD"]),
        'preproc_algo': hp.choice('preproc_algo_lgm,', [{'preproc_algo' : 'PCA','whiten':hp.choice('pca_whiten_lgbm', whitens) },
                                                     {'preproc_algo':'feature_selector'},
                                                     {'preproc_algo':'no_preproc'}])
    }
])}

## Define objective function

In [None]:
def objective(space_params):
    steps = []
    model = space_params['model'].copy()

    t = space_params['model']['type']
    print(f"Running: {t}")  ## print out the model it selects for each run

    # Option steps: timeoutlierremover and enrollment count outlier remover 
    df_train_X_new, df_train_y_new = removeTimeAndEnrollmentOutliers(space_params['model'], df_train_X, df_train_y)

    #Time Series Split for the Validation Set as well
    train_size = int(len(df_train_X_new) * 0.8)
    train_X, val_X = df_train_X_new[0:train_size], df_train_X_new[train_size:len(df_train_X_new)]
    train_y, val_y = df_train_y_new[0:train_size], df_train_y_new[train_size:len(df_train_y_new)]

    #train_X, val_X, train_y, val_y = train_test_split(df_train_X_new, df_train_y_new, test_size=0.2,random_state= 42)

    # Option steps: feature selector or PCA or nothing
    steps, val_X, train_X = get_steps(space_params['model'],  val_X, train_X, train_y)

    del space_params['model']['type']

    #==========
    #RandomForest
    #==========    
    
    if t == 'rf':
        steps.append(('Random Forest', RandomForestRegressor(**(space_params["model"]))))

        # print(f"steps: {steps}")
        # print(f"MAE val: {mae}")

    #==========
    #XGBoost
    #==========
    elif t == 'xgb':
        if space_params['model']['tree_method']['tree_method'] == 'hist':
            # only 'hist' has max_bin 
            max_bin = space_params['model']['tree_method'].get('max_bin')
            space_params['model']['max_bin'] = int(max_bin)
            
            if space_params['model']['tree_method']['grow_policy']['grow_policy']['grow_policy'] == 'depthwise':
                grow_policy = space_params['model']['tree_method'].get('grow_policy').get('grow_policy').get('grow_policy')
                space_params['model']['grow_policy'] = grow_policy
                space_params['model']['tree_method'] = 'hist'
            else:
                max_leaves = space_params['model']['tree_method']['grow_policy']['grow_policy'].get('max_leaves')
                space_params['model']['grow_policy'] = 'lossguide'
                space_params['model']['max_leaves'] = int(max_leaves)
                space_params['model']['tree_method'] = 'hist'
        else:
            space_params['model']['tree_method'] = space_params['model']['tree_method'].get('tree_method')
        


        steps.append(('XGBregressor', xgb.XGBRegressor(**(space_params["model"]))))


        # if booster != 'gblinear':
            # logging.info(f"Important features: {model.feature_importances_}")


    #==========
    #XGB Random Forest Regressor
    #==========
    elif t == 'xgbRF':
        # logging.info(f"Model: {space_params['model']}")
                                 # ,num_parallel_tree=100

        steps.append(('XGBRF',xgb.XGBRFRegressor(**(space_params["model"]))))



    #==========
    #LightGBM
    #==========
    elif t == 'lgbm':
        # logging.info(f"Model: {space_params['model']}")
        """
        top rate and other rate used only in 'goss' 
        constraints: 0.0 <= top_rate + other_rate <= 1.0
        to enable bagging, subsample_freq should be set to a non zero value as well
        to enable bagging, subsample should be set to value smaller than 1.0 as well
        """
        boosting_type = space_params['model']['boosting_type']
                
        if boosting_type  == 'goss':
            # cannot use subsample in goss
            space_params["model"]['subsample_freq'] = 0
        else:
            space_params["model"]['subsample_freq'] = 1

        # sometimes different versions of LGBM have errors with alias names
        steps.append(('LGBMregressor', lgb.LGBMRegressor(**(space_params["model"]))))

    # logging.info(steps)
    print(f"steps: {steps}")    
    pipe = Pipeline(steps)
    pipe.fit(train_X,train_y)
    pred = pipe.predict(val_X)
    mae = mean_absolute_error(val_y, pred)
    print(f"MAE val: {mae}")

    return {
        'loss': mae,
        'status': STATUS_OK,
        # 'eval_time': time.time(),
        'version': __VERSION__,
        'model': model,
        'selectedFeature': list(train_X.columns)
        }

## Find the best model on training set and predict the validation set

In [None]:
# trialsMongoRegression = MongoTrials("mongo://127.0.0.1:27017/hyperopt/jobs", exp_key="regression")

In [None]:
trials = Trials()
best = fmin(objective,
    space = space_params,
    algo = tpe.suggest,
    max_evals = 2,
    trials = trials
    ) # Use trialsMongoRegression to run parallel search

print(best)

Running: rf
time2
92 features with a single unique value.

126 features with a correlation magnitude greater than 0.98.

Training Gradient Boosting Model

Fit LGBM Model with best params
Done fitting
Fit LGBM Model with best params
Done fitting
Fit LGBM Model with best params
Done fitting
Fit LGBM Model with best params
Done fitting
Fit LGBM Model with best params
Done fitting
Fit LGBM Model with best params
Done fitting
Fit LGBM Model with best params
Done fitting
Fit LGBM Model with best params
Done fitting
Fit LGBM Model with best params
Done fitting
Fit LGBM Model with best params
Done fitting
Feature importances:
                                                feature  importance  \
0                                               MeshID1       134.0   
1                                        #EligiCriteria       119.0   
2                                           AvgLocalAge       115.0   
3                                   min_city_population       110.0   
4                  

## Run on test set

In [None]:
testStep = [('XGBregressor', xgb.XGBRegressor(base_score=0.5, booster='dart', colsample_bylevel=0.33,
             colsample_bynode=0.97, colsample_bytree=0.62, eval_metric='mae',
             gamma=2.7008513657625777, importance_type='gain',
             learning_rate=0.08084281564769251, max_delta_step=9, max_depth=38,
             min_child_weight=3.861309819546359, missing=None, n_estimators=138,
             n_jobs=4, nthread=None, objective='reg:tweedie', random_state=0,
             reg_alpha=1.0402613374945036, reg_lambda=0.6403692625068597,
             sampling_method='uniform', scale_pos_weight=1, seed=None,
             silent=None, subsample=0.5, tree_method='approx', verbosity=1))]

pipe = Pipeline(testStep)
pipe.fit(df_train_X,df_train_y)
pred = pipe.predict(df_test_X)
mae = mean_absolute_error(df_test_y, pred)
print(f"MAE val: {mae}")

MAE val: 8.07984074707807


# Regression for target encoding

### Read csv for target encoding


In [None]:
!pip install category_encoders
import category_encoders as ce
target_data = pd.read_csv('pipeline_target_output.csv', sep=";")
target_data = target_data.drop(columns = ['Unnamed: 0','index'])
target_data.head()



### Create holdout set

In [None]:
# # Create Random Split
# X = target_data.copy().drop(columns=['EnrollmentDuration'])
# Y = target_data.copy()['EnrollmentDuration']

# df_train_X, df_test_X, df_train_y, df_test_y = train_test_split(X, Y, test_size=0.1, random_state=42)

In [None]:
#Create Time Series Split

X = target_data.sort_values(by="StartYear").copy()

X=X[X["StartYear"]>1995]
X=X[X["StartYear"]<2018]

print(f"Size of dataset: {len(X)} records")

Y= X["EnrollmentDuration"]
X = X.drop("EnrollmentDuration", axis =1)

#Time series split
train_size = int(len(X) * 0.9)
df_train_X, df_test_X = X[0:train_size], X[train_size:len(X)]
df_train_y, df_test_y = Y[0:train_size], Y[train_size:len(Y)]

#Info about the train and test set
print("Train Set Time Range: from ", df_train_X["StartYear"].min(), " to ", df_train_X["StartYear"].max())
print("Test Set Time Range: from ", df_test_X["StartYear"].min(), " to ", df_test_X["StartYear"].max(), "\n")
print("Median Duration Training set: ", df_train_y.median())
print("Median Duration Test set: ", df_test_y.median())

Size of dataset: 4819 records
Train Set Time Range: from  1996.0  to  2005.0
Test Set Time Range: from  2005.0  to  2012.0 

Median Duration Training set:  48.0
Median Duration Test set:  32.0


### Set searching spaces

In [None]:
# set searching space
space_params = {
    'model': hp.choice('model_type', [

    { # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html <=> Random Forest
        'type': 'rf',
        'n_estimators': hp.choice('n_estimators_rf', range(25, 150)),
        'max_depth': hp.choice('max_depth_rf', range(5, 100)),
        'min_samples_split': hp.choice('min_samples_split', range(2, 100)),
        'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 100)),
        'max_leaf_nodes': hp.choice('max_leaf_nodes', range(10, 100)),
        'min_impurity_decrease': hp.quniform('min_impurity_decrease', 0, 0.4, 0.1),
        'n_jobs': -1 ,  #-1 means using all processors,
        'random_state': 42,

        # Custom parameters
        'timeoutlierremover' : hp.choice('timeoutlierremover_rf', range(0,5)),
        'scale': hp.choice('scale_rf', range(0,4)),
        'enrollmentoutlierremover' : hp.choice ('enrollmentoutlierremover_rf', [False, "IQR", "MAD"]),
        'preproc_algo' :  hp.choice('preproc_algo_rf', [{'preproc_algo' : 'PCA', 'whiten':hp.choice('pca_whiten_rf', whitens) },
                                                     {'preproc_algo':'feature_selector'},
                                                     {'preproc_algo':'no_preproc'}])
    }
    ,

    {   # https://xgboost.readthedocs.io/en/latest/python/python_api.html <=> XGBregressor
        'type': 'xgb',
        'n_estimators': hp.choice('n_estimators_xgb', range(1, 200)),
        'booster' : hp.choice('boosting_xgb', boosting_list_xgb),
        'learning_rate' : hp.loguniform('learning_rate_xgb', np.log(0.005), np.log(0.2)),
        'tree_method' : hp.choice('tree_method_xgb', tree_method),
        'gamma' : hp.uniform('gamma', 0.1, 3),
        'max_depth': hp.choice('max_depth_xgb', range(1, 40)),
        'min_child_weight' : hp.uniform('min_child_weight', 0, 5),
        'max_delta_step': hp.choice('max_delta_step', range(1, 10)),
        'subsample' : hp.quniform('subsample_xgb', 0.4, 0.8, 0.05), # must be set to a value less than 1 to enable random selection of training cases (rows). Typically set >= 0.5 with sampling_method = uniform for good results
        'sampling_method': hp.choice('sampling_method', sampling_method_xgb),

        # One of colsample_by* parameters must be set to a value less than 1 to enable random selection of columns
        'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1, 0.01),
        'colsample_bynode' : hp.quniform('colsample_bynode', 0.1, 1, 0.01),
        'colsample_bylevel' : hp.quniform('colsample_bylevel', 0.1, 1, 0.01),
        
        'reg_alpha' : hp.uniform('reg_alpha', 0, 5),
        'reg_lambda' : hp.uniform('reg_lambda', 0, 5),
        'random_state': 42,

        'objective' : hp.choice('objective', objective_list_xgb),
        'n_jobs': 4,
        'eval_metric' : 'mae',
     
        # Custom parameters
        'timeoutlierremover' : hp.choice('timeoutlierremover_xgb', range(0,5)),
        'enrollmentoutlierremover' : hp.choice ('enrollmentoutlierremover_xgb', [False, "IQR", "MAD"]),         
        'preproc_algo': hp.choice('preproc_algo_xgb', [{'preproc_algo' : 'PCA','whiten':hp.choice('pca_whiten_xgb', whitens) },
                                                     {'preproc_algo':'feature_selector'},
                                                     {'preproc_algo':'no_preproc'}]),
        'scale': hp.choice('scale_xgb',  range(0,4))
    }
    ,
    {   # https://xgboost.readthedocs.io/en/latest/python/python_api.html <=> XGBRFregressor
        'type': 'xgbRF',
        'max_depth': hp.choice('max_depth_xgbrf', range(1, 100)),
        'learning_rate' : hp.loguniform('learning_rate_xgbrf', np.log(0.005), np.log(0.2)),
        #'booster' : hp.choice('boosting_xgbrf', boosting_list_xgb),
        'tree_method' : hp.choice('tree_method_xgbrf', tree_method_xgb),
        'gamma' : hp.uniform('gamma_xgbrf', 0.1, 3),
        'min_child_weight' : hp.uniform('min_child_weight_xgbrf', 0, 5),
        'max_delta_step': hp.choice('max_delta_step_xgbrf', range(1, 10)),
        'subsample' : hp.quniform('subsample_xgbrf', 0.4, 0.8, 0.05), # must be set to a value less than 1 to enable random selection of training cases (rows). Typically set >= 0.5 with sampling_method = uniform for good results

        # One of colsample_by* parameters must be set to a value less than 1 to enable random selection of columns
        'colsample_bytree' : hp.quniform('colsample_bytree_xgbrf', 0.1, 1, 0.01),
        'colsample_bynode' : hp.quniform('colsample_bynode_xgbrf', 0.1, 1, 0.01),
        'colsample_bylevel' : hp.quniform('colsample_bylevel_xgbrf', 0.1, 1, 0.01),

        'reg_alpha' : hp.uniform('reg_alpha_xgbrf', 0, 5),
        'reg_lambda' : hp.uniform('reg_lambda_xgbrf', 0, 5),
        'random_state': 42,
        'objective' : hp.choice('objective_xgbrf', objective_list_xgb),
        'n_jobs': 4,
        'eval_metric' : 'mae',
        # Custom parameters
        'timeoutlierremover' : hp.choice('timeoutlierremover_xgbrf', range(0,5)),
        'scale': hp.choice('scale_xgbrf',  range(0,4)),
        'enrollmentoutlierremover' : hp.choice ('enrollmentoutlierremover_xgbrf', [False, "IQR", "MAD"]),
        'preproc_algo': hp.choice('preproc_algo_xgbrf', [{'preproc_algo' : 'PCA', 'whiten':hp.choice('pca_whiten_xgbrf', whitens) },
                                                     {'preproc_algo':'feature_selector'},
                                                     {'preproc_algo':'no_preproc'}]),
    }
    ,   
    { # https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html?highlight=LGBMRegressor <=> LightGBM
        'type': 'lgbm',
        'boosting_type' : hp.choice('boosting_type_lgbm', boosting_list),
        'num_leaves' : hp.choice('num_leaves', range(2, 50)),
        'max_depth': hp.choice('max_depth_lgbm', range(1, 100)),
        'learning_rate' : hp.loguniform('learning_rate_lgbm', np.log(0.005), np.log(0.2)),
        'n_estimators': hp.choice('n_estimators_lgbm', range(30, 150)),
        'subsample_for_bin': hp.choice('subsample_for_bin', range(10000, 30000)),
        'objective' : hp.choice('objective_lgb', objective_list), # default = regression for LGBMRegressor
        'min_split_gain': hp.uniform('min_split_gain_lgbm', 0.1, 10),
        'min_child_weight' : hp.uniform('min_child_weight_lgbm', 0, 5),
        'subsample' : hp.quniform('subsample_lgbm', 0.4, 1, 0.05),
        'colsample_bytree' : hp.quniform('colsample_bytree_lgbm', 0.1, 1, 0.01),
        'reg_alpha' : hp.uniform('reg_alpha_lgbm', 0, 5),
        'reg_lambda' : hp.uniform('reg_lambda_lgbm', 0, 5),
        'max_bin': hp.choice('max_bin_lgb', range(20,255)),
        'top_rate': hp.uniform('top_rate', 0, 0.5),
        'other_rate': hp.uniform('other_rate', 0, 0.5),
        'random_state': 42,
        'encoder': hp.choice('encoder',['target','categorical_encoding']),
     

        # Custom parameters
        'timeoutlierremover' : hp.choice('timeoutlierremover_lgb', range(0,5)),
        'scale': hp.choice('scale_lgb', range(0,4)),
        'enrollmentoutlierremover': hp.choice('enrollmentoutlierremover', [False, "IQR", "MAD"]),
        'preproc_algo': hp.choice('preproc_algo_lgm,', [{'preproc_algo' : 'PCA','whiten':hp.choice('pca_whiten_lgbm', whitens) },
                                                     {'preproc_algo':'feature_selector'},
                                                     {'preproc_algo':'no_preproc'}])
    }
])}

### Define Objective function

In [None]:
def objective(space_params):
    steps = []
    model1 = space_params['model'].copy()

    t = space_params['model']['type']
    print(f"Running: {t}")  ## print out the model it selects for each run

    # Option steps: timeoutlierremover and enrollment count outlier remover 
    df_train_X_new, df_train_y_new = removeTimeAndEnrollmentOutliers(space_params['model'], df_train_X, df_train_y)

    #Time Series Split for the Validation Set as well
    train_size = int(len(df_train_X_new) * 0.8)
    train_X, val_X = df_train_X_new[0:train_size], df_train_X_new[train_size:len(df_train_X_new)]
    train_y, val_y = df_train_y_new[0:train_size], df_train_y_new[train_size:len(df_train_y_new)]
    
    #train_X, val_X, train_y, val_y = train_test_split(df_train_X_new, df_train_y_new, test_size=0.2,random_state= 42)

    """
     Do target encoder for whole data
    # https://github.com/scikit-learn-contrib/category_encoders
    # https://brendanhasz.github.io/2019/03/04/target-encoding 
    Target data encoder was put beforehand so the model can further 
    process numerical value and thus do feature selection, PCA, or scaling.
    Since target encoders did not change how the original categorical label functions 
    in categorical encoding, it is fine to put target encoding before categorical encoding.
    Categorical encoding can then change numerical values back to categorical values.
    -
    Not possible to use steps for categorical_encoding.
    Cannot directly apply steps here, since this model have to directly use predict
    without fit the data first.
    The new settings allow this model to run without using steps.
    """
    encoder = ce.TargetEncoder()
    train_X = encoder.fit_transform(train_X, train_y)
    val_X = encoder.transform(val_X)

    # Option steps: feature selector or PCA or nothing

    if t == 'lgbm':
        if space_params['model']['encoder'] == 'categorical_encoding':
            val_X, train_X = feature_selector_cat(space_params['model'], val_X, train_X, train_y)
        else:
            steps, val_X, train_X = get_steps(space_params['model'], val_X, train_X, train_y)
    else:
        steps, val_X, train_X = get_steps(space_params['model'], val_X, train_X, train_y)
    del space_params['model']['type']

    #==========
    #RandomForest
    #==========    
    
    if t == 'rf':
        steps.append(('Random Forest', RandomForestRegressor(**(space_params["model"]))))
        pipe = Pipeline(steps)
        pipe.fit(train_X,train_y)
        pred = pipe.predict(val_X)
        mae = mean_absolute_error(val_y, pred)
        # print(f"steps: {steps}")
        # print(f"MAE val: {mae}")

    #==========
    #XGBoost
    #==========
    elif t == 'xgb':
        if space_params['model']['tree_method']['tree_method'] == 'hist':
            print('see maxbin',space_params['model'])
            max_bin = space_params['model']['tree_method'].get('max_bin')
            space_params['model']['max_bin'] = int(max_bin)
            
            if space_params['model']['tree_method']['grow_policy']['grow_policy']['grow_policy'] == 'depthwise':
                grow_policy = space_params['model']['tree_method'].get('grow_policy').get('grow_policy').get('grow_policy')
                space_params['model']['grow_policy'] = grow_policy
                space_params['model']['tree_method'] = 'hist'
            else:
                max_leaves = space_params['model']['tree_method']['grow_policy']['grow_policy'].get('max_leaves')
                space_params['model']['grow_policy'] = 'lossguide'
                space_params['model']['max_leaves'] = int(max_leaves)
                space_params['model']['tree_method'] = 'hist'
        else:
            space_params['model']['tree_method'] = space_params['model']['tree_method'].get('tree_method')
        


        steps.append(('XGBregressor', xgb.XGBRegressor(**(space_params["model"]))))
        pipe = Pipeline(steps)
        pipe.fit(train_X,train_y)
        pred = pipe.predict(val_X)
        mae = mean_absolute_error(val_y, pred)

  

    #==========
    #XGB Random Forest Regressor
    #==========
    elif t == 'xgbRF':
        steps.append(('XGBRF',xgb.XGBRFRegressor(**(space_params["model"]))))
        pipe = Pipeline(steps)
        pipe.fit(train_X,train_y)
        pred = pipe.predict(val_X)
        mae = mean_absolute_error(val_y, pred)



    #==========
    #LightGBM
    #==========
    elif t == 'lgbm':
        """
        top rate and other rate used only in 'goss' 
        constraints: 0.0 <= top_rate + other_rate <= 1.0
        to enable bagging, subsample_freq should be set to a non zero value as well
        to enable bagging, subsample should be set to value smaller than 1.0 as well
        """
        
        boosting_type = space_params['model']['boosting_type']      
        if boosting_type  == 'goss':
            # cannot use subsample in goss
            space_params["model"]['subsample_freq'] = 0
        else:
            space_params["model"]['subsample_freq'] = 1
        
        if space_params['model']['encoder'] == 'categorical_encoding':
            categorical_feats = ['Condition1','Condition2','HealthyVolunteers', 'Gender', 'IsFDARegulatedDrug', 'IsFDARegulatedDevice', 'DesignPrimaryPurpose', 'EnrollmentType', 'OrgClass','DesignAllocation','DesignInterventionModel']
            categorical_feats_ = categorical_feats.copy()
            for i in categorical_feats_:
                if i not in train_X.columns:
                    categorical_feats.remove(i)   
                    
            if categorical_feats != []:
                for c in categorical_feats:
                    train_X[c] = train_X[c].astype('category')
                    val_X[c] = val_X[c].astype('category')

            
            train_X_new = lgb.Dataset(train_X, train_y, categorical_feature = categorical_feats)
            val_X_new = lgb.Dataset(val_X, val_y , reference = train_X_new)
            

            model = lgb.train( space_params["model"]
                            ,train_X_new, num_boost_round=20,
                            valid_sets = val_X_new,
                            early_stopping_rounds=7)
            
            
            pred = model.predict(val_X)
            mae = mean_absolute_error(val_y, pred)

        else:


            #sometimes different versions of LGBM have errors with naming, try bagging_freq
            steps.append(('LGBMregressor', lgb.LGBMRegressor(**(space_params["model"]))))
            pipe = Pipeline(steps)
            pipe.fit(train_X,train_y)
            # predict: apply transforms to the data, and predict with the final estimator
            pred = pipe.predict(val_X)
            mae = mean_absolute_error(val_y, pred)


    return {
        'loss': mae,
        'status': STATUS_OK,
        'selectedFeatures': list(train_X.columns),
        'version': __VERSION__,
        'model': model1
        }


### Find best model

In [None]:
trials = Trials()
best = fmin(objective,
    space = space_params,
    algo = tpe.suggest,
    max_evals = 10,
    trials = trials)

print(best)

Running: lgbm
time1
37 features with a single unique value.

78 features with a correlation magnitude greater than 0.98.

Training Gradient Boosting Model

Fit LGBM Model with best params
Done fitting
Fit LGBM Model with best params
Done fitting
Fit LGBM Model with best params
Done fitting
Fit LGBM Model with best params
Done fitting
Fit LGBM Model with best params
Done fitting
Fit LGBM Model with best params
Done fitting
Fit LGBM Model with best params
Done fitting
Fit LGBM Model with best params
Done fitting
Fit LGBM Model with best params
Done fitting
Fit LGBM Model with best params
Done fitting
Feature importances:
                                                feature  importance  \
0                                        #EligiCriteria       664.0   
1                                            Condition1       521.0   
2                                   avg_city_population       469.0   
3                             #Pts/#DiffLocationCountry       427.0   
4                 

# Classification

In [None]:
#Drop trials before 1995 and after 2018
X = new_data.sort_values(by="StartYear").copy()

X=X[X["StartYear"]>1995]
X=X[X["StartYear"]<2018]

#Create X and Y
Y = X[['EnrollmentDuration']].copy()
X = X.copy().drop(columns=['EnrollmentDuration'])

print(f"Size of dataset: {len(X)} records")

Size of dataset: 4819 records


## Label Encoder

In [None]:
class LabelEncoder( BaseEstimator, TransformerMixin ):
  """
  Strategy:
      equal_interval - same bin size (needs frequency=interval size)
      equal_frequency - same number of occurrences in each bin (needs n_bins and labels)
  """
  def __init__(self, strategy='equal_frequency', n_bins = 5, labels = ['Very Short', 'Short', 'Medium', 'Long', 'Very Long'], frequency = 12):
    self.strategy = strategy
    self.n_bins = n_bins
    self.labels = labels
    self.frequency = frequency
    self.results = []

  def fit(self, X):
    if self.strategy == 'equal_interval':
      interval_range = pd.interval_range(start=0, freq=self.frequency, end=int(X.max())+self.frequency)
      self.groups, bin_edges = pd.cut(X[X.columns[0]], bins=interval_range, retbins=True, labels=False)
      self.results_table = pd.DataFrame(zip(bin_edges, range(0,len(interval_range))), columns=['Group', 'Label'])
      code = []
      for index, row in self.results_table.iterrows():
        # print(row['Group'].mid)
        code.append(int(row['Group'].mid))
        # print(row['Code'])
      self.results_table['Code'] = code
      display(self.results_table)
      for value in X[X.columns[0]]:
          for index, data in self.results_table.iterrows():
              if value in data.Group: self.results.append(data.Code)
      
    else:
      self.groups, bin_edges = pd.qcut(X[X.columns[0]], q=self.n_bins, labels=False, retbins=True)
      code = []
      for i in range(len(bin_edges)):
        if i == 0:
          continue
        else:
          median_value = round(bin_edges[i]+bin_edges[i-1])/2
          code.append(int(median_value))
      self.results_table = pd.DataFrame(zip(bin_edges, self.labels, code), columns=['Start Value', 'Label', 'Code'])
      for value in X[X.columns[0]]:
        for index, row in self.results_table.iterrows():
          if value < row['Start Value']:
            self.results.append(self.results_table.at[index-1, 'Code'])
            break
        else:
          self.results.append(self.results_table['Code'][6])
      display(self.results_table)
    return self

  def transform(self, X):
    df_test = X.copy()
    df_test['new'] = self.results
    display(df_test)
    X_new = pd.DataFrame(self.results, index=X.index, columns=X.columns)
    return X_new

In [None]:
label_transformer = LabelEncoder(strategy="equal_frequency", n_bins=7, labels = ['A', 'B', 'C', 'D', 'E', 'F', 'G'])
#label_transformer = LabelEncoder(strategy="equal_interval", frequency = 12)
Y_new = label_transformer.fit_transform(Y)
Y_new.head()

Unnamed: 0,Start Value,Label,Code
0,1.0,A,9
1,17.0,B,22
2,28.0,C,34
3,40.0,D,47
4,55.0,E,64
5,73.0,F,87
6,102.0,G,175


Unnamed: 0,EnrollmentDuration,new
293,163,175
292,106,175
296,169,175
290,44,47
299,134,175
...,...,...
3057,28,34
4369,22,22
2916,80,87
3631,45,47


Unnamed: 0,EnrollmentDuration
293,175
292,175
296,175
290,47
299,175


## Create holdout set

In [None]:
# #holdout set
#df_train_X, df_test_X, df_train_y, df_test_y = train_test_split(X, Y_new, test_size=0.1, random_state=42)

In [None]:
#create time series hold out set

#Time series split
train_size = int(len(X) * 0.9)
df_train_X, df_test_X = X[0:train_size], X[train_size:len(X)]
df_train_y, df_test_y = Y_new[0:train_size], Y_new[train_size:len(Y_new)]

#Info about the train and test set
print("Train Set Time Range: from ", df_train_X["StartYear"].min(), " to ", df_train_X["StartYear"].max())
print("Test Set Time Range: from ", df_test_X["StartYear"].min(), " to ", df_test_X["StartYear"].max(), "\n")
print("Median Duration Training set: ", df_train_y.median())
print("Median Duration Test set: ", df_test_y.median())

Train Set Time Range: from  1996.0  to  2005.0
Test Set Time Range: from  2005.0  to  2012.0 

Median Duration Training set:  EnrollmentDuration    47.0
dtype: float64
Median Duration Test set:  EnrollmentDuration    34.0
dtype: float64


## Some parameters for searching space

In [None]:
whitens = [False, True]
# preprocessing xgboost
boosting_list_xgb = ['gbtree', 'dart', 'gblinear']
sampling_method_xgb = ["uniform", "gradient_based"]
grow_policy_xgb = ["depthwise", "lossguide"]
objective_list_xgb = ['reg:linear', 'reg:gamma', 'reg:tweedie'] # for linear only
tree_method = [{'tree_method' : 'exact'},
               {'tree_method' : 'approx'},
               {'tree_method' : 'hist',
                'max_bin': hp.quniform('max_bin', 2**3, 2**7, 1),
                'grow_policy' : {'grow_policy': {'grow_policy':'depthwise'},
                'grow_policy' : {'grow_policy':'lossguide' ,
                                  'max_leaves': hp.quniform('max_leaves', 32, 100, 1)}}}]

# Available parameter for LightGBM
boosting_list = ['dart', 'goss', 'gbdt'] #'rf', 
objective_list = ['huber', 'gamma', 'fair', 'tweedie']
LGBM_MAX_DEPTH = 25
EVAL_METRIC_LGBM_REG = 'mae'

## Set Searching Spaces

In [None]:
space = hp.choice('classifier_type', [
    {
        'type': 'naive_bayes',
        'alpha': hp.uniform('alpha', 0.0, 2.0),
        
        # custom transformer
        'timeoutlierremover' : hp.choice('timeoutlierremover_nb',range(0,5)),
        'enrollmentoutlierremover' : hp.choice ('enrollmentoutlierremover_nb', [False, "IQR", "MAD"]),
        'scale': hp.choice('scale_nb', range(0,4)),
        'preproc_algo' :  hp.choice('preproc_algo_nb', [{'preproc_algo' : 'PCA','whiten':hp.choice('pca_whiten_nb', whitens) }, 
                                                        {'preproc_algo':'feature_selector'},
                                                        {'preproc_algo':'no_preproc'}])
    }
    , 
    {
        'type': 'svm',
        'C': hp.uniform('C', 0.01, 10.0),
        ## linear is super super slow
        # 'kernel': ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’,
        'kernel': hp.choice('kernel', ['poly', 'rbf', 'sigmoid']), # , 'precomputed' => cannot be used https://stackoverflow.com/questions/36306555/scikit-learn-grid-search-with-svm-regression
        'degree': hp.choice('degree_poly', range(2, 5)),
        'gamma': hp.choice('gamma', ['scale', 'auto']), # 0, 20.0 => test a simplified way
        'coef0': hp.uniform('coef0', 0, 5),
        'class_weight': hp.choice('class_weight_svm', ['balanced', None]),

        # custom transformer
        'timeoutlierremover' : hp.choice('timeoutlierremover_svm',range(0,5)),
        'enrollmentoutlierremover' : hp.choice ('enrollmentoutlierremover_svm', [False, "IQR", "MAD"]),
        'scale': hp.choice('scale_svm', range(0,4)),
        'preproc_algo' :  hp.choice('preproc_algo_svm', [{'preproc_algo' : 'PCA','whiten':hp.choice('pca_whiten_svm', whitens) }, 
                                                        {'preproc_algo':'feature_selector'},
                                                        {'preproc_algo':'no_preproc'}])
     }
    ,
    {
        'type': 'randomforest',
        'n_estimators': hp.choice('n_estimators_rf', range(50, 150)),
        'criterion': hp.choice('criterion_rf', ["gini", "entropy"]),
        'max_depth': hp.choice('max_depth_rf', range(1,50)),
        'min_samples_split': hp.choice('min_samples_split_rf', range(2, 100)),
        'min_samples_leaf': hp.choice('min_samples_leaf_rf', range(1, 100)),
        'max_features': hp.choice('max_features_rf', range(1,10)),
        'max_leaf_nodes': hp.choice('max_leaf_nodes_rf', range(10, 100)),
        'min_impurity_decrease': hp.quniform('min_impurity_decrease_rf', 0, 0.4, 0.1),
     
        # custom transformer
        'timeoutlierremover' : hp.choice('timeoutlierremover_rf',range(0,5)),
        'enrollmentoutlierremover' : hp.choice ('enrollmentoutlierremover_rf', [False, "IQR", "MAD"]),
        'scale': hp.choice('scale_rf', range(0,4)),
        'preproc_algo' :  hp.choice('preproc_algo_rf', [{'preproc_algo' : 'PCA','whiten':hp.choice('pca_whiten_rf', whitens) }, 
                                                        {'preproc_algo':'feature_selector'},
                                                        {'preproc_algo':'no_preproc'}])
    }
    # ,
    # {   # Curse of Dimensionality => Nearest neighbor classifiers are no longer meaningful (Data mining 2 lecture)
    #     'type': 'knn',
    #     'n_neighbors': hp.choice('knn_n_neighbors', range(2,30)),
    #     'leaf_size':  hp.choice('knn_leaf_size', range(30,50)),
    #     'timeoutlierremover' : hp.choice('knn_timeoutlierremover',[0,1])
    # }
    ,
    { 
        # https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier <=> XGBClassifier
        'type': 'xgb',
        'n_estimators': hp.choice('n_estimators_xgb', range(1, 200)),
        'booster' : hp.choice('boosting_xgb', boosting_list_xgb),
        'learning_rate' : hp.loguniform('learning_rate_xgb', np.log(0.005), np.log(0.2)),
        'tree_method' : hp.choice('tree_method_xgb', tree_method),
        'gamma' : hp.uniform('gamma_xgb', 0.1, 3),
        'max_depth': hp.choice('max_depth_xgb', range(1, 40)),
        'min_child_weight' : hp.uniform('min_child_weight', 0, 5),
        'max_delta_step': hp.choice('max_delta_step', range(1, 10)),
        'subsample' : hp.quniform('subsample_xgb', 0.4, 0.8, 0.05), # must be set to a value less than 1 to enable random selection of training cases (rows). Typically set >= 0.5 with sampling_method = uniform for good results
        'sampling_method': hp.choice('sampling_method', sampling_method_xgb),

        # One of colsample_by* parameters must be set to a value less than 1 to enable random selection of columns
        'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1, 0.01),
        'colsample_bynode' : hp.quniform('colsample_bynode', 0.1, 1, 0.01),
        'colsample_bylevel' : hp.quniform('colsample_bylevel', 0.1, 1, 0.01),
        
        'reg_alpha' : hp.uniform('reg_alpha_xgb', 0, 5),
        'reg_lambda' : hp.uniform('reg_lambda_xgb', 0, 5),

        'objective' : hp.choice('objective', objective_list_xgb),
     
        # Custom parameters
        'timeoutlierremover' : hp.choice('timeoutlierremover_xgb',range(0,5)),
        'enrollmentoutlierremover' : hp.choice ('enrollmentoutlierremover_xgb', [False, "IQR", "MAD"]),         
        'preproc_algo': hp.choice('preproc_algo_xgb', [{'preproc_algo' : 'PCA','whiten':hp.choice('pca_whiten_xgb', whitens) }, 
                                                        {'preproc_algo':'feature_selector'},
                                                        {'preproc_algo':'no_preproc'}]),
        'scale': hp.choice('scale_xgb', range(0,4))
    }
    ,
    {   # https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html <=> LightGBM
        # https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html 
        'type': 'lgbm',
        'boosting_type' : hp.choice('boosting_type_lgbm', boosting_list),
        'num_leaves' : hp.choice('num_leaves', range(2, 50)),
        'max_depth': hp.choice('max_depth_lgbm', range(1, 100)),
        'learning_rate' : hp.loguniform('learning_rate_lgbm', np.log(0.005), np.log(0.2)),
        'n_estimators': hp.choice('n_estimators_lgbm', range(30, 150)),
        'subsample_for_bin': hp.choice('subsample_for_bin', range(10000, 30000)),
        'class_weight': hp.choice ('class_weight_lgbm', ['balanced',None]),
        'objective' : hp.choice('objective_lgb', objective_list), # default = regression for LGBMRegressor
        'min_split_gain': hp.uniform('min_split_gain_lgbm', 0.1, 10),
        'min_child_weight' : hp.uniform('min_child_weight_lgbm', 0, 5),
        'subsample' : hp.quniform('subsample_lgbm', 0.4, 1, 0.05),
        'colsample_bytree' : hp.quniform('colsample_bytree_lgbm', 0.1, 1, 0.01),
        'reg_alpha' : hp.uniform('reg_alpha_lgbm', 0, 5),
        'reg_lambda' : hp.uniform('reg_lambda_lgbm', 0, 5),
        'max_bin': hp.choice('max_bin_lgb', range(20,255)),
        'top_rate': hp.uniform('top_rate', 0, 0.5),
        'other_rate': hp.uniform('other_rate', 0, 0.5),

        # Custom parameters
        'timeoutlierremover' : hp.choice('timeoutlierremover_lgb',range(0,5)),       
        'enrollmentoutlierremover': hp.choice('enrollmentoutlierremover', [False, "IQR", "MAD"]),
        'scale': hp.choice('scale_lgb', range(0,4)),
        'preproc_algo': hp.choice('preproc_algo_lgm,', [{'preproc_algo' : 'PCA','whiten':hp.choice('pca_whiten_lgbm', whitens) }, 
                                                        {'preproc_algo':'feature_selector'},
                                                        {'preproc_algo':'no_preproc'}])
    }
])

## Define Objective Function and create weighted confusion matrix

In [None]:
def objective_classifier(params):
    print(params)
    steps = []
    model = params.copy()

    t = params['type']
    print(f"Running: {t}")  ## print out the model it selects for each run

    # Option steps: timeoutlierremover and enrollment count outlier remover 
    df_train_X_new, df_train_y_new = removeTimeAndEnrollmentOutliers(params, df_train_X, df_train_y)

    train_size = int(len(df_train_X_new) * 0.8)
    train_X, val_X = df_train_X_new[0:train_size], df_train_X_new[train_size:len(df_train_X_new)]
    train_y, val_y = df_train_y_new[0:train_size], df_train_y_new[train_size:len(df_train_y_new)]

    # Option steps: feature selector or PCA or nothing
    steps, val_X, train_X = get_steps(params, val_X, train_X, train_y)
    

    del params['type']

    #==========
    # Naive Bayes - BernoulliNB
    #==========  

    if t == 'naive_bayes':
        steps.append(('naive_bayes', BernoulliNB(**params)))

    #==========
    # SVM
    #==========   
    elif t == 'svm':
        steps.append(('svm', SVC(**params, random_state=42)))

    #==========
    # KNN
    #==========  
    # elif t == 'knn':    
    #     clf = KNeighborsClassifier(**params)

    #==========
    # Random forest
    #==========  
    elif t == 'randomforest':
        steps.append(('dt', RandomForestClassifier(**params, random_state=42)))
    
    #==========
    # XGBClassifier
    #========== 
    elif t == 'xgb':
        if params['tree_method']['tree_method'] == 'hist':
            print('see maxbin',params)
            max_bin = params['tree_method'].get('max_bin')
            params['max_bin'] = int(max_bin)
            
            if params['tree_method']['grow_policy']['grow_policy']['grow_policy'] == 'depthwise':
                grow_policy = params['tree_method'].get('grow_policy').get('grow_policy').get('grow_policy')
                params['grow_policy'] = grow_policy
                params['tree_method'] = 'hist'
            else:
                max_leaves = params['tree_method']['grow_policy']['grow_policy'].get('max_leaves')
                params['grow_policy'] = 'lossguide'
                params['max_leaves'] = int(max_leaves)
                params['tree_method'] = 'hist'
        else:
            params['tree_method'] = params['tree_method'].get('tree_method')

        steps.append(('xgb', xgb.XGBClassifier(**params, random_state=42)))
    
    #==========
    # LGBMClassifier
    #========== 
    elif t == 'lgbm':
        """
        top rate and other rate used only in 'goss' 
        constraints: 0.0 <= top_rate + other_rate <= 1.0
        to enable bagging, subsample_freq should be set to a non zero value as well
        to enable bagging, subsample should be set to value smaller than 1.0 as well
        """
        boosting_type = params['boosting_type']
                
        if boosting_type  == 'goss':
            # cannot use subsample in goss
            params['subsample_freq'] = 0
        else:
            params['subsample_freq'] = 1

        steps.append(('LGBM', lgb.LGBMClassifier(**params, random_state=42)))
    else:
        return 'error'

    # return cross_val_score(clf, train_X, train_y, scoring='f1_weighted', cv = 5).mean()
    print(f"steps: {steps}")
    pipe = Pipeline(steps)
    pipe.fit(train_X,train_y)
    pred = pipe.predict(val_X)
    mae = mean_absolute_error(val_y, pred)
    print(mae)
    # f1 = cross_val_score(pipe, train_X, train_y, scoring='f1_weighted', cv = 5).mean()

    return {
        'loss': mae,
        'status': STATUS_OK,
        'version': __VERSION__,
        'selectedFeatures': list(train_X.columns),
        'model': model,
        'pred': pred
        }

## Find the best model on training set and predict the validation set

In [None]:
# trialsMongoClassification = MongoTrials("mongo://127.0.0.1:27017/hyperopt/jobs", exp_key="classification")

In [None]:
trials = Trials()

best = fmin(objective_classifier,                          
    space = space,                        
    algo = tpe.suggest,
    max_evals = 10,
    trials = trials)

print('Best:')
print(best)

{'criterion': 'entropy', 'enrollmentoutlierremover': 'MAD', 'max_depth': 48, 'max_features': 6, 'max_leaf_nodes': 45, 'min_impurity_decrease': 0.2, 'min_samples_leaf': 98, 'min_samples_split': 53, 'n_estimators': 59, 'preproc_algo': {'preproc_algo': 'no_preproc'}, 'scale': 2, 'timeoutlierremover': 1, 'type': 'randomforest'}
Running: randomforest
time1
count
steps: [('MinMaxScaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('dt', RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=48, max_features=6,
                       max_leaf_nodes=45, max_samples=None,
                       min_impurity_decrease=0.2, min_impurity_split=None,
                       min_samples_leaf=98, min_samples_split=53,
                       min_weight_fraction_leaf=0.0, n_estimators=59,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False))]
53.69253731343284


# Classification for target encoding

### Read csv for target encoding

In [None]:
!pip install category_encoders
import category_encoders as ce
target_data = pd.read_csv('pipeline_target_output.csv', sep=";")
target_data = target_data.drop(columns = ['Unnamed: 0','index'])
target_data.head()



### Create holdout set

In [None]:
label_transformer = LabelEncoder(strategy="equal_frequency", n_bins=7, labels = ['A', 'B', 'C', 'D', 'E', 'F', 'G'])
# label_transformer = LabelEncoder(strategy="equal_interval", frequency = 12)
# target_data = target_data.head(3000) 

#Sort by StartYear
target_data = target_data.sort_values(by="StartYear").copy()

#Exclude trials before 1995 and after 2018
target_data=target_data[target_data["StartYear"]>1995]
target_data=target_data[target_data["StartYear"]<2018]

#Split into X and Y
Y = target_data[['EnrollmentDuration']].copy()
X = target_data.copy().drop(columns=['EnrollmentDuration'])

Y_new = label_transformer.fit_transform(Y)
Y_new.head()

print(f"Size of dataset: {len(X)} records")

Unnamed: 0,Start Value,Label,Code
0,1.0,A,9
1,17.0,B,22
2,28.0,C,34
3,40.0,D,47
4,55.0,E,64
5,73.0,F,87
6,102.0,G,175


Unnamed: 0,EnrollmentDuration,new
293,163,175
292,106,175
296,169,175
290,44,47
299,134,175
...,...,...
3057,28,34
4369,22,22
2916,80,87
3631,45,47


Size of dataset: 4819 records


In [None]:
#Random Split
#df_train_X, df_test_X, df_train_y, df_test_y = train_test_split(X, Y_new, test_size=0.1, random_state=42)

In [None]:
#Time Series Split

#Time series split
train_size = int(len(X) * 0.9)
df_train_X, df_test_X = X[0:train_size], X[train_size:len(X)]
df_train_y, df_test_y = Y_new[0:train_size], Y_new[train_size:len(Y_new)]

#Info about the train and test set
print("Train Set Time Range: from ", df_train_X["StartYear"].min(), " to ", df_train_X["StartYear"].max())
print("Test Set Time Range: from ", df_test_X["StartYear"].min(), " to ", df_test_X["StartYear"].max(), "\n")
print("Median Duration Training set: ", df_train_y.median())
print("Median Duration Test set: ", df_test_y.median())

Train Set Time Range: from  1996.0  to  2005.0
Test Set Time Range: from  2005.0  to  2012.0 

Median Duration Training set:  EnrollmentDuration    47.0
dtype: float64
Median Duration Test set:  EnrollmentDuration    34.0
dtype: float64


### Searching spaces

In [None]:
space = hp.choice('classifier_type', [
    {
        'type': 'naive_bayes',
        'alpha': hp.uniform('alpha', 0.0, 2.0),
        
        # custom transformer
        'timeoutlierremover' : hp.choice('timeoutlierremover_nb',range(0,5)),
        'enrollmentoutlierremover' : hp.choice ('enrollmentoutlierremover_nb', [False, "IQR", "MAD"]),
        'scale': hp.choice('scale_nb', range(0,4)),
        'preproc_algo' :  hp.choice('preproc_algo_nb', [{'preproc_algo' : 'PCA','whiten':hp.choice('pca_whiten_nb', whitens) }, 
                                                        {'preproc_algo':'feature_selector'},
                                                        {'preproc_algo':'no_preproc'}])
    }
    , 
    {
        'type': 'svm',
        'C': hp.uniform('C', 0.01, 10.0),
        ## linear is super super slow
        # 'kernel': ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’,
        'kernel': hp.choice('kernel', ['poly', 'rbf', 'sigmoid']), # , 'precomputed' => cannot be used https://stackoverflow.com/questions/36306555/scikit-learn-grid-search-with-svm-regression
        'degree': hp.choice('degree_poly', range(2, 5)),
        'gamma': hp.choice('gamma', ['scale', 'auto']), # 0, 20.0 => test a simplified way
        'coef0': hp.uniform('coef0', 0, 5),
        'class_weight': hp.choice('class_weight_svm', ['balanced', None]),

        # custom transformer
        'timeoutlierremover' : hp.choice('timeoutlierremover_svm',range(0,5)),
        'enrollmentoutlierremover' : hp.choice ('enrollmentoutlierremover_svm', [False, "IQR", "MAD"]),
        'scale': hp.choice('scale_svm', range(0,4)),
        'preproc_algo' :  hp.choice('preproc_algo_svm', [{'preproc_algo' : 'PCA','whiten':hp.choice('pca_whiten_svm', whitens) }, 
                                                        {'preproc_algo':'feature_selector'},
                                                        {'preproc_algo':'no_preproc'}])
     }
    ,
    {
        'type': 'randomforest',
        'n_estimators': hp.choice('n_estimators_rf', range(50, 150)),
        'criterion': hp.choice('criterion_rf', ["gini", "entropy"]),
        'max_depth': hp.choice('max_depth_rf', range(1,50)),
        'min_samples_split': hp.choice('min_samples_split_rf', range(2, 100)),
        'min_samples_leaf': hp.choice('min_samples_leaf_rf', range(1, 100)),
        'max_features': hp.choice('max_features_rf', range(1,10)),
        'max_leaf_nodes': hp.choice('max_leaf_nodes_rf', range(10, 100)),
        'min_impurity_decrease': hp.quniform('min_impurity_decrease_rf', 0, 0.4, 0.1),
     
        # custom transformer
        'timeoutlierremover' : hp.choice('timeoutlierremover_rf',range(0,5)),
        'enrollmentoutlierremover' : hp.choice ('enrollmentoutlierremover_rf', [False, "IQR", "MAD"]),
        'scale': hp.choice('scale_rf', range(0,4)),
        'preproc_algo' :  hp.choice('preproc_algo_rf', [{'preproc_algo' : 'PCA','whiten':hp.choice('pca_whiten_rf', whitens) }, 
                                                        {'preproc_algo':'feature_selector'},
                                                        {'preproc_algo':'no_preproc'}])
    }
    # ,
    # {   # Curse of Dimensionality => Nearest neighbor classifiers are no longer meaningful (Data mining 2 lecture)
    #     'type': 'knn',
    #     'n_neighbors': hp.choice('knn_n_neighbors', range(2,30)),
    #     'leaf_size':  hp.choice('knn_leaf_size', range(30,50)),
    #     'timeoutlierremover' : hp.choice('knn_timeoutlierremover',[0,1])
    # }
    ,
    { 
        # https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier <=> XGBClassifier
        'type': 'xgb',
        'n_estimators': hp.choice('n_estimators_xgb', range(1, 200)),
        'booster' : hp.choice('boosting_xgb', boosting_list_xgb),
        'learning_rate' : hp.loguniform('learning_rate_xgb', np.log(0.005), np.log(0.2)),
        'tree_method' : hp.choice('tree_method_xgb', tree_method),
        'gamma' : hp.uniform('gamma_xgb', 0.1, 3),
        'max_depth': hp.choice('max_depth_xgb', range(1, 40)),
        'min_child_weight' : hp.uniform('min_child_weight', 0, 5),
        'max_delta_step': hp.choice('max_delta_step', range(1, 10)),
        'subsample' : hp.quniform('subsample_xgb', 0.4, 0.8, 0.05), # must be set to a value less than 1 to enable random selection of training cases (rows). Typically set >= 0.5 with sampling_method = uniform for good results
        'sampling_method': hp.choice('sampling_method', sampling_method_xgb),

        # One of colsample_by* parameters must be set to a value less than 1 to enable random selection of columns
        'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1, 0.01),
        'colsample_bynode' : hp.quniform('colsample_bynode', 0.1, 1, 0.01),
        'colsample_bylevel' : hp.quniform('colsample_bylevel', 0.1, 1, 0.01),
        
        'reg_alpha' : hp.uniform('reg_alpha_xgb', 0, 5),
        'reg_lambda' : hp.uniform('reg_lambda_xgb', 0, 5),

        'objective' : hp.choice('objective', objective_list_xgb),
     
        # Custom parameters
        'timeoutlierremover' : hp.choice('timeoutlierremover_xgb',range(0,5)),
        'enrollmentoutlierremover' : hp.choice ('enrollmentoutlierremover_xgb', [False, "IQR", "MAD"]),         
        'preproc_algo': hp.choice('preproc_algo_xgb', [{'preproc_algo' : 'PCA','whiten':hp.choice('pca_whiten_xgb', whitens) }, 
                                                        {'preproc_algo':'feature_selector'},
                                                        {'preproc_algo':'no_preproc'}]),
        'scale': hp.choice('scale_xgb', range(0,4))
    }
    ,
    {   # https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html <=> LightGBM
        # https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html 
        'type': 'lgbm',
        'boosting_type' : hp.choice('boosting_type_lgbm', boosting_list),
        'num_leaves' : hp.choice('num_leaves', range(2, 50)),
        'max_depth': hp.choice('max_depth_lgbm', range(1, 100)),
        'learning_rate' : hp.loguniform('learning_rate_lgbm', np.log(0.005), np.log(0.2)),
        'n_estimators': hp.choice('n_estimators_lgbm', range(30, 150)),
        'subsample_for_bin': hp.choice('subsample_for_bin', range(10000, 30000)),
        'class_weight': hp.choice ('class_weight_lgbm', ['balanced',None]),
        'objective' : hp.choice('objective_lgb', objective_list), # default = regression for LGBMRegressor
        'min_split_gain': hp.uniform('min_split_gain_lgbm', 0.1, 10),
        'min_child_weight' : hp.uniform('min_child_weight_lgbm', 0, 5),
        'subsample' : hp.quniform('subsample_lgbm', 0.4, 1, 0.05),
        'colsample_bytree' : hp.quniform('colsample_bytree_lgbm', 0.1, 1, 0.01),
        'reg_alpha' : hp.uniform('reg_alpha_lgbm', 0, 5),
        'reg_lambda' : hp.uniform('reg_lambda_lgbm', 0, 5),
        'max_bin': hp.choice('max_bin_lgb', range(20,255)),
        'top_rate': hp.uniform('top_rate', 0, 0.5),
        'other_rate': hp.uniform('other_rate', 0, 0.5),
        'encoder': hp.choice('encoder',['target','categorical_encoding']),

        # Custom parameters
        'timeoutlierremover' : hp.choice('timeoutlierremover_lgb',range(0,5)),       
        'enrollmentoutlierremover': hp.choice('enrollmentoutlierremover', [False, "IQR", "MAD"]),
        'scale': hp.choice('scale_lgb', range(0,4)),
        'preproc_algo': hp.choice('preproc_algo_lgm,', [{'preproc_algo' : 'PCA','whiten':hp.choice('pca_whiten_lgbm', whitens) }, 
                                                        {'preproc_algo':'feature_selector'},
                                                        {'preproc_algo':'no_preproc'}])
    }
])

### Define objective function

In [None]:
def objective_classifier(params):
    print(params)
    steps = []
    model1 = params.copy()

    t = params['type']
    print(f"Running: {t}")  ## print out the model it selects for each run

    # Option steps: timeoutlierremover and enrollment count outlier remover 
    df_train_X_new, df_train_y_new = removeTimeAndEnrollmentOutliers(params, df_train_X, df_train_y)

    #Time Series Split
    train_size = int(len(df_train_X_new) * 0.8)
    train_X, val_X = df_train_X_new[0:train_size], df_train_X_new[train_size:len(df_train_X_new)]
    train_y, val_y = df_train_y_new[0:train_size], df_train_y_new[train_size:len(df_train_y_new)]
    
    """
     Do target encoder for whole data
    # https://github.com/scikit-learn-contrib/category_encoders
    # https://brendanhasz.github.io/2019/03/04/target-encoding 
    Target data encoder was put beforehand so the model can further 
    process numerical value and thus do feature selection, PCA, or scaling.
    Since target encoders did not change how the original categorical label functions 
    in categorical encoding, it is fine to put target encoding before categorical encoding.
    Categorical encoding can then change numerical values back to categorical values.
    -
    Not possible to use steps for categorical_encoding.
    Cannot directly apply steps here, since this model have to directly use predict
    without fit the data first.
    The new settings allow this model to run without using steps.
    """
    encoder = ce.TargetEncoder()
    train_X = encoder.fit_transform(train_X, train_y)
    val_X = encoder.transform(val_X)

    if t == 'lgbm':
        if params['encoder'] == 'categorical_encoding':
            val_X, train_X = feature_selector_cat(params, val_X, train_X, train_y)
        else:
            # Option steps: feature selector or PCA or nothing
            steps, val_X, train_X = get_steps(params, val_X, train_X, train_y)
    else:
        steps, val_X, train_X = get_steps(params, val_X, train_X, train_y)
    
    

    del params['type']

    #==========
    # Naive Bayes - BernoulliNB
    #==========  

    if t == 'naive_bayes':
        steps.append(('naive_bayes', BernoulliNB(**params)))
        pipe = Pipeline(steps)
        pipe.fit(train_X,train_y)
        pred = pipe.predict(val_X)
        mae = mean_absolute_error(val_y, pred)

    #==========
    # SVM
    #==========   
    elif t == 'svm':
        steps.append(('svm', SVC(**params, random_state=42)))
        pipe = Pipeline(steps)
        pipe.fit(train_X,train_y)
        pred = pipe.predict(val_X)
        mae = mean_absolute_error(val_y, pred)



    #==========
    # Random forest
    #==========  
    elif t == 'randomforest':
        steps.append(('dt', RandomForestClassifier(**params, random_state=42)))
        pipe = Pipeline(steps)
        pipe.fit(train_X,train_y)
        pred = pipe.predict(val_X)
        mae = mean_absolute_error(val_y, pred)
    
    #==========
    # XGBClassifier
    #========== 
    elif t == 'xgb':
        if params['tree_method']['tree_method'] == 'hist':
            print('see maxbin',params)
            max_bin = params['tree_method'].get('max_bin')
            params['max_bin'] = int(max_bin)
            
            if params['tree_method']['grow_policy']['grow_policy']['grow_policy'] == 'depthwise':
                grow_policy = params['tree_method'].get('grow_policy').get('grow_policy').get('grow_policy')
                params['grow_policy'] = grow_policy
                params['tree_method'] = 'hist'
            else:
                max_leaves = params['tree_method']['grow_policy']['grow_policy'].get('max_leaves')
                params['grow_policy'] = 'lossguide'
                params['max_leaves'] = int(max_leaves)
                params['tree_method'] = 'hist'
        else:
            params['tree_method'] = params['tree_method'].get('tree_method')

        steps.append(('xgb', xgb.XGBClassifier(**params, random_state=42)))
        pipe = Pipeline(steps)
        pipe.fit(train_X,train_y)
        pred = pipe.predict(val_X)
        mae = mean_absolute_error(val_y, pred)
    #==========
    # LGBMClassifier
    #========== 
    elif t == 'lgbm':
        """
        top rate and other rate used only in 'goss' 
        constraints: 0.0 <= top_rate + other_rate <= 1.0
        to enable bagging, subsample_freq should be set to a non zero value as well
        to enable bagging, subsample should be set to value smaller than 1.0 as well
        """
        boosting_type = params['boosting_type']      
        if boosting_type  == 'goss':
            # cannot use subsample in goss
            params['subsample_freq'] = 0
        else:
            params['subsample_freq'] = 1

        if params['encoder'] == 'categorical_encoding':
            
            categorical_feats = ['Condition1','Condition2','HealthyVolunteers', 'Gender', 'IsFDARegulatedDrug', 'IsFDARegulatedDevice', 'DesignPrimaryPurpose', 'EnrollmentType', 'OrgClass','DesignAllocation','DesignInterventionModel']
            categorical_feats_ = categorical_feats.copy()
            for i in categorical_feats_:
                if i not in train_X.columns:
                    categorical_feats.remove(i)   
                    
            if categorical_feats != []:
                for c in categorical_feats:
                    train_X[c] = train_X[c].astype('category')
                    val_X[c] = val_X[c].astype('category')

            ### Not possible to use steps
            train_X_new = lgb.Dataset(train_X, train_y, categorical_feature = categorical_feats)
            val_X_new = lgb.Dataset(val_X, val_y , reference = train_X_new)
            # print('Starting training...')

            #train
            model = lgb.train( params
                            ,train_X_new, num_boost_round=20,
                            valid_sets = val_X_new,
                            early_stopping_rounds=7)
            
            # print('Starting predict...')
            pred = model.predict(val_X)
            mae = mean_absolute_error(val_y, pred)

        else:
            #sometimes different versions of LGBM have errors with naming, try bagging_freq
            steps.append(('LGBMregressor', lgb.LGBMRegressor(**params, random_state=42)))
            pipe = Pipeline(steps)
            pipe.fit(train_X,train_y)
            # predict: apply transforms to the data, and predict with the final estimator
            pred = pipe.predict(val_X)
            mae = mean_absolute_error(val_y, pred)
    else:
        return 'error'

    # return cross_val_score(clf, train_X, train_y, scoring='f1_weighted', cv = 5).mean()
    print(f"steps: {steps}")
 
    # f1 = cross_val_score(pipe, train_X, train_y, scoring='f1_weighted', cv = 5).mean()

    return {
        'loss': mae,
        'status': STATUS_OK,
        'version': __VERSION__,
        'selectedFeatures': list(train_X.columns),
        'model': model1
        }

### Find best model

In [1]:
# trialsMongoTargetClassification = MongoTrials("mongo://127.0.0.1:27017/hyperopt/jobs", exp_key="classification_target")

In [None]:
trials = Trials()

best = fmin(objective_classifier,                          
    space = space,                        
    algo = tpe.suggest,
    max_evals = 10,
    trials = trials)

print(best)

{'boosting_type': 'gbdt', 'class_weight': 'balanced', 'colsample_bytree': 0.48, 'encoder': 'target', 'enrollmentoutlierremover': False, 'learning_rate': 0.005696150291105463, 'max_bin': 71, 'max_depth': 51, 'min_child_weight': 3.5068886736746507, 'min_split_gain': 7.515394846701819, 'n_estimators': 89, 'num_leaves': 5, 'objective': 'gamma', 'other_rate': 0.18689096978945197, 'preproc_algo': {'preproc_algo': 'no_preproc'}, 'reg_alpha': 3.3125703211320974, 'reg_lambda': 1.9797797894969542, 'scale': 0, 'subsample': 0.8, 'subsample_for_bin': 17907, 'timeoutlierremover': 4, 'top_rate': 0.38068588313733986, 'type': 'lgbm'}
Running: lgbm
time4
steps: [('LGBMregressor', LGBMRegressor(boosting_type='gbdt', class_weight='balanced',
              colsample_bytree=0.48, encoder='target', importance_type='split',
              learning_rate=0.005696150291105463, max_bin=71, max_depth=51,
              min_child_samples=20, min_child_weight=3.5068886736746507,
              min_split_gain=7.51539484

In [None]:
### ValueError: at least one array or dtype is required ... or somerimes 

# Dummy model

In [None]:
from sklearn.dummy import DummyRegressor

#create time series hold out set

#Use Time Series Split
X = new_data.sort_values(by="StartYear").copy()

X=X[X["StartYear"]>1995]
X=X[X["StartYear"]<2018]

print(f"Size of dataset: {len(X)} records")

Y= X["EnrollmentDuration"]
X = X.drop("EnrollmentDuration", axis =1)

#Time series split
train_size = int(len(X) * 0.9)
df_train_X, df_test_X = X[0:train_size], X[train_size:len(X)]
df_train_y, df_test_y = Y[0:train_size], Y[train_size:len(Y)]

Size of dataset: 29258 records


In [None]:
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(df_train_X, df_train_y)
dummy_pred = dummy_regr.predict(df_test_X)
mae = mean_absolute_error(df_test_y, dummy_pred)
print(f"MAE: {mae}")

MAE: 21.583414590638146


In [None]:
from sklearn.dummy import DummyClassifier

#Use Time Series Split
X = new_data.sort_values(by="StartYear").copy()

X=X[X["StartYear"]>1995]
X=X[X["StartYear"]<2018]

print(f"Size of dataset: {len(X)} records")

Y= X[["EnrollmentDuration"]]
X = X.drop("EnrollmentDuration", axis =1)

label_transformer = LabelEncoder(strategy="equal_frequency", n_bins=7, labels = ['A', 'B', 'C', 'D', 'E', 'F', 'G'])
#label_transformer = LabelEncoder(strategy="equal_interval", frequency = 12)
Y_new = label_transformer.fit_transform(Y)
Y_new.head()

Size of dataset: 29258 records


Unnamed: 0,Start Value,Label,Code
0,1.0,A,5
1,10.0,B,14
2,18.0,C,22
3,26.0,D,31
4,36.0,E,42
5,49.0,F,59
6,70.0,G,159


Unnamed: 0,EnrollmentDuration,new
352,80,159
7457,124,159
299,134,159
302,87,159
6259,29,31
...,...,...
28550,4,5
28447,34,31
28453,27,31
28452,27,31


Unnamed: 0,EnrollmentDuration
352,159
7457,159
299,159
302,159
6259,31


In [None]:
#create time series hold out set

#Time series split
train_size = int(len(X) * 0.9)
df_train_X, df_test_X = X[0:train_size], X[train_size:len(X)]
df_train_y, df_test_y = Y[0:train_size], Y[train_size:len(Y)]

In [None]:
dummy_clas = DummyClassifier(strategy="stratified")
dummy_clas.fit(df_train_X, df_train_y)
dummy_pred = dummy_clas.predict(df_test_X)
mae = mean_absolute_error(df_test_y, dummy_pred)
print(f"MAE: {mae}")

MAE: 27.420027341079972
