Project: Forecasting Patient Enrolment for Clinical Trials

Supervisor: Niklas Frühauf, Sovanta

Authors:
*   Luka Biedebach
*   Weiyi Chen
*   Giang Hoang
*   Carolin Holtermann
*   Stefan Sousa

# Preparation and Data Retrieval



In [None]:
from tqdm import tqdm                        # show progress in iterations
import pandas as pd

#pd.set_option('display.max_columns', None)  # show all columns of dataframe

## Initialize MongoDB instances
Note: this cell sometimes does not work on the first run; if that happens, restart the runtime and run it again.

In [None]:
from pymongo import MongoClient
import math
!pip install dnspython

import os

# NOTE(review): the connection strings below embed credentials in the notebook.
# Each URI can be supplied through an environment variable instead; the literal
# values remain only as a fallback so existing runs keep working. They should be
# rotated and removed from the notebook once the environment variables are set.

# MongoDB Collections: trials, country, hospitals, populationDensity
client = MongoClient(os.environ.get(
    "CLINICAL_TRIALS_MONGODB_URI",
    "mongodb+srv://sovanta:Si8T8TtsViHYenjx@clinicaltrials-exomh.mongodb.net/test?retryWrites=true&w=majority"))
db = client['clinical-trials']

# MongoDB Collections: regional, universities, worldrank
client2 = MongoClient(os.environ.get(
    "CLINICAL_TRIALS_MONGODB_URI_2",
    "mongodb+srv://sovanta:14FgeCSTCZF9HNVw@cluster0-3yb0b.mongodb.net/test?retryWrites=true&w=majority"))
db2 = client2['clinical-trials']

## Retrieve clinical trials data

In [None]:
# Define Criteria and Projection
# Every clause listed under '$and' has to hold for a trial to be retrieved.
criteria = {
    '$and': [
      {'LocationCountry': { '$not': {'$size': 0}}},                                         # at least one country
      {'LocationFacility': { '$not': {'$size': 0}}},                                        # at least one facility
      {'Condition': { '$not': {'$size': 0}}},                                               # at least one condition
      {'EnrollmentCount': {'$ne': 0}},                                                      # Number of patients > 0
      {'EnrollmentCount': {'$ne': None}},                                                   # EnrollmentCount != Null (None) - Actually this feature doesn't have any missing value
      {'EligibilityCriteria': {'$ne': None}},                                               # Only trials with EligibilityCriteria
      {'EligibilityCriteria': {'$ne': []}},                                                 # Only trials with EligibilityCriteria
      {'EnrollmentDuration': {'$ne': 0}},                                                   # Duration > 0 (month)
      {'EnrollmentDuration': {'$ne': None}},                                                # EnrollmentDuration != Null (None)
      # Not accepting phase 1, early phase 1 and not applicable. Accepting multiple-phase studies.
      # Each exclusion needs its own clause: repeating the '$ne' key inside a single
      # dict literal keeps only the last occurrence, so the first two exclusions
      # would silently be lost.
      {'Phase': {'$ne': ['Phase 1']}},
      {'Phase': {'$ne': ['Early Phase 1']}},
      {'Phase': {'$ne': ['Not Applicable']}},
      {'NCTId': {'$ne': 'NCT00001132'}}                                                     # Exception
    ]}

# Projection for the trials query: suppress Mongo's '_id' and request each of the
# fields named below (value 1 = include).
projectionTrials = {'_id': 0}
projectionTrials.update(dict.fromkeys((
    "NCTId",
    "OfficialTitle",
    "BriefSummary",
    "OrgClass",
    "Condition",
    "LeadSponsorClass",
    "CollaboratorClass",
    "EligibilityCriteria",
    "EnrollmentType",
    "EnrollmentCount",
    "Phase",
    "HealthyVolunteers",
    "Gender",
    "StdAge",
    "LocationFacility",
    "LocationCity",
    "LocationCountry",
    "IsFDARegulatedDrug",
    "IsFDARegulatedDevice",
    "ArmGroupLabel",
    "EnrollmentDuration",
    "InterventionName",
    "StartDate",
    "DesignPrimaryPurpose",
    "MaximumAge",
    "MinimumAge",
    "OrgFullName",
    "ConditionAncestorTerm",
    "ConditionBrowseBranchAbbrev",
    "ConditionMeshId",
    "LocationState",
    "LocationZip",
    "LocationPopulationDensity",
    "LeadSponsorName",

    # Newly added features
    "InterventionType",
    "CollaboratorName",
    "ArmGroupType",
    "BaselineDenomCountGroupId",
    "BaselineMeasureDispersionType",
    "DesignAllocation",
    "DesignInterventionModel",
    "Keyword",

    "EventsTimeFrame",
    "FlowDropWithdrawType",
    "FlowGroupDescription",
    "FlowGroupTitle",
    "FlowMilestoneType",
    "FlowPeriodTitle",
    "FlowRecruitmentDetails",
    "ArmGroupDescription",
    "ArmGroupInterventionName",
    "BaselineCategoryTitle",
    "BaselineClassTitle",
    "BaselineGroupDescription",
    "BaselineGroupTitle",
    "BaselineMeasureTitle",
    "BaselineMeasureUnitOfMeasure",
), 1))

# Fetch every trial matching `criteria`, restricted to the projected fields.
# For a quick experiment, chain .limit(1000) onto the find(...) call below.
trial_cursor = db.trials.find(filter=criteria, projection=projectionTrials)
df_raw = pd.DataFrame(list(trial_cursor))

print(f"Number of trials: {len(df_raw)}")
df_raw.head(5)

Number of trials: 30973


Unnamed: 0,NCTId,OrgFullName,OrgClass,Phase,StartDate,Condition,ConditionAncestorTerm,ConditionBrowseBranchAbbrev,ConditionMeshId,LeadSponsorName,LeadSponsorClass,CollaboratorName,CollaboratorClass,EligibilityCriteria,EnrollmentCount,EnrollmentType,HealthyVolunteers,Gender,StdAge,MinimumAge,MaximumAge,LocationFacility,LocationCity,LocationState,LocationZip,LocationCountry,InterventionType,InterventionName,IsFDARegulatedDevice,IsFDARegulatedDrug,EventsTimeFrame,FlowDropWithdrawType,FlowGroupDescription,FlowGroupTitle,FlowMilestoneType,FlowPeriodTitle,FlowRecruitmentDetails,ArmGroupDescription,ArmGroupInterventionName,ArmGroupLabel,ArmGroupType,BaselineCategoryTitle,BaselineClassTitle,BaselineDenomCountGroupId,BaselineGroupDescription,BaselineGroupTitle,BaselineMeasureDispersionType,BaselineMeasureTitle,BaselineMeasureUnitOfMeasure,DesignAllocation,DesignInterventionModel,DesignPrimaryPurpose,Keyword,EnrollmentDuration,LocationPopulationDensity
0,NCT00000143,Johns Hopkins Bloomberg School of Public Health,OTHER,[Phase 3],May 1997,"[Cytomegalovirus Retinitis, HIV Infections]","[Virus Diseases, Retinal Diseases, Eye Disease...","[BC01, All, BC02, BC20, BC11, Rare]","[D000017726, D000012173]",[Johns Hopkins Bloomberg School of Public Health],[OTHER],[],[],[Inclusion criteria:\n\nAge 13 years or older\...,61,Actual,No,All,"[Child, Adult, Older Adult]",13 Years,,"[Department of Ophthalmology, University of Ca...","[Irvine, La Jolla, Los Angeles, Los Angeles, S...","[California, California, California, Californi...","[92697-4375, 92093-0946, 90033, 90095-7003, 94...","[United States, United States, United States, ...","[Device, Drug]","[Ganciclovir implant and oral ganciclovir, Cid...",,,3 years,[],[Ganciclovir device and oral dose of Ganciclov...,"[Ganciclovir Implant and Oral Ganciclovir, Cid...","[STARTED, COMPLETED, NOT COMPLETED]",[Overall Study],June 1997,[Ganciclovir device and oral dose of Ganciclov...,[Device: Ganciclovir implant and oral ganciclo...,"[Ganciclovir implant and oral ganciclovir, Cid...","[Experimental, Experimental]","[<=18 years, Between 18 and 65 years, >=65 yea...",[United States],"[BG000, BG001, BG002]",[Ganciclovir device and oral dose of Ganciclov...,"[Ganciclovir Implant and Oral Ganciclovir, Cid...",[],"[Age, Categorical, Sex: Female, Male, Region o...","[Participants, Participants, participants]",Randomized,Parallel Assignment,Treatment,[],37,"[2114.408936, 1189.4198, 2287.767578, 3559.750..."
1,NCT00000170,Jaeb Center for Health Research,OTHER,[Phase 3],April 1999,[Amblyopia],"[Brain Diseases, Central Nervous System Diseas...","[BC10, BC11, BC23, All]",[D000000550],[Jaeb Center for Health Research],[OTHER],[National Eye Institute (NEI)],[NIH],[Inclusion Criteria:\n\nPatients must be 7 yea...,419,Actual,No,All,[Child],,6 Years,[Wilmer Eye Institute],[Baltimore],[Maryland],[21287-9028],[United States],"[Drug, Device]","[Atropine, Eye Patch]",,,,[],[],[],[],[],,[Atropine],"[Device: Eye Patch, Drug: Atropine]","[Patching, Atropine]","[Active Comparator, Active Comparator]",[],[],[],[],[],[],[],[],Randomized,Parallel Assignment,Treatment,"[Amblyopia, patching, atropine]",172,[1717.821167]
2,NCT00000177,National Institute on Aging (NIA),NIH,[Phase 3],October 1995,[Alzheimer Disease],"[Dementia, Brain Diseases, Central Nervous Sys...","[BC10, BXM, All, Rare]",[D000000544],[National Institute on Aging (NIA)],[NIH],[],[],[Inclusion Criteria:\n\nWomen with a diagnosis...,120,,No,Female,"[Adult, Older Adult]",60 Years,,"[University of Alabama, Birmingham, University...","[Birmingham, San Diego, Jacksonville, Tampa, A...","[Alabama, California, Florida, Florida, Georgi...","[35294-0017, 92093, 32225, 30329, 60612, 62702...","[United States, United States, United States, ...",[Drug],[Estrogen],,,,[],[],[],[],[],,[],[],[],[],[],[],[],[],[],[],[],[],Randomized,,Treatment,"[Alzheimer's disease, Estrogen]",39,"[230.015213, 1189.4198, 567.188477, 494.567383..."
3,NCT00000188,University of Pennsylvania,OTHER,[Phase 2],September 1994,[Cocaine-Related Disorders],"[Substance-Related Disorders, Chemically-Induc...","[BC25, BXM, All]",[D000019970],[University of Pennsylvania],[OTHER],[National Institute on Drug Abuse (NIDA)],[NIH],[Please contact site for information.],50,Actual,No,All,"[Child, Adult, Older Adult]",,,[PDVAMC Treatment Research Center],[Philadelphia],[Pennsylvania],[19104],[United States],[Drug],[Selegiline],,,,[],[],[],[],[],,[],[],[],[],[],[],[],[],[],[],[],[],Randomized,Parallel Assignment,Treatment,"[Cocaine Dependence, Selegiline]",13,[2131.81665]
4,NCT00000189,University of Pennsylvania,OTHER,[Phase 2],January 1990,[Cocaine-Related Disorders],"[Substance-Related Disorders, Chemically-Induc...","[BC25, BXM, All]",[D000019970],[University of Pennsylvania],[OTHER],[National Institute on Drug Abuse (NIDA)],[NIH],[Please contact site for information.],41,Actual,No,Male,"[Child, Adult, Older Adult]",,,[University of Pennsylvania],[Philadelphia],[Pennsylvania],[19104 6178],[United States],[Drug],[Gepirone],,,,[],[],[],[],[],,[],[],[],[],[],[],[],[],[],[],[],[],Randomized,Parallel Assignment,Treatment,"[Cocaine Dependence, Gepirone]",13,[2131.81665]


In [None]:
# Display the StartDate column of the retrieved trials.
df_raw["StartDate"]

0              May 1997
1            April 1999
2          October 1995
3        September 1994
4          January 1990
              ...      
30968         July 2018
30969        April 2019
30970     November 2018
30971         July 2018
30972    September 2012
Name: StartDate, Length: 30973, dtype: object

## Removing inconsistencies in clinical trials data

In [None]:
# Collect the index labels of all trials failing at least one consistency check.
# A trial may fail several checks; it is dropped once, and each counter reports
# the number of trials (not facilities) that failed the corresponding check.
indexis = set()
inc_counter, loc_counter, con_counter = 0, 0, 0
for index, row in df_raw.iterrows():
    # The facility, city and country lists must all have the same length.
    facility_count = len(row.LocationFacility)
    if facility_count != len(row.LocationCity) or facility_count != len(row.LocationCountry):
        indexis.add(index)
        inc_counter += 1

    # Placeholder text instead of a real facility name; any() counts the trial
    # once even when several of its facility entries carry the placeholder.
    if any(loc.startswith("For additional information regarding investigative sites for this trial,")
           for loc in row.LocationFacility):
        indexis.add(index)
        loc_counter += 1

    # Placeholder text instead of real eligibility criteria.
    if row.EligibilityCriteria[0].startswith("Please contact site for information"):
        indexis.add(index)
        con_counter += 1

# Rebind instead of mutating in place so the frame displayed earlier is not altered.
df_raw = df_raw.drop(index=list(indexis))
print(f"Location Facility/City/Country Inconsistency: {inc_counter}")
print(f"Location Facility Inconsistency: {loc_counter}")
print(f"Condition Inconsistency: {con_counter}")
print(f"Number of trials after removing inconsistencies: {len(df_raw)}")

Location Facility Inconsistency:  0
Condition Inconsistency:  18
Number of trials after removing inconsistencies: 982


## Retrieve country data

In [None]:
# Projection for the country collection: suppress '_id', include the fields below.
country_projection = {"_id": 0}
country_projection.update(dict.fromkeys([
    "urbanPopulation",
    "countryName",
    "population",
    "density",
    "sizeInKm2",
    "lifeExpectancy",
    "GDP",
    "migrantsNet",
    "worldshare",
    "unemploymentRate",
    "hospitalBed",
    "healthExpenditure",
    "fertilityRate",
    "medianAge",
], 1))

# No filter: every country document is loaded.
country_cursor = db.country.find(filter={}, projection=country_projection)
df_dbcountry = pd.DataFrame(list(country_cursor))

print(f"Number of countries: {len(df_dbcountry)}")
df_dbcountry.head(5)

Number of countries: 235


Unnamed: 0,countryName,population,lifeExpectancy,GDP,unemploymentRate,hospitalBed,healthExpenditure,density,fertilityRate,medianAge,migrantsNet,sizeInKm2,urbanPopulation,worldshare
0,Angola,32866272.0,60.0,105751000000.0,6.0,0.0,2.0,26.0,5.6,17.0,6413.0,1246700.0,67.0,0.42
1,Bhutan,771608.0,71.0,2446674000.0,2.0,1.0,3.0,20.0,2.0,28.0,320.0,38117.0,46.0,0.01
2,Colombia,50882891.0,77.0,331047000000.0,9.0,1.0,7.0,46.0,1.8,31.0,204796.0,1109500.0,80.0,0.65
3,Cayman Islands,65722.0,82.0,5141834000.0,,3.0,,274.0,,,,240.0,97.0,0.0
4,Spain,46754778.0,83.0,1419042000000.0,13.0,3.0,8.0,94.0,1.3,1.3,40000.0,498800.0,80.0,0.6


## Retrieve hospital data

In [None]:
# Load every document of the hospitals collection.
hospital_cursor = db.hospitals.find(filter={})
hospitaldata = pd.DataFrame(list(hospital_cursor))

# Keep only name and rank, renaming 'World Rank' to 'WorldRank'.
df_hospital = hospitaldata[['Name', 'World Rank']].rename(columns={'World Rank': 'WorldRank'})

print(f"Number of hospitals: {len(df_hospital)}")
df_hospital.head(5)

Number of hospitals: 29259


Unnamed: 0,Name,WorldRank
0,Cleveland Clinic,1
1,St Jude Children's Research Hospital,2
2,Johns Hopkins Medicine,3
3,Mayo Clinic Scottsdale AZ,4
4,University of Maryland Medical Center,5


## Retrieve regional data

In [None]:
# Load every document of the regional collection.
df_regional = pd.DataFrame(list(db2.regional.find(filter={})))

# Coerce the Youth / Working / Elderly columns to numeric dtype.
for numeric_col in ("Youth", "Working", "Elderly"):
    df_regional[numeric_col] = pd.to_numeric(df_regional[numeric_col])

# Keep a single row per area.
df_regional = df_regional.drop_duplicates(subset=["Area"])

# Discard the areas whose "Youth" value is NaN.
todrop = [i for i, youth in df_regional["Youth"].items() if math.isnan(youth)]
df_regional = df_regional.drop(todrop)

# is it possible to extract 'Area' locally where it is used?
countrylist = df_regional['Area']

print(f"Number of areas: {len(df_regional)}")
df_regional.head(5)

Number of areas: 686


Unnamed: 0,_id,Area,Density,Youth,Working,Elderly
0,5f2963b91268ea2905dbd1ae,San Antonio,115,531242.0,1675347.0,331263.0
1,5f2963b91268ea2905dbd1b3,Palma de Mallorca,340,104704.0,470334.0,110748.0
2,5f2963b91268ea2905dbd1b4,Montreal,372,732421.0,3021024.0,761042.0
3,5f2963b91268ea2905dbd1b5,Portsmouth,2763,92068.0,346692.0,103283.0
4,5f2963b91268ea2905dbd1ac,London,1785,98405.0,400749.0,102920.0


## Retrieve worldrank data

In [None]:
# Load the worldrank collection, keep the two relevant columns and index the
# result by facility name.
worldrank_cols = ["Facility", "WorldRank"]
df_worldrank = (
    pd.DataFrame(list(db2.worldrank.find(filter={})))
    [worldrank_cols]
    .set_index("Facility")
)

print(f"Number of facilities with worldrank: {len(df_worldrank)}")
df_worldrank.head(5)

Number of facilities with worldrank: 4046


Unnamed: 0_level_0,WorldRank
Facility,Unnamed: 1_level_1
massachusetts general hospital,7
university california san diego,16
medical college wisconsin,536
university kentucky,132
wake forest university,349


## Retrieve university data

In [None]:
# Load the universities collection and keep only name and rank.
university_records = list(db2.universities.find(filter={}))
df_uni = pd.DataFrame(university_records)[["Name", "WorldRank"]]

print(f"Number of universities: {len(df_uni)}")
df_uni.head(5)

Number of universities: 12014


Unnamed: 0,Name,WorldRank
0,University of California Berkeley,4
1,University of Washington,5
2,Harvard University,1
3,Columbia University New York,9
4,(2) Johns Hopkins University,8


In [None]:
# Create Facility List to search through: hospitals and universities stacked
# into one frame with a fresh 0..n-1 index.
frames = [df_hospital, df_uni]
facility = pd.concat(frames, ignore_index=True)

# Rows whose rank, rendered as text, is not made of digits only are discarded.
todrop = [i for i, rank in facility["WorldRank"].items() if not str(rank).isdigit()]

facility = facility.drop(todrop)
facility["WorldRank"] = pd.to_numeric(facility["WorldRank"])
facilitylist = facility["Name"]

---
# Custom Transformers

In [None]:
from functools import reduce
import json
import re
import string

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
# IterativeImputer is experimental: this enabling import must run before it is imported.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.feature_extraction import DictVectorizer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, RobustScaler

!pip install python-Levenshtein
import Levenshtein

Collecting python-Levenshtein
[?25l  Downloading https://files.pythonhosted.org/packages/42/a9/d1785c85ebf9b7dfacd08938dd028209c34a0ea3b1bcdb895208bd40a67d/python-Levenshtein-0.12.0.tar.gz (48kB)
[K     |██████▊                         | 10kB 23.0MB/s eta 0:00:01[K     |█████████████▌                  | 20kB 3.4MB/s eta 0:00:01[K     |████████████████████▏           | 30kB 4.6MB/s eta 0:00:01[K     |███████████████████████████     | 40kB 4.9MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 2.8MB/s 
Building wheels for collected packages: python-Levenshtein
  Building wheel for python-Levenshtein (setup.py) ... [?25l[?25hdone
  Created wheel for python-Levenshtein: filename=python_Levenshtein-0.12.0-cp36-cp36m-linux_x86_64.whl size=144801 sha256=d0bae39f44980404ac34b9722d99cfcb0d6096c9cb5f67ffc1b695e72c31468a
  Stored in directory: /root/.cache/pip/wheels/de/c2/93/660fd5f7559049268ad2dc6d81c4e39e9e36518766eaf7e342
Successfully built python-Levenshtein
Installin

## General Transformers

In [None]:
class FeatureSelector(TransformerMixin, BaseEstimator):
    """
    Keeps only the configured columns of a dataframe.

    Parameters
    --------
        cols
            column label(s) used to index the dataframe in ``transform``.

    Attributes
    --------

    Notes
    --------
    The selection is a plain ``X[cols]`` lookup; nothing is learned during fit.
    """

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        """ Stateless: returns the estimator itself without inspecting the data.

        Parameters
        ----------
        X : pandas dataframe, shape [n_samples, n_features]

        y : Ignored
        """
        return self

    def transform(self, X):
        """ Returns the configured columns of X.

        Parameters
        ----------
        X : pandas dataframe, shape [n_samples, n_features]

        Returns
        -------
        X_new : result of indexing X with ``self.cols``
        """
        return X[self.cols]

class FeatureExcluder(TransformerMixin, BaseEstimator):
    """
    Excludes the defined features.

    Parameters
    --------
        cols
            iterable of column names to be excluded. Names that are not present
            in the dataframe passed to ``transform`` are ignored.

    Attributes
    --------
        raw_features
            list view of ``cols``, kept for backward compatibility.

    Notes
    --------
    The constructor argument is stored under its own name (``cols``) because
    scikit-learn's ``get_params`` / ``clone`` / ``repr`` read every ``__init__``
    parameter back from an attribute of the same name.
    """

    def __init__(self, cols):
        self.cols = cols

    @property
    def raw_features(self):
        """List of the column names to be excluded (derived from ``cols``)."""
        return list(self.cols)

    @raw_features.setter
    def raw_features(self, value):
        # Writing through the legacy attribute updates the real parameter.
        self.cols = list(value)

    def fit(self, X, y=None):
        """ Do nothing and return the estimator unchanged.

        Parameters
        ----------
        X : pandas dataframe, shape [n_samples, n_features]

        y : Ignored
        """
        return self

    def transform(self, X):
        """ Excludes defined features.

        Parameters
        ----------
        X : pandas dataframe, shape [n_samples, n_features]

        Returns
        -------
        X_new : pandas dataframe without the configured columns
        """
        excluded = set(self.cols)
        # Only drop what is actually there, so absent names do not raise.
        present = [column for column in X.columns if column in excluded]
        return X.drop(present, axis=1)

class FeatureUnion(TransformerMixin, BaseEstimator):
    """
    Fits several transformers on the same input and joins their outputs.

    Parameters
    --------
        transformer_list
            list of (name, transformer) tuples

    Attributes
    --------

    Notes
    --------
    The individual outputs are combined pairwise with ``pd.merge`` on the index.
    """

    def __init__(self, transformer_list):
        self.transformer_list = transformer_list

    @staticmethod
    def _join_on_index(left, right):
        # Index-to-index merge of two transformer outputs.
        return pd.merge(left, right, left_index=True, right_index=True)

    def fit(self, X, y=None):
        """ Fits every transformer of the list on X (and y).

        Parameters
        ----------
        X : pandas dataframe, shape [n_samples, n_features]
        y : passed through to each transformer's fit
        """
        for _name, step in self.transformer_list:
            step.fit(X, y)
        return self

    def transform(self, X):
        """ Transforms X with every transformer and merges the resulting dataframes.

        Parameters
        ----------
        X : pandas dataframe, shape [n_samples, n_features]

        Returns
        -------
        X_new : merged pandas dataframe
        """
        outputs = []
        for _name, step in self.transformer_list:
            outputs.append(step.transform(X))
        return reduce(self._join_on_index, outputs)

class Debug(TransformerMixin, BaseEstimator):
    """
    Can be used as an intermediate step to check the status of the dataframe in the pipeline.

    Parameters
    --------
        debug
            String describing the debug step in pipeline.

    Attributes
    --------
        name
            alias of ``debug``, kept for backward compatibility.

    Notes
    --------
    - this transformer is used by developers to understand what is happening inside the pipeline
    - ``transform`` prints to stdout and writes ``dataframe_<debug>_step.csv`` to the
      current working directory
    - the constructor argument is stored under its own name (``debug``) because
      scikit-learn's ``get_params`` / ``clone`` / ``repr`` read every ``__init__``
      parameter back from an attribute of the same name
    """

    def __init__(self, debug):
        self.debug = debug

    @property
    def name(self):
        """Label of this debug step (same value as ``debug``)."""
        return self.debug

    @name.setter
    def name(self, value):
        # Writing through the legacy attribute updates the real parameter.
        self.debug = value

    def fit(self, X, y=None):
        """ Do nothing and return the estimator unchanged. """
        return self

    def transform(self, X):
        """ Reports the current dataframe and passes a copy of it on.

        Parameters
        ----------
        X : pandas dataframe, shape [n_samples, n_features]

        Returns
        -------
        X_new : copy of X
        """
        print(f"{self.name} - Transform executed")
        # Snapshot of the frame at this point of the pipeline.
        X.to_csv(f"dataframe_{self.name}_step.csv", sep=";")
        print(list(X.columns))
        return X.copy()

## Transformers for single categorical values

In [None]:
class MissingStringsTransformer(TransformerMixin, BaseEstimator):
    """
    Replaces missing values using SimpleImputer and returns a pandas dataframe.

    Parameters
    --------
        strategy : string, default = 'most_frequent'
            String value defining the strategy of the simple imputer.
            One can decide between:
            - constant -> filling in a constant value
            - most_frequent -> calculate the most frequent value and inserting that value

        fill_value : string, default = ''
            Value defining the constant value that will be inserted when the Simple imputation strategy "constant" is selected

        n_neighbors : integer, default = 2
            Accepted and stored, but not used by this transformer.

        weights : string, default = "uniform"
            Accepted and stored, but not used by this transformer.

    Attributes
    --------
        imp
          Placeholder to contain the created Imputer instance of the fit method.

    Notes
    --------
    https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html

    Values equal to ``None`` are the ones treated as missing.
    """

    def __init__(self,
                 strategy = 'most_frequent',
                 fill_value= "",
                 n_neighbors = 2,
                 weights = "uniform"):
        self.strategy = strategy
        self.fill_value = fill_value    # in case of strategy 'constant'
        # Every __init__ parameter must exist as an attribute of the same name,
        # otherwise get_params() / clone() / repr() of the estimator fail.
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.imp = None

    def fit(self, X, y=None):
        """ Creates the imputer for the configured strategy and fits it on X.

        Raises
        ------
        Exception
            if ``strategy`` is neither 'most_frequent' nor 'constant'.
        """
        if self.strategy == 'constant':
            self.imp = SimpleImputer(strategy=self.strategy, fill_value=self.fill_value, missing_values=None)
        elif self.strategy == 'most_frequent':
            self.imp = SimpleImputer(strategy=self.strategy, fill_value='Missing', missing_values=None)
        else:
            raise Exception("Please define one strategy - 'most_frequent' or 'constant'")
        self.imp.fit(X)
        return self

    def transform(self, X):
        """ Imputes X and returns the result with X's index and column names. """
        X_imp = self.imp.transform(X)
        X_new = pd.DataFrame(X_imp, index=X.index, columns=X.columns)
        return X_new

class SingleOneHotEncoder(TransformerMixin, BaseEstimator):
    """
    Applies One Hot Encoding on features with only single values (no lists).

    Parameters
    --------

    Attributes
    --------
        dv
          Placeholder to contain the created DictVectorizer instance of the fit method.

        nan_cols
          Output columns to be excluded: the vectorizer columns whose name contains
          no '='. Recomputed on every fit so that it always matches the fitted
          vectorizer.

    Notes
    --------
    https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html

    """

    def __init__(self):
        self.dv = None
        self.nan_cols = None

    def _feature_names(self):
        """Column names of the fitted vectorizer, across scikit-learn versions."""
        # get_feature_names() was replaced by get_feature_names_out() in newer releases.
        if hasattr(self.dv, "get_feature_names_out"):
            return list(self.dv.get_feature_names_out())
        return list(self.dv.get_feature_names())

    def fit(self, X, y=None):
        """ Fits a dense DictVectorizer on the rows of X.

        Parameters
        ----------
        X : pandas dataframe, shape [n_samples, n_features]

        y : Ignored
        """
        X_dict = X.to_dict('records')
        self.dv = DictVectorizer(sparse=False)
        self.dv.fit(X_dict)
        # Derived here (not lazily in transform) so that a refit on different
        # data never reuses the columns of a previous fit.
        self.nan_cols = [c for c in self._feature_names() if '=' not in c]
        return self

    def transform(self, X):
        """ One-hot encodes X with the fitted vectorizer.

        Parameters
        ----------
        X : pandas dataframe, shape [n_samples, n_features]

        Returns
        -------
        X_new : pandas dataframe of the encoded columns, indexed like X, without ``nan_cols``
        """
        X_dict = X.to_dict('records')
        Xt = self.dv.transform(X_dict)
        cols = self._feature_names()
        X_new = pd.DataFrame(Xt, index=X.index, columns=cols)

        # Instances fitted before nan_cols moved into fit() may still hold None.
        if self.nan_cols is None:
            self.nan_cols = [c for c in cols if '=' not in c]

        return X_new.drop(self.nan_cols, axis=1)

## Transformers for list of categorical values

In [None]:
class MultipleOneHotEncoder(TransformerMixin, BaseEstimator):
    """ Applies One Hot Encoding on features with lists as values.

    Parameters
    --------

    Attributes
    --------
        features
          Dictionary mapping each fitted feature name to the list of distinct values
          seen for it during fit. The list is ordered by the values' string form so
          that the generated columns come out in the same order on every run.

    Notes
    --------
    For every fitted (feature, value) pair, ``transform`` adds an integer column
    named ``"<feature>=<value>"`` holding 1 where the row's list contains the value
    and 0 otherwise.

    """

    def __init__(self):
        self.features = {}

    def fit(self, X, y=None):
        """ Collects the distinct list elements of every column of X.

        Parameters
        ----------
        X : pandas dataframe whose cells are lists (or other iterables) of hashable values

        y : Ignored
        """
        # Build a fresh mapping so a refit does not keep features of an earlier fit.
        features = {}
        for feature in X.columns:
            distinct_values = set()
            for cell in X[feature]:
                distinct_values.update(cell)
            features[feature] = sorted(distinct_values, key=str)
        self.features = features
        return self

    def transform(self, X):
        """ Adds one indicator column per fitted (feature, value) pair.

        Parameters
        ----------
        X : pandas dataframe containing the fitted feature columns

        Returns
        -------
        X_new : copy of X extended by the indicator columns
        """
        X_new = X.copy()
        for feature, values in self.features.items():
            cells = list(X[feature])
            for value in values:
                # Each column is built once for all rows. Resetting the column
                # inside a per-row loop would wipe the 1s of all earlier rows.
                X_new[f"{feature}={value}"] = pd.Series(
                    [1 if value in cell else 0 for cell in cells],
                    index=X.index,
                    dtype="int64",
                )
        return X_new

class MultipleTopOneHotEncoder(TransformerMixin, BaseEstimator):
    """ One-hot encodes list-valued features, restricted to their most common values.

    Parameters
    --------
        strategie : string, default = 'top'
            Selection strategy. One can decide between:
            - top : keeps the `top` values that occur in the most rows
            - min_value : keeps every value that occurs in more than `min_value` rows

        top : integer, default = 20
            Number of values to keep for strategy 'top'

        min_value : integer, default = None
            Occurrence threshold for strategy 'min_value'

    Attributes
    --------
        mlb
          Placeholder to contain the created MultipleOneHotEncoder instance of the fit method.

        values
          Set by fit: for each feature, a dictionary of the kept values and the
          number of rows they occur in.

    Notes
    --------
    A value is counted at most once per row. The encoded columns are produced from
    helper columns named "<feature>_top", which are not part of the output.

    """

    def __init__(self, strategie="top", top=20, min_value=None):
        self.mlb = None
        self.strategie = strategie
        self.top = top
        self.min_value = min_value

    def _with_top_columns(self, X):
        """Returns a copy of X extended by the "<feature>_top" columns, and their names."""
        X_tmp = X.copy()
        new_columns = []
        for feature in self.values.keys():
            top_name = f"{feature}_top"
            kept = list(self.values[feature].keys())
            new_columns.append(top_name)
            # Per row: the distinct elements that belong to the kept values.
            X_tmp[top_name] = [[x for x in set(value) if x in kept] for value in X[feature]]
        return X_tmp, new_columns

    def fit(self, X, y=None):
        """ Determines the kept values per feature and fits the inner encoder on them.

        Raises
        ------
        Exception
            if the strategy / threshold combination is not usable.
        """
        values = {}
        for feature in list(X.columns):
            # Number of rows each value appears in.
            stats = {}
            for value_list in X[feature]:
                for value in set(value_list):
                    stats[value] = stats.get(value, 0) + 1
            ranked = sorted(stats.items(), reverse=True, key=lambda item: item[1])
            if self.strategie == "min_value" and self.min_value is not None:
                values[feature] = {k: v for k, v in ranked if v > self.min_value}
            elif self.strategie == "top" and self.top is not None:
                values[feature] = dict(ranked[:self.top])
            else:
                raise Exception("Please define one strategy - 'top' or 'min_value'")
        self.values = values

        X_tmp, new_columns = self._with_top_columns(X)
        self.mlb = MultipleOneHotEncoder()
        self.mlb.fit(X_tmp[new_columns])
        return self

    def transform(self, X):
        """ Returns X merged (on the index) with the encoded columns of the kept values. """
        X_tmp, new_columns = self._with_top_columns(X)
        encoded = self.mlb.transform(X_tmp[new_columns])
        encoded = encoded.drop(new_columns, axis=1)
        return X.merge(encoded, left_index=True, right_index=True)

## Transformers for numerical values

In [None]:
class MissingValuesTransformer(TransformerMixin, BaseEstimator):
    """
    Imputes missing numerical values with a SimpleImputer, KNNImputer or
    IterativeImputer and returns the result as a pandas dataframe.

    Parameters
    --------

        imputer : string, default = 'SimpleImputer'
            Name of the imputation method to apply.
            One can decide between:
            - SimpleImputer = impute values in the i-th feature dimension using only non-missing values
                              in that feature dimension
            - KNNImputer = missing values are imputed using the mean value from n_neighbors nearest neighbors
            - IterativeImputer = multivariate imputation algorithm using entire set of available feature
                                 dimensions to estimate the missing values

        strategy : string, default = 'mean'
            Strategy of the simple imputer; only used for 'SimpleImputer'.
            One can decide between:
            - mean -> calculate the mean value and insert that value
            - median -> calculate the median value and insert that value

        fill_value : string, default = ''
            Stored on the instance; not read by fit or transform.

        n_neighbors : integer, default = 2
            Number of neighbors handed to the KNNImputer.

        weights : string, default = "uniform"
            Weighting of the neighbors handed to the KNNImputer.
            One can choose between:
            - uniform -> all neighbors are equally important
            - distance -> closer neighbors are weighted higher than distant ones

        max_iter : int, default = 10
            Maximum number of imputation rounds handed to the IterativeImputer.

        initial_strategy : str, default = 'mean'
            Strategy used to initialize the missing values in the IterativeImputer.
            One can decide between:
            - mean -> calculate the mean value and insert that value
            - median -> calculate the median value and insert that value

    Attributes
    --------
        imp
          Placeholder to contain the created Imputer instance of the fit method.

    Notes
    --------

    https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
    https://levelup.gitconnected.com/scikit-learn-python-6-useful-tricks-for-data-scientists-1a0a502a6aa3

    The output columns carry the input column names with the suffix "_new".
    """

    def __init__(self,
                 imputer = 'SimpleImputer',
                 strategy = 'mean',
                 fill_value= "",
                 n_neighbors = 2,
                 weights = "uniform",
                 max_iter = 10,
                 initial_strategy = 'mean'):
        self.imputer = imputer
        self.strategy = strategy
        self.fill_value = fill_value    # in case of strategy 'constant'
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.max_iter = max_iter
        self.initial_strategy = initial_strategy
        self.imp = None

    def _make_simple_imputer(self):
        """Validates `strategy` and builds the SimpleImputer."""
        if self.strategy == 'mean' or self.strategy == 'median':
            return SimpleImputer(strategy=self.strategy)
        raise Exception("Please define one strategy - 'mean' or 'median'")

    def _make_knn_imputer(self):
        """Validates `weights` and `n_neighbors` and builds the KNNImputer."""
        if self.weights not in ['uniform', 'distance']:
            raise Exception("Please provide a valid weighting: 'uniform' or 'distance'")
        if not isinstance(self.n_neighbors, (int, float, complex)):
            raise Exception("Please provide a valid number for n_neighbors")
        return KNNImputer(n_neighbors=self.n_neighbors, weights=self.weights)

    def _make_iterative_imputer(self):
        """Validates `initial_strategy` and `max_iter` and builds the IterativeImputer."""
        if self.initial_strategy not in ['mean', 'median']:
            raise Exception("Please define one strategy - 'mean' or 'median'")
        if not isinstance(self.max_iter, (int, float, complex)):
            raise Exception("Please provide a valid number for max_iter")
        return IterativeImputer(max_iter=self.max_iter, random_state=0, initial_strategy=self.initial_strategy)

    def fit(self, X, y=None):
        """ Builds the configured imputer and fits it on X.

        Raises
        ------
        Exception
            if the imputer name or one of its settings is not valid.
        """
        if self.imputer == 'SimpleImputer':
            self.imp = self._make_simple_imputer()
        elif self.imputer == 'KNNImputer':
            self.imp = self._make_knn_imputer()
        elif self.imputer == 'IterativeImputer':
            self.imp = self._make_iterative_imputer()
        else:
            raise Exception("Please define one imputer - 'SimpleImputer', 'KNNImputer' or 'IterativeImputer'")
        self.imp.fit(X)
        return self

    def transform(self, X):
        """ Imputes X; the result keeps X's index and renames each column to "<col>_new". """
        imputed = self.imp.transform(X)
        renamed = [f"{col}_new" for col in X.columns]
        return pd.DataFrame(imputed, index=X.index, columns=renamed)

## Transformers for age values

In [None]:
class ToYearTransformer(TransformerMixin, BaseEstimator):
    """Convert age strings such as ``'13 Years'`` or ``'6 Months'`` into years.

    Every cell of every input column is turned into a string; if it contains
    one of the known unit names, the unit is stripped and the remaining number
    is multiplied by the unit's factor. The converted value is written back as
    a string. In addition the float columns ``MaximumAgeValue`` and
    ``MinimumAgeValue`` are created from ``MaximumAge`` and ``MinimumAge``.

    Parameters
    --------

        conversion_factors : dict or None, default = None
            Maps a unit name (e.g. 'Months') to the number of years one such
            unit represents. ``None`` selects the built-in table for
            Year(s), Month(s), Week(s), Day(s) and Hour(s).
    """

    # Years represented by one unit. Kept at class level so that the
    # constructor no longer needs a mutable default argument.
    _DEFAULT_CONVERSION_FACTORS = {
        'Years': 1,
        'Year': 1,
        'Months': 1/12,
        'Month': 1/12,
        'Weeks': 1/52,
        'Week': 1/52,
        'Days': 1/365,
        'Day': 1/365,
        'Hours': 1/8760,
        'Hour': 1/8760,
    }

    def __init__(self, conversion_factors=None):
        # Stored unchanged under the argument's own name: scikit-learn's
        # get_params / clone / repr look the parameter up by that name.
        self.conversion_factors = conversion_factors

    @property
    def _conversion_factors(self):
        """Effective unit -> factor table (the built-in one if none was given).

        Read-only alias kept for code that used the former private attribute.
        """
        if self.conversion_factors is None:
            return dict(self._DEFAULT_CONVERSION_FACTORS)
        return self.conversion_factors

    def fit(self, X, y=None):
        """Nothing is learned; returns self."""
        return self

    def transform(self, X):
        """Return a copy of X with ages expressed in years.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the columns 'MaximumAge' and 'MinimumAge'.

        Returns
        -------
        pandas.DataFrame
            Copy of X whose original columns hold the converted values as
            strings (missing values become 'nan'), plus the float columns
            'MaximumAgeValue' and 'MinimumAgeValue'.

        Raises
        ------
        KeyError
            If 'MaximumAge' or 'MinimumAge' is missing.
        ValueError
            If a cell cannot be parsed as a number once its unit is removed.
        """
        factors = self._conversion_factors
        # Longest names first, so 'Years' is matched before its prefix 'Year'
        # regardless of the order in which the table was written.
        units = sorted(factors, key=len, reverse=True)

        def to_years(value):
            # None is treated like NaN so that the float cast below succeeds.
            text = 'nan' if value is None else str(value)
            for unit in units:
                if unit in text:
                    number = text.replace(unit, '').strip()
                    text = str(float(number) * factors[unit])
            return text

        X_new = X.copy()
        for feature in X.columns:
            X_new[feature] = X_new[feature].map(to_years)

        X_new["MaximumAgeValue"] = X_new['MaximumAge'].astype(float)
        X_new["MinimumAgeValue"] = X_new['MinimumAge'].astype(float)
        return X_new

## Transformer for StartMonth

In [None]:
class StartMonthTransformer(TransformerMixin, BaseEstimator):
    """
    Class that extracts the month from the start date and creates a new column "StartMonth" in the data set containing
    the extracted month as float value.

    The month is taken from the first run of letters in "StartDate" (e.g. 'May' in 'May 1997') and looked up by its
    English name. Dates whose month cannot be recognised, and missing dates, yield NaN.
    """

    # English month name -> month number (exact, case-sensitive match).
    _MONTH_NUMBERS = {
        'January': 1,
        'February': 2,
        'March': 3,
        'April': 4,
        'May': 5,
        'June': 6,
        'July': 7,
        'August': 8,
        'September': 9,
        'October': 10,
        'November': 11,
        'December': 12,
    }

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; returns self."""
        return self

    def transform(self, X):
        """Return a copy of X with an additional float column 'StartMonth'.

        X must contain a string column 'StartDate'; that column is left untouched.
        """
        # Copy the passed data frame
        X_new = X.copy()
        # Pull out the month name with an explicit regular expression. The previous chain of str.replace calls
        # depended on patterns being treated as regular expressions by default, which newer pandas no longer does.
        month_names = X_new['StartDate'].str.extract(r'([A-Za-z]+)', expand=False)
        # Unknown names map to NaN; the cast keeps the column float as before.
        X_new['StartMonth'] = month_names.map(self._MONTH_NUMBERS).astype(float)
        return X_new

## Transformer for StartYear 

In [None]:
class StartYearTransformer(TransformerMixin, BaseEstimator):
    """
    Class that extracts the year from the start date and creates a new column "StartYear" in the data set containing
    the extracted startyear. The "StartDate" column is removed from the result.

    """

    # Alternation of the English month names that are stripped from "StartDate".
    _MONTH_PATTERN = 'January|February|March|April|May|June|July|August|September|October|November|December'

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; returns self."""
        return self

    def transform(self, X):
        """Return a copy of X where 'StartDate' is replaced by the float column 'StartYear'.

        Raises ValueError if what is left of a date after removing the month name is not a number.
        """
        # Copy the passed data frame
        X_new = X.copy()

        # Remove the month from the StartDate feature. regex=True is spelled out because the pattern is an
        # alternation; without it newer pandas treats the pattern as a literal string and removes nothing.
        without_month = X_new['StartDate'].str.replace(self._MONTH_PATTERN, '', regex=True)

        # Save the year as float value
        X_new['StartYear'] = without_month.astype(float)
        return X_new.drop(columns=['StartDate'])

## Transformer to remove outliers in Start and End Date

In [None]:
# Custom transformer that keeps only the records whose StartYear lies within a configurable range of years
class TimeOutlierRemover( BaseEstimator, TransformerMixin ):
    """
    Class that removes records from the data set outside a predefined time range.

    Parameters
    --------

        startYear : integer, default = 1995
            Integer defining the first year the data set should contain study records of.

        endYear : integer, default = 2018
            Integer defining the last year the data set should contain study records of.

    """

    def __init__(self, startYear = 1995, endYear = 2018):
        # Stored under the arguments' own names: scikit-learn's get_params / clone / repr
        # look constructor parameters up by exactly these names.
        self.startYear = startYear
        self.endYear = endYear

    @property
    def _startYear(self):
        """Read-only alias of ``startYear`` kept for code that used the former private name."""
        return self.startYear

    @property
    def _endYear(self):
        """Read-only alias of ``endYear`` kept for code that used the former private name."""
        return self.endYear

    def fit( self, X, y = None ):
        """Nothing is learned; returns self."""
        return self

    def transform(self, X , y = None ):
        """Return the rows of X whose 'StartYear' lies in [startYear, endYear].

        Both bounds are inclusive, rows with a missing 'StartYear' are dropped,
        and the index of the result is reset to 0..n-1. X itself is not modified.
        """
        # Exclude all data that is not in the range of start and end year
        in_range = X['StartYear'].between(self.startYear, self.endYear)
        # Reset the index of the remaining data set
        return X[in_range].reset_index(drop=True)

## Transformer to remove outliers in Enrollment Count

In [None]:
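# Custom transformer that removes rows whose enrollment count is an outlier for their trial phase
# !!! Must be called after one-hot encoding (OHE): it reads the 'Phase=Phase 1', 'Phase=Phase 2' and 'Phase=Phase 3' columns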
class EnrollmentOutlierRemover( BaseEstimator, TransformerMixin ):
    """
    Class for performing outlier removal in the feature "EnrollmentCount_new" based on basic statistics of the MAD or IQR.
    To take into account the difference in the enrollment count of studies in different phases, the MAD and IQR is
    calculated for each possible phase individually.

    The input must contain the columns 'EnrollmentCount_new', 'Phase=Phase 1', 'Phase=Phase 2' and 'Phase=Phase 3'.
    Rows that belong to none of the five phase groups are never removed.

    Parameters
    --------

        strategy : string, default = "IQR"
            String value defining the strategy of how the range of "normal" values is determined.
            One can choose between two strategies:
                - "IQR" = Interquartile range
                    Here, after the median value of each phase was calculated, the interquartile range between the 25% and
                    the 75% quartile is calculated. The "normal" range of values is determined by taking the
                    median +/- 1.5 * IQR.
                - "MAD" = Median Absolute Deviation
                    Here, after the median value of each phase was calculated, the absolute deviation of all values to the
                    median is calculated and the median of those values forms the MAD. The "normal" range of values is
                    determined by taking the median +/- 2 * MAD.


    Attributes
    --------

    _range : dict
        Dictionary storing the precalculated "normal" value ranges ([lower, upper]) for each of the phases.

    _phases : list
        List of strings defining the keys for the different phases in order to create a unique key in the dictionary.

    _phase_series : dict
        Dictionary storing the filtered series of EnrollmentCount values for the different phases.

        """

    def __init__(self, strategy = "IQR"):
        # Stored under the argument's own name so that scikit-learn's
        # get_params / clone / repr can find the parameter.
        self.strategy = strategy
        self._range = {}
        self._phases = ['phase1', 'phase1_2', 'phase2', 'phase2_3', 'phase3']
        self._phase_series = {}

    @property
    def _strategy(self):
        """Read-only alias of ``strategy`` kept for code that used the former private name."""
        return self.strategy

    @staticmethod
    def _phase_masks(X):
        """Boolean row mask per phase group, keyed like ``_phases``; shared by fit and transform."""
        p1 = X['Phase=Phase 1']
        p2 = X['Phase=Phase 2']
        p3 = X['Phase=Phase 3']
        return {
            'phase1': (p1 == 1) & (p2 == 0),
            'phase1_2': (p1 == 1) & (p2 == 1),
            'phase2': (p1 == 0) & (p2 == 1) & (p3 == 0),
            'phase2_3': (p2 == 1) & (p3 == 1),
            'phase3': (p2 == 0) & (p3 == 1),
        }

    def fit(self, X, y = None ):
        """Compute the [lower, upper] range of "normal" enrollment counts for every phase group.

        Raises
        ------
        ValueError
            If ``strategy`` is neither 'IQR' nor 'MAD'.
        KeyError
            If a required column is missing from X.
        """
        if self.strategy not in ("IQR", "MAD"):
            raise ValueError("""Strategy must be either 'IQR' or 'MAD' """)

        enrollment = X['EnrollmentCount_new']
        # Start from empty dicts so that a re-fit never keeps stale entries.
        self._phase_series = {}
        self._range = {}
        for phase, mask in self._phase_masks(X).items():
            values = enrollment[mask]
            median = values.median()
            if self.strategy == "IQR":
                spread = 1.5 * (values.quantile(0.75) - values.quantile(0.25))
            else:
                # Median absolute deviation, computed explicitly as documented
                # above. Series.mad() returned the *mean* absolute deviation
                # and is no longer available in current pandas.
                spread = 2 * (values - median).abs().median()
            self._phase_series[phase] = values
            self._range[phase] = [median - spread, median + spread]

        return self

    def transform(self, X , y = None ):
        """Return a copy of X without the rows whose enrollment count is outside its phase's range.

        A row is removed if it is an outlier in at least one phase group it belongs to. The original index labels
        are kept. A phase group without data in ``fit`` has a NaN range and therefore removes nothing.
        """
        enrollment = X['EnrollmentCount_new']
        is_outlier = pd.Series(False, index=X.index)
        # One vectorised pass per phase replaces the former row-by-row drop,
        # which could fail when a row matched two phase groups at once.
        for phase, mask in self._phase_masks(X).items():
            low, high = self._range[phase]
            is_outlier |= mask & ((enrollment < low) | (enrollment > high))
        return X[~is_outlier].copy()

In [None]:
# Scratch check: one-hot encode 'Phase' on its own, then attach the resulting
# indicator columns to the raw data in place of the original 'Phase' column.
df_test = df_raw[['Phase']]
instance = MultipleOneHotEncoder()
new = instance.fit_transform(df_test)
display(new)

# Keep only the indicator columns before joining them back.
new = new.drop(columns=['Phase'])
df_test = df_raw.join(new).drop(columns=['Phase'])

NameError: ignored

In [None]:
display(df_test)
display(df_test.dtypes)

# EnrollmentOutlierRemover reads the column 'EnrollmentCount_new', while df_test
# still carries the raw 'EnrollmentCount' (see the dtypes above). Rename it for
# this check; otherwise fit fails with a KeyError.
instance = EnrollmentOutlierRemover(strategy = 'IQR')
new = instance.fit_transform(df_test.rename(columns={'EnrollmentCount': 'EnrollmentCount_new'}))
display(new)

Unnamed: 0,NCTId,OrgFullName,OrgClass,StartDate,Condition,ConditionAncestorTerm,ConditionBrowseBranchAbbrev,ConditionMeshId,LeadSponsorName,LeadSponsorClass,CollaboratorName,CollaboratorClass,EligibilityCriteria,EnrollmentCount,EnrollmentType,HealthyVolunteers,Gender,StdAge,MinimumAge,MaximumAge,LocationFacility,LocationCity,LocationState,LocationZip,LocationCountry,InterventionType,InterventionName,IsFDARegulatedDevice,IsFDARegulatedDrug,EventsTimeFrame,FlowDropWithdrawType,FlowGroupDescription,FlowGroupTitle,FlowMilestoneType,FlowPeriodTitle,FlowRecruitmentDetails,ArmGroupDescription,ArmGroupInterventionName,ArmGroupLabel,ArmGroupType,BaselineCategoryTitle,BaselineClassTitle,BaselineDenomCountGroupId,BaselineGroupDescription,BaselineGroupTitle,BaselineMeasureDispersionType,BaselineMeasureTitle,BaselineMeasureUnitOfMeasure,DesignAllocation,DesignInterventionModel,DesignPrimaryPurpose,Keyword,EnrollmentDuration,LocationPopulationDensity,Phase=Phase 3,Phase=Phase 2,Phase=Phase 1
0,NCT00000143,Johns Hopkins Bloomberg School of Public Health,OTHER,May 1997,"[Cytomegalovirus Retinitis, HIV Infections]","[Virus Diseases, Retinal Diseases, Eye Disease...","[BC01, All, BC02, BC20, BC11, Rare]","[D000017726, D000012173]",[Johns Hopkins Bloomberg School of Public Health],[OTHER],[],[],[Inclusion criteria:\n\nAge 13 years or older\...,61,Actual,No,All,"[Child, Adult, Older Adult]",13 Years,,"[Department of Ophthalmology, University of Ca...","[Irvine, La Jolla, Los Angeles, Los Angeles, S...","[California, California, California, Californi...","[92697-4375, 92093-0946, 90033, 90095-7003, 94...","[United States, United States, United States, ...","[Device, Drug]","[Ganciclovir implant and oral ganciclovir, Cid...",,,3 years,[],[Ganciclovir device and oral dose of Ganciclov...,"[Ganciclovir Implant and Oral Ganciclovir, Cid...","[STARTED, COMPLETED, NOT COMPLETED]",[Overall Study],June 1997,[Ganciclovir device and oral dose of Ganciclov...,[Device: Ganciclovir implant and oral ganciclo...,"[Ganciclovir implant and oral ganciclovir, Cid...","[Experimental, Experimental]","[<=18 years, Between 18 and 65 years, >=65 yea...",[United States],"[BG000, BG001, BG002]",[Ganciclovir device and oral dose of Ganciclov...,"[Ganciclovir Implant and Oral Ganciclovir, Cid...",[],"[Age, Categorical, Sex: Female, Male, Region o...","[Participants, Participants, participants]",Randomized,Parallel Assignment,Treatment,[],37,"[2114.408936, 1189.4198, 2287.767578, 3559.750...",1.0,0.0,0.0
1,NCT00000170,Jaeb Center for Health Research,OTHER,April 1999,[Amblyopia],"[Brain Diseases, Central Nervous System Diseas...","[BC10, BC11, BC23, All]",[D000000550],[Jaeb Center for Health Research],[OTHER],[National Eye Institute (NEI)],[NIH],[Inclusion Criteria:\n\nPatients must be 7 yea...,419,Actual,No,All,[Child],,6 Years,[Wilmer Eye Institute],[Baltimore],[Maryland],[21287-9028],[United States],"[Drug, Device]","[Atropine, Eye Patch]",,,,[],[],[],[],[],,[Atropine],"[Device: Eye Patch, Drug: Atropine]","[Patching, Atropine]","[Active Comparator, Active Comparator]",[],[],[],[],[],[],[],[],Randomized,Parallel Assignment,Treatment,"[Amblyopia, patching, atropine]",172,[1717.821167],1.0,0.0,0.0
2,NCT00000177,National Institute on Aging (NIA),NIH,October 1995,[Alzheimer Disease],"[Dementia, Brain Diseases, Central Nervous Sys...","[BC10, BXM, All, Rare]",[D000000544],[National Institute on Aging (NIA)],[NIH],[],[],[Inclusion Criteria:\n\nWomen with a diagnosis...,120,,No,Female,"[Adult, Older Adult]",60 Years,,"[University of Alabama, Birmingham, University...","[Birmingham, San Diego, Jacksonville, Tampa, A...","[Alabama, California, Florida, Florida, Georgi...","[35294-0017, 92093, 32225, 30329, 60612, 62702...","[United States, United States, United States, ...",[Drug],[Estrogen],,,,[],[],[],[],[],,[],[],[],[],[],[],[],[],[],[],[],[],Randomized,,Treatment,"[Alzheimer's disease, Estrogen]",39,"[230.015213, 1189.4198, 567.188477, 494.567383...",1.0,0.0,0.0
15,NCT00000271,New York State Psychiatric Institute,OTHER,January 1995,"[Cocaine-Related Disorders, Substance-Related ...","[Pathologic Processes, Chemically-Induced Diso...","[BXM, All, BC25]","[D000004194, D000019966, D000019970]",[New York State Psychiatric Institute],[OTHER],[National Institute on Drug Abuse (NIDA)],[NIH],[Inclusion:\n\nMeets DSM-IV criteria for curre...,111,Actual,No,All,[Adult],18 Years,60 Years,[Research Foundation for Mental Hygiene],[New York],[New York],[10032],[United States],"[Drug, Drug]","[Desipramine, Placebo]",No,Yes,,[],[],[],[],[],,"[Participants were treated with desipramine, u...","[Drug: Desipramine, Drug: Placebo]","[Desipramine, Placebo]","[Experimental, Placebo Comparator]",[],[],[],[],[],[],[],[],Randomized,Parallel Assignment,Treatment,"[Desipramine, cocaine, depression]",93,[241.777069],0.0,1.0,0.0
16,NCT00000273,New York State Psychiatric Institute,OTHER,August 1995,"[Heroin Dependence, Opioid-Related Disorders, ...","[Pathologic Processes, Chemically-Induced Diso...","[BC25, BXM, All]","[D000004194, D000019966, D000009293, D000006556]",[New York State Psychiatric Institute],[OTHER],[National Institute on Drug Abuse (NIDA)],[NIH],[Inclusion Criterion\n\nDSM IV criteria for op...,8,Actual,No,All,[Adult],18 Years,45 Years,"[Columbia University, New York State Psychiatr...","[New York, New York]","[New York, New York]","[10032, 10032]","[United States, United States]",[Drug],[opiates],No,Yes,,[],[],[],[],[],,[Opiate-dependent individuals who were current...,[Drug: opiates],[Opiates],[Experimental],[],[],[],[],[],[],[],[],,Single Group Assignment,Treatment,"[heroin, opioid disorders, substance related d...",123,"[7901.219238, 7901.219238]",0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,NCT00023777,Southwest Oncology Group,NETWORK,August 2001,[Leukemia],"[Neoplasms by Histologic Type, Neoplasms]","[BC04, All, BC15, Rare]","[D000007938, D000007951, D000015470]",[Southwest Oncology Group],[NETWORK],[National Cancer Institute (NCI)],[NIH],[DISEASE CHARACTERISTICS:\n\nHistologically co...,71,Actual,No,All,"[Adult, Older Adult]",56 Years,,"[MBCCOP - Gulf Coast, CCOP - Greater Phoenix, ...","[Mobile, Phoenix, Phoenix, Tucson, Tucson, Lit...","[Alabama, Arizona, Arizona, Arizona, Arizona, ...","[36688, 85006-2726, 85012, 85723, 85724, 72205...","[United States, United States, United States, ...","[Biological, Biological, Drug, Drug]","[filgrastim, sargramostim, cytarabine, daunoru...",,,,[],[],[],[],[],,[],[],[],[],[],[],[],[],[],[],[],[],Non-Randomized,Single Group Assignment,Treatment,"[untreated adult acute myeloid leukemia, adult...",80,"[106.749603, 415.544464, 707.451721, 554.12750...",0.0,1.0,0.0
996,NCT00023829,Radiation Therapy Oncology Group,NETWORK,August 2001,[Prostate Cancer],"[Genital Neoplasms, Male, Urogenital Neoplasms...","[BC04, BXS, All]",[D000011471],[Radiation Therapy Oncology Group],[NETWORK],"[National Cancer Institute (NCI), NCIC Clinica...","[NIH, NETWORK]",[DISEASE CHARACTERISTICS:\n\nHistologically co...,67,Actual,No,Male,"[Child, Adult, Older Adult]",,,[Toronto Sunnybrook Regional Cancer Centre],[Toronto],[Ontario],[M4N 3M5],[Canada],"[Drug, Drug, Drug, Procedure, Radiation]","[bicalutamide, flutamide, releasing hormone ag...",,,,[],[],[],[],[],,[Luteinizing hormone-releasing hormone (LH-RH)...,"[Drug: bicalutamide, Drug: flutamide, Drug: re...","[LH-RH agonist plus radiation therapy, Radiati...","[Experimental, Active Comparator, Active Compa...",[],[],[],[],[],[],[],[],Randomized,Parallel Assignment,Treatment,"[stage III prostate cancer, stage IIB prostate...",34,[5237.469727],1.0,0.0,0.0
997,NCT00023998,National Cancer Institute (NCI),NIH,July 2001,[Metastatic Osteosarcoma],"[Neoplasms, Bone Tissue, Neoplasms, Connective...","[BC04, All, BC17, Rare]",[D000012516],[National Cancer Institute (NCI)],[NIH],[],[],[Inclusion Criteria:\n\nHistologically confirm...,80,Actual,No,All,"[Child, Adult]",,30 Years,[Children's Oncology Group],[Arcadia],[California],[91006-3776],[United States],"[Drug, Drug, Drug, Drug, Biological, Procedure...","[doxorubicin hydrochloride, cisplatin, methotr...",,,,[],[],[],[],[],,[See detailed description.],"[Drug: doxorubicin hydrochloride, Drug: cispla...",[Treatment (combination chemotherapy)],[Experimental],[],[],[],[],[],[],[],[],,Single Group Assignment,Treatment,[],70,[1168.816162],0.0,1.0,0.0
998,NCT00024102,Alliance for Clinical Trials in Oncology,OTHER,September 2001,[Breast Cancer],"[Neoplasms by Site, Neoplasms, Breast Diseases...","[BC04, BC17, All]",[D000001943],[Alliance for Clinical Trials in Oncology],[OTHER],"[National Cancer Institute (NCI), NCIC Clinica...","[NIH, NETWORK]","[Patients with operable, histologically confir...",633,Actual,No,Female,[Older Adult],65 Years,,[University of Alabama at Birmingham Comprehen...,"[Birmingham, Mobile, Scottsdale, Tucson, Hot S...","[Alabama, Alabama, Arizona, Arizona, Arkansas,...","[35294, 36652-2144, 85259-5499, 85724-5024, 71...","[United States, United States, United States, ...","[Drug, Drug]","[Standard Treatment, capecitabine]",,,Reported at the end of each cycle while patien...,[],[Patient/Physician choice of:\n\nCMF: cyclopho...,"[Standard Chemotherapy, Capecitabine]","[STARTED, COMPLETED, NOT COMPLETED]",[Overall Study],This was an intergroup study led by the CALGB....,[Patient/Physician choice of cyclophosphamide ...,"[Drug: Standard Treatment, Drug: capecitabine]","[Standard Chemotherapy, Capecitabine]","[Active Comparator, Experimental]","[Female, Male]","[65-69 years, 70-79 years, >=80 years, United ...","[BG000, BG001, BG002]",[Patient/Physician choice of:\n\nCMF: cyclopho...,"[Standard Chemotherapy, Capecitabine, Total]",[],"[Age, Customized, Sex: Female, Male, Region of...","[participants, Participants, participants, par...",Randomized,Parallel Assignment,Treatment,"[stage I breast cancer, stage II breast cancer...",134,"[230.015213, 496.689728, 354.67746, 290.867279...",1.0,0.0,0.0


NCTId                             object
OrgFullName                       object
OrgClass                          object
StartDate                         object
Condition                         object
ConditionAncestorTerm             object
ConditionBrowseBranchAbbrev       object
ConditionMeshId                   object
LeadSponsorName                   object
LeadSponsorClass                  object
CollaboratorName                  object
CollaboratorClass                 object
EligibilityCriteria               object
EnrollmentCount                    int64
EnrollmentType                    object
HealthyVolunteers                 object
Gender                            object
StdAge                            object
MinimumAge                        object
MaximumAge                        object
LocationFacility                  object
LocationCity                      object
LocationState                     object
LocationZip                       object
LocationCountry 

KeyError: ignored

## Transformers to count number of distinct values



In [None]:
class DistinctCounter(TransformerMixin, BaseEstimator):
    """ Adds one count column for every input column that holds lists.

    For a column ``<name>`` the new column ``#Diff<name>`` contains the length of each list
    (duplicates are counted as well). "EligibilityCriteria" is the exception: its new column
    ``#EligiCriteria`` contains the number of characters of the first list entry, or 0 for an empty list.

    Parameters
    --------

    Attributes
    --------

    Notes
    --------
    The input columns are kept; the count columns are appended to a copy of the input.

    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_new = X.copy()
        for feature in X.columns:
            column = X[feature]
            if feature == "EligibilityCriteria":
                # Text length of the first entry; an empty list counts as 0.
                X_new["#EligiCriteria"] = [
                    len(entry[0]) if len(entry) else 0 for entry in column
                ]
            else:
                X_new[f"#Diff{feature}"] = [len(entry) for entry in column]
        return X_new

In [None]:
TO_COUNT_FEATS = [
    'Condition',
    'ConditionAncestorTerm',
    'EligibilityCriteria',
    'LocationFacility',
    'LocationCity',
    'LocationCountry',
    'ArmGroupLabel',
]

# Raw list-valued columns, for comparison with the counts below.
display(df_raw[TO_COUNT_FEATS].head(10))

# 'extract' is given the same column list (FeatureSelector is defined elsewhere
# in the notebook); 'counter' appends the count columns.
pipeline = Pipeline(steps=[
    ('extract', FeatureSelector(TO_COUNT_FEATS)),
    ('counter', DistinctCounter()),
])

pipeline.fit_transform(df_raw[TO_COUNT_FEATS]).head(10)

Unnamed: 0,Condition,ConditionAncestorTerm,EligibilityCriteria,LocationFacility,LocationCity,LocationCountry,ArmGroupLabel
0,"[Cytomegalovirus Retinitis, HIV Infections]","[Virus Diseases, Retinal Diseases, Eye Disease...",[Inclusion criteria:\n\nAge 13 years or older\...,"[Department of Ophthalmology, University of Ca...","[Irvine, La Jolla, Los Angeles, Los Angeles, S...","[United States, United States, United States, ...","[Ganciclovir implant and oral ganciclovir, Cid..."
1,[Amblyopia],"[Brain Diseases, Central Nervous System Diseas...",[Inclusion Criteria:\n\nPatients must be 7 yea...,[Wilmer Eye Institute],[Baltimore],[United States],"[Patching, Atropine]"
2,[Alzheimer Disease],"[Dementia, Brain Diseases, Central Nervous Sys...",[Inclusion Criteria:\n\nWomen with a diagnosis...,"[University of Alabama, Birmingham, University...","[Birmingham, San Diego, Jacksonville, Tampa, A...","[United States, United States, United States, ...",[]
15,"[Cocaine-Related Disorders, Substance-Related ...","[Pathologic Processes, Chemically-Induced Diso...",[Inclusion:\n\nMeets DSM-IV criteria for curre...,[Research Foundation for Mental Hygiene],[New York],[United States],"[Desipramine, Placebo]"
16,"[Heroin Dependence, Opioid-Related Disorders, ...","[Pathologic Processes, Chemically-Induced Diso...",[Inclusion Criterion\n\nDSM IV criteria for op...,"[Columbia University, New York State Psychiatr...","[New York, New York]","[United States, United States]",[Opiates]
19,[Opioid-Related Disorders],"[Substance-Related Disorders, Chemically-Induc...","[Inclusion Criteria:\n\nMales/Females, ages 21...",[Friends Research Institute],[Los Angeles],[United States],"[buprenorphine, buprenorphine and ultra-low do..."
21,"[Cocaine-Related Disorders, Opioid-Related Dis...","[Pathologic Processes, Substance-Related Disor...",[Inclusion Criteria- Subject must:\n\nExhibit ...,[University of Texas Health Science Center],[Houston],[United States],[]
24,"[Cocaine-Related Disorders, Substance-Related ...","[Pathologic Processes, Chemically-Induced Diso...",[Inclusion:\n\ngood standing at methadone main...,[NYS Psychiatric Institute],[New York],[United States],"[PLacebo, Risperidone]"
26,[Opioid-Related Disorders],"[Substance-Related Disorders, Chemically-Induc...",[Inclusion Criteria:\n\nM/F ages 18-65. Meet D...,[Friends Research Institute],[Los Angeles],[United States],"[1; liquid formulation, 2; tablet formulation]"
27,[Cocaine-Related Disorders],"[Substance-Related Disorders, Chemically-Induc...",[Inclusion Criteria:\n\nCocaine dependent\n\nE...,[Johns Hopkins University School of Medicine],[Baltimore],[United States],[]


Unnamed: 0,Condition,ConditionAncestorTerm,EligibilityCriteria,LocationFacility,LocationCity,LocationCountry,ArmGroupLabel,#DiffCondition,#DiffConditionAncestorTerm,#EligiCriteria,#DiffLocationFacility,#DiffLocationCity,#DiffLocationCountry,#DiffArmGroupLabel
0,"[Cytomegalovirus Retinitis, HIV Infections]","[Virus Diseases, Retinal Diseases, Eye Disease...",[Inclusion criteria:\n\nAge 13 years or older\...,"[Department of Ophthalmology, University of Ca...","[Irvine, La Jolla, Los Angeles, Los Angeles, S...","[United States, United States, United States, ...","[Ganciclovir implant and oral ganciclovir, Cid...",2,8,1213,19,19,19,2
1,[Amblyopia],"[Brain Diseases, Central Nervous System Diseas...",[Inclusion Criteria:\n\nPatients must be 7 yea...,[Wilmer Eye Institute],[Baltimore],[United States],"[Patching, Atropine]",1,8,400,1,1,1,2
2,[Alzheimer Disease],"[Dementia, Brain Diseases, Central Nervous Sys...",[Inclusion Criteria:\n\nWomen with a diagnosis...,"[University of Alabama, Birmingham, University...","[Birmingham, San Diego, Jacksonville, Tampa, A...","[United States, United States, United States, ...",[],1,8,318,25,25,25,0
15,"[Cocaine-Related Disorders, Substance-Related ...","[Pathologic Processes, Chemically-Induced Diso...",[Inclusion:\n\nMeets DSM-IV criteria for curre...,[Research Foundation for Mental Hygiene],[New York],[United States],"[Desipramine, Placebo]",2,3,1477,1,1,1,2
16,"[Heroin Dependence, Opioid-Related Disorders, ...","[Pathologic Processes, Chemically-Induced Diso...",[Inclusion Criterion\n\nDSM IV criteria for op...,"[Columbia University, New York State Psychiatr...","[New York, New York]","[United States, United States]",[Opiates],3,3,903,2,2,2,1
19,[Opioid-Related Disorders],"[Substance-Related Disorders, Chemically-Induc...","[Inclusion Criteria:\n\nMales/Females, ages 21...",[Friends Research Institute],[Los Angeles],[United States],"[buprenorphine, buprenorphine and ultra-low do...",1,3,475,1,1,1,2
21,"[Cocaine-Related Disorders, Opioid-Related Dis...","[Pathologic Processes, Substance-Related Disor...",[Inclusion Criteria- Subject must:\n\nExhibit ...,[University of Texas Health Science Center],[Houston],[United States],[],2,4,956,1,1,1,0
24,"[Cocaine-Related Disorders, Substance-Related ...","[Pathologic Processes, Chemically-Induced Diso...",[Inclusion:\n\ngood standing at methadone main...,[NYS Psychiatric Institute],[New York],[United States],"[PLacebo, Risperidone]",2,3,1097,1,1,1,2
26,[Opioid-Related Disorders],"[Substance-Related Disorders, Chemically-Induc...",[Inclusion Criteria:\n\nM/F ages 18-65. Meet D...,[Friends Research Institute],[Los Angeles],[United States],"[1; liquid formulation, 2; tablet formulation]",1,3,422,1,1,1,2
27,[Cocaine-Related Disorders],"[Substance-Related Disorders, Chemically-Induc...",[Inclusion Criteria:\n\nCocaine dependent\n\nE...,[Johns Hopkins University School of Medicine],[Baltimore],[United States],[],1,3,205,1,1,1,0


## Transformers for patients distribution

In [None]:
class PatientsDistributionTransformer(TransformerMixin, BaseEstimator):
    """ Distributes the number of patients over number of facilities and number of countries.

    For 'LocationFacility' and 'LocationCountry' a new float column ``#Pts/#Diff<feature>`` is added that holds
    'EnrollmentCount' divided by the number of distinct entries in the feature's list.

    Parameters
    --------

    Attributes
    --------

    Notes
    --------
    The ratio is 0 whenever it cannot be computed: the list is empty or missing, or the enrollment count is
    missing / not numeric.

    """
    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; returns self."""
        return self

    @staticmethod
    def _distinct_count(value):
        """Number of distinct entries in one cell; 0 for a missing cell (None or a float such as NaN)."""
        if value is None or isinstance(value, float):
            return 0
        return len(set(value))

    def transform(self, X):
        """Return a copy of X with the two ratio columns appended.

        X must contain 'EnrollmentCount', 'LocationFacility' and 'LocationCountry'.
        """
        X_new = X.copy()
        enrollment = pd.to_numeric(X['EnrollmentCount'], errors='coerce')
        for feature in ('LocationFacility', 'LocationCountry'):
            counts = pd.Series(
                [self._distinct_count(value) for value in X[feature]],
                index=X.index,
                dtype=float,
            )
            # A count of 0 becomes NaN, so the division yields NaN (filled with
            # 0 below) instead of inf, which fillna would not catch.
            ratio = enrollment / counts.where(counts > 0).to_numpy()
            X_new[f"#Pts/#Diff{feature}"] = ratio.fillna(0)
        return X_new

In [None]:
# Preview of the patients-per-facility / patients-per-country columns on the raw data.
transformer = PatientsDistributionTransformer()
transformer.fit(df_raw).transform(df_raw).head(10)

## Transformer integrating country and regional data

A single transformer that combines all of the alternative location transformers.

In [None]:
class LocationDataTransformer(TransformerMixin, BaseEstimator):
    """ Enriches clinical-trial rows with country-level and city-level location data.

    This transformer bundles all alternative location transformers
    (CountryDataTransformer - CityPopulation - PerCountryTransformer - PerCityTransformer).

    Parameters
    --------
        country_data : dataframe
            Country attributes, one row per country, identified by the column 'countryName'.
            Missing values are replaced by 0. 'perCity' additionally needs the columns
            'population' and 'density', 'perCountry' needs the column 'worldshare'.
        transformer : string
            totalCombine: a combination of CountryDataTransformer and CityPopulationTransformer
                          (adds avg/min/max/main-country columns per country attribute plus
                          avg/min/max city population columns) => default
            perCity: splits a study into multiple rows, one per distinct city of the study
            perCountry: splits a study into multiple rows, one per distinct country of the study
        strategy : string (only used by totalCombine)
            simple = every distinct country / city of a study counts once => default
            weighted = every country / city counts as often as it occurs in the study
        mean : string (only used by totalCombine)
            trials = missing values are filled from the values computed for the other trials
            worldwide = missing values are filled from the values of the country data set
        debug : boolean
            Stored for compatibility; not used by the transformer itself.

    Notes
    --------
        - totalCombine requires the columns LocationCity, LocationCountry, LocationPopulationDensity
        - perCity additionally requires EnrollmentCount
        - perCountry requires LocationCountry, LocationCity, LocationFacility, LocationState,
          LocationZip, NCTId and EnrollmentCount
        - The country data passed to the constructor is never modified by transform(),
          so the same instance can be applied repeatedly.
    """

    # Country names used in the trial data that are spelled differently in the country data set.
    _SPECIAL_COUNTRIES = {
        'Former Yugoslavia': 'Serbia',
        'Former Serbia and Montenegro': 'Serbia',
        'Holy See (Vatican City State)': 'Italy',
        'Macedonia, The Former Yugoslav Republic of': 'North Macedonia',
        'Swaziland': 'Eswatini',
        'Netherlands Antilles': 'Curacao'
    }

    def __init__(self, country_data, transformer='totalCombine', strategy='simple', mean='trials', debug=False):
        self.transformer = transformer
        self.country_data = country_data.fillna(0)
        self.strategy = strategy
        self.mean = mean
        # Every constructor argument has to be stored, otherwise BaseEstimator.get_params() fails.
        self.debug = debug

    def fit(self, X, y=None):
        # Stateless transformer: nothing is learned from the data.
        return self

    def most_frequent(self, List):
        """ Returns all elements that occur most often in List (several elements on a tie, [] for an empty list). """
        counts = {}
        for item in List:
            counts[item] = counts.get(item, 0) + 1
        if not counts:
            return []
        highest = max(counts.values())
        return [item for item, count in counts.items() if count == highest]

    @staticmethod
    def _as_list(value):
        """ Normalises a cell value to a list: missing -> [], single string -> [string]. """
        if value is None or (isinstance(value, float) and value != value):
            return []
        if isinstance(value, str):
            return [value]
        return list(value)

    @staticmethod
    def _pick(values, positions):
        """ Returns the entries of values at the given positions, ignoring positions out of range. """
        return [values[i] for i in positions if i < len(values)]

    @staticmethod
    def _worldshare_factor(worldshare):
        """ Maps the world share of a country onto a weighting factor between 0.2 and 1.0. """
        if worldshare < 0.01:
            return 0.2
        if worldshare < 0.05:
            return 0.4
        if worldshare < 0.2:
            return 0.6
        if worldshare < 1:
            return 0.8
        if worldshare >= 1:
            return 1.0
        return 0.0  # only reached for NaN

    def _resolve_country(self, stats, country):
        """ Returns the attribute dict of a country (trying its alternative spelling as well) or None. """
        if country in stats:
            return stats[country]
        return stats.get(self._SPECIAL_COUNTRIES.get(country))

    def transform(self, X):
        # Work on a copy so that the branches below never alter self.country_data.
        country_data = self.country_data.copy()

        if self.transformer == 'totalCombine':
            return self._transform_total_combine(X.reset_index(drop=True), country_data)
        if self.transformer == 'perCity':
            return self._transform_per_city(X.reset_index(drop=True), country_data)
        if self.transformer == 'perCountry':
            return self._transform_per_country(X, country_data)
        # Unknown transformer name: return the data unchanged (with a fresh index).
        return X.reset_index(drop=True)

    def _transform_total_combine(self, X_new, country_data):
        """ Adds avg/min/max/main-country columns per country attribute and city population columns. """
        columns = country_data.drop('countryName', axis=1).columns
        # {country name: {attribute: value}} for constant-time lookups inside the row loop.
        stats = country_data.drop_duplicates(subset='countryName').set_index('countryName').to_dict('index')

        # creating (float, initially empty) columns for each country attribute
        for attr in columns:
            for prefix in ('avg', 'min', 'max', 'main_country'):
                X_new[f"{prefix}_{attr}"] = np.nan
        for prefix in ('avg', 'min', 'max'):
            X_new[f"{prefix}_city_population"] = 0.0

        countries_not_found = []  # collected over all rows
        for index, locationCity, locationCountry, locationPopulationDensity in zip(X_new.index, X_new['LocationCity'], X_new['LocationCountry'], X_new['LocationPopulationDensity']):

            # Main country data: the country with the most facilities. If several countries
            # share the highest number of facilities, their attributes are averaged.
            main_countries = [self._resolve_country(stats, country) for country in self.most_frequent(locationCountry)]
            main_countries = [entry for entry in main_countries if entry is not None]
            if main_countries:
                for attr in columns:
                    values = [float(entry[attr]) for entry in main_countries]
                    X_new.at[index, f"main_country_{attr}"] = sum(values) / len(values)

            # City population density: 'weighted' uses every listed location,
            # otherwise every distinct (city, country) pair counts once.
            seen_cities = set()
            densities = []
            for city, country, density in zip(locationCity, locationCountry, locationPopulationDensity):
                if self.strategy != 'weighted':
                    if (city, country) in seen_cities:
                        continue
                    seen_cities.add((city, country))
                densities.append(density)
            if densities:
                X_new.at[index, "avg_city_population"] = sum(densities) / len(densities)
                X_new.at[index, "min_city_population"] = min(densities)
                X_new.at[index, "max_city_population"] = max(densities)

            # Country attributes: 'weighted' keeps duplicates, otherwise distinct countries only.
            if self.strategy == 'weighted':
                tmp_countries = list(locationCountry)
            else:
                tmp_countries = list(dict.fromkeys(locationCountry))

            found = []
            for country in tmp_countries:
                entry = self._resolve_country(stats, country)
                if entry is None:
                    if country not in countries_not_found:
                        countries_not_found.append(country)
                else:
                    found.append(entry)

            if found:
                for attr in columns:
                    values = [float(entry[attr]) for entry in found]
                    X_new.at[index, f"avg_{attr}"] = sum(values) / len(values)
                    X_new.at[index, f"min_{attr}"] = min(values)
                    X_new.at[index, f"max_{attr}"] = max(values)

        # fill empty values (rows for which no country could be matched)
        for attr in columns:
            if self.mean == 'worldwide':
                fill_values = {
                    'avg': country_data[attr].mean(),
                    'min': country_data[attr].min(),
                    'max': country_data[attr].max(),
                    'main_country': country_data[attr].mean(),
                }
            else:
                fill_values = {
                    'avg': X_new[f"avg_{attr}"].mean(),
                    'min': X_new[f"min_{attr}"].min(),
                    'max': X_new[f"max_{attr}"].max(),
                    'main_country': X_new[f"main_country_{attr}"].mean(),
                }
            for prefix, fill_value in fill_values.items():
                column = f"{prefix}_{attr}"
                X_new[column] = X_new[column].fillna(fill_value).astype(np.float64)

        if len(countries_not_found): print("Countries not found: ", countries_not_found) # Print not found countries
        return X_new

    def _transform_per_city(self, X_new, country_data):
        """ Splits every study into one row per distinct (city, country) pair and joins the country data. """
        # Drop the country population and rename density to CountryDensity (on the local copy only).
        country_data = country_data.drop(columns=["population"]).rename(columns={"density": "CountryDensity"})

        result = []
        for index, row in X_new.iterrows():
            cities = row['LocationCity']
            countries = row['LocationCountry']
            densities = row['LocationPopulationDensity']

            cityKeys = [(cities[i], countries[i]) for i in range(len(cities))]
            densitySum = sum(densities[i] for i in range(len(cities)))

            # Position of the first occurrence of every distinct (city, country) pair, in input order.
            firstPosition = {}
            for position, key in enumerate(cityKeys):
                firstPosition.setdefault(key, position)

            for key, position in firstPosition.items():
                tempCity = row.copy()

                tempCity['city'] = cities[position]
                tempCity['country'] = countries[position]
                tempCity['CityPopulationDensity'] = densities[position]

                # Enrollment count per city: share of the density sum, weighted by the
                # number of locations the study has in this city.
                cityCount = cityKeys.count(key)
                tempCity['EnrollmentCountPerCity'] = round(cityCount * tempCity['CityPopulationDensity'] * tempCity['EnrollmentCount'] / densitySum)

                del tempCity['LocationCity']
                del tempCity['LocationCountry']
                result.append(tempCity)

        df_per_city = pd.DataFrame(result)
        X_new = pd.merge(left=df_per_city, right=country_data, how='left', left_on='country', right_on='countryName')
        return X_new.drop(columns=["countryName"])

    def _transform_per_country(self, X, country_data):
        """ Splits every study into one row per distinct country and distributes the enrollment count. """
        subset = ['LocationCountry', 'LocationCity', 'LocationFacility', 'LocationState', 'LocationZip', 'NCTId']

        # Prepare the country data for the join (local copy only) and
        # calculate a factor for the worldshare of a country.
        country_data = country_data.rename(columns={"countryName": "country"})
        country_data['worldshareFactor'] = country_data['worldshare'].apply(self._worldshare_factor)

        # One record per (study, distinct country) with the locations belonging to that country.
        dataSubset = X[subset]
        records = []
        for nctId, locationCountry, locationCity, locationFacility, locationZip in zip(dataSubset['NCTId'], dataSubset['LocationCountry'], dataSubset['LocationCity'], dataSubset['LocationFacility'], dataSubset['LocationZip']):
            countries = self._as_list(locationCountry)
            cities = self._as_list(locationCity)
            facilities = self._as_list(locationFacility)
            zips = self._as_list(locationZip)
            for country in dict.fromkeys(countries):
                # Select by position so that studies whose locations are not grouped
                # by country still get the right facilities / zips / cities.
                positions = [i for i, name in enumerate(countries) if name == country]
                records.append({
                    'facilities': self._pick(facilities, positions),
                    'LocationZips': self._pick(zips, positions),
                    'LocationCities': self._pick(cities, positions),
                    'NCTId': nctId,
                    'country': country,
                    'locations': len(positions),
                })

        X_new = pd.DataFrame(records, columns=['facilities', 'LocationZips', 'LocationCities', 'NCTId', 'country', 'locations'])
        X_new = pd.merge(left=X_new, right=X, how='outer', on='NCTId')

        # Clean country data
        X_new['country'] = X_new['country'].replace(self._SPECIAL_COUNTRIES)

        # Join with country data
        X_new = pd.merge(left=X_new, right=country_data, how='left', on='country')

        # Calculate per country Enrollment Count
        dataSubset = X_new[subset + ['worldshareFactor', 'locations', 'EnrollmentCount', 'country']]
        dataSubset = dataSubset[dataSubset['country'].notna()].copy()

        weight = dataSubset['locations'] * dataSubset['worldshareFactor']
        totalWeight = weight.groupby(dataSubset['NCTId'], sort=False).transform('sum')
        share = weight / totalWeight

        # Calculate enrollment distribution; every country gets at least one patient.
        enrollment = (share * dataSubset['EnrollmentCount']).round()
        dataSubset['enrollmentPerCountry'] = enrollment.where(enrollment != 0, 1)
        dataSubset['enrollmentPercentage'] = share * 100

        # If rounding distributed more patients than the study has, take the surplus
        # away from the country (or countries) with the highest enrollment.
        perStudy = dataSubset.groupby('NCTId', sort=False)["enrollmentPerCountry"]
        surplus = perStudy.transform('sum') - dataSubset['EnrollmentCount']
        adjust = (surplus > 0) & (perStudy.transform('max') == dataSubset['enrollmentPerCountry'])
        dataSubset.loc[adjust, 'enrollmentPerCountry'] = dataSubset.loc[adjust, 'enrollmentPerCountry'] - surplus[adjust]

        dataSubset = dataSubset.drop(columns=['worldshareFactor', 'locations'])

        # Join with rest of the data
        return pd.merge(left=X_new, right=dataSubset, how='left', on=['NCTId', 'country'])

In [None]:
# Test: run the 'totalCombine' variant with the weighted strategy and worldwide means,
# keep the last 10 rows and show only the main-country attribute columns.
transformer = LocationDataTransformer(df_dbcountry, transformer='totalCombine', strategy='weighted', mean='worldwide')
test = transformer.fit_transform(df_raw[['LocationCountry', 'LocationCity', 'LocationFacility', 'LocationState', 'LocationZip', 'NCTId', 'LocationPopulationDensity']]).tail(10)
test[['main_country_population', 'main_country_lifeExpectancy', 'main_country_GDP', 'main_country_density', 'main_country_fertilityRate', 'main_country_medianAge', 'main_country_migrantsNet', 'main_country_sizeInKm2', 'main_country_urbanPopulation', 'main_country_unemploymentRate', 'main_country_hospitalBed', 'main_country_healthExpenditure']]

## Transformer for free-text values

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
import nltk
nltk.download('stopwords')
import re, string, timeit

#Custom transformer that deals with textual features by extracting keywords
class TextualFeatureTransformer( TransformerMixin, BaseEstimator ):
    """
    Class for performing the feature transformation for textual features. 
    After performing several NLP preprocessing steps, the n most frequently mentioned keywords are identified 
    and a one-hot coding is performed for them. 

    Parameters
    --------

        remove : array of strings
            Array of strings defining the NLP preprocessing steps that should be carried out 
            before the keyword extraction can take place.
            You can choose from the following steps: 
                - 'upper'
                - 'numbers'
                - 'special'
                
            By default, all of them are applied
        n_keywords : integer
            Integer defining the number of keywords that should be extracted
        stop_words : list or set
            List of stopwords that are to be eliminated prior to the keyword extraction.
        

    Attributes
    --------
    
    features : dict
        Dictionary form of df_transformedFeatures ({column: {index: list of tokens}})
    top_keywords: dict
        Dictionary to store the top n identified keywords of each feature
    df_transformedFeatures : dataframe
        Dataframe holding the token lists of the data passed to fit()

    Notes
    --------
        - fit() learns the keywords, transform() tokenises the data it is given and
          one-hot encodes it against the learned keywords.
        - The returned dataframe contains the token-list columns followed by one
          column "<feature>_<keyword>" per keyword.
     """
    
    #Class constructor method that takes in a list of values as its argument
    def __init__(self, 
                 remove = ['upper', 'numbers', 'special'],
                 n_keywords = 20, 
                 stop_words = set(stopwords.words('english'))):
        # Stored under the parameter names so that get_params()/set_params()/clone() work.
        self.remove = remove
        self.n_keywords = n_keywords
        self.stop_words = stop_words
        # Former private names, kept for backward compatibility.
        self._remove = remove
        self._n_keywords = n_keywords
        self._stop_words = stop_words
        self.features = {}
        self.top_keywords = {}
        self.df_transformedFeatures = pd.DataFrame()

    #Helper function to remove numbers 
    def remove_numbers(self, list):
        """ Removes all numbers in the list """
        pattern = '[0-9]'
        list = [re.sub(pattern, '', i) for i in list] 
        return list

    #Helper function to remove special characters
    def remove_special(self, list): 
        """ Replaces every run of special characters in the list entries by a single space """
        pattern = '[^A-Za-z0-9]+'
        list = [re.sub(pattern, ' ', i) for i in list] 
        return list
    
    #Helper function to remove uppercase letter and replace with lower case
    def remove_upper(self, obj):
        """ Converts all letters to lower case in the list """
        obj = list(map(str.lower, obj))
        return obj
    
    #Helper function to create one hot encoding
    def one_hot (self, row, entry, attr):
        """ Returns 1 if the keyword entry is contained in row[attr], otherwise 0 """
        if str(entry) in row[attr]:
            return 1
        else:
            return 0

    #Helper function applying the configured preprocessing steps to one cell value
    def _clean_value(self, value):
        """ Applies the steps listed in `remove` to a list of strings or to a single string """
        if isinstance(value, list):
            for spec in self.remove:
                # Dispatch to remove_upper / remove_numbers / remove_special.
                value = getattr(self, 'remove_{}'.format(spec))(value)
        elif isinstance(value, str) and value:
            for spec in self.remove:
                if spec == 'upper':
                    value = value.lower()
                if spec == 'numbers':
                    value = ''.join([i for i in value if not i.isdigit()])
                if spec == 'special':
                    exclude = set(string.punctuation)
                    value = ''.join(ch for ch in value if ch not in exclude)
        return value

    #Helper function converting one cell value into a list of single-word tokens
    def _to_tokens(self, value):
        """ Preprocesses the value, splits it into words and removes the stop words """
        value = self._clean_value(value)
        if isinstance(value, list):
            tokens = []
            for entry in value:
                tokens = tokens + entry.split()
        else:
            tokens = str(value).split(' ')
        return [w for w in tokens if not w in self.stop_words]

    #Helper function converting a whole dataframe into token lists
    def _tokenize(self, X):
        """ Returns a dataframe with the same index and columns as X holding token lists """
        return pd.DataFrame({attr: [self._to_tokens(value) for value in X[attr]] for attr in X.columns},
                            index = X.index)

    #Helper function appending the one hot columns to a dataframe of token lists
    def _encode(self, X_tokens, columns):
        """ Appends one 0/1 column per learned keyword of every feature in columns """
        encoded = {}
        for attr in columns:
            for entry in self.top_keywords[attr].index:
                keyword = str(entry)
                encoded[attr + '_' + keyword] = [1 if keyword in tokens else 0 for tokens in X_tokens[attr]]
        if not encoded:
            return X_tokens
        # Appending all columns at once avoids a heavily fragmented dataframe.
        return pd.concat([X_tokens, pd.DataFrame(encoded, index = X_tokens.index)], axis = 1)
        
    def fit( self, X, y = None ):
        # Apply NLP preprocessing
        self.df_transformedFeatures = self._tokenize(X)
        self.features = self.df_transformedFeatures.to_dict()
        
        # extract keywords
        self.top_keywords = {}
        for attr in self.df_transformedFeatures.columns:
            counts = self.df_transformedFeatures[attr].explode().value_counts()
            # Units and placeholders that carry no information as keywords.
            counts = counts.drop(['mg', 'g', '&', 'kg', 'b', None, 'None', ' ', ''], 
                                 errors='ignore')
            self.top_keywords[attr] = counts.nlargest(self.n_keywords)
        #print(self.top_keywords)
        return self

    def fit_transform( self, X, y = None, **fit_params ):
        # Reuses the token lists computed in fit() instead of tokenising X a second time.
        self.fit(X, y)
        return self._encode(self.df_transformedFeatures.copy(), X.columns)

    def transform(self, X , y = None ):
        # Tokenise the data that was passed in (not the data seen during fit)
        # and apply one hot according to the learned keywords.
        X_new = self._tokenize(X)
        # NOTE(review): the token-list columns are kept in the result, exactly as before;
        # the former `X_new.drop(columns = attr)` discarded its result and had no effect.
        # Confirm with the downstream pipeline whether these columns should be dropped.
        return self._encode(X_new, X.columns)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Names of the free-text columns that are passed to the TextualFeatureTransformer.
textual_features = ['InterventionName', 'OrgFullName', 'LeadSponsorName', 'CollaboratorName', 'EventsTimeFrame', 'FlowDropWithdrawType', 
                    'FlowGroupDescription', 'FlowGroupTitle', 'FlowMilestoneType', 'FlowPeriodTitle', 'FlowRecruitmentDetails', 'ArmGroupDescription', 
                    'ArmGroupInterventionName', 'ArmGroupLabel', 'BaselineCategoryTitle', 'BaselineClassTitle', 'BaselineGroupDescription', 'BaselineGroupTitle', 
                    'BaselineMeasureTitle', 'BaselineMeasureUnitOfMeasure']

# df_test = df_raw[textual_features].copy()
# # display(df_test)
# # display(df_test.dtypes)

# instance = TextualFeatureTransformer()
# new = instance.transform(df_test)
# new.head(10)

In [None]:
# Pipeline with the textual transformer (default settings) as its only step
#textual_features = ['InterventionName', 'OrgFullName']

textual_pipeline = Pipeline( steps = [( 'text_transformer', TextualFeatureTransformer())])

# Fit and transform the free-text columns of the raw data and preview the first 10 rows.
textual_pipeline.fit_transform(df_raw[textual_features]).head(10)

## Regional Age Structure

In [None]:
class RegionalAgeTransformer( BaseEstimator, TransformerMixin ):
    '''
    Transformer that matches the Location and required Age with a data set that contains absolute population numbers for the age groups "Youth", "Working" and "Elderly" in different regions.
    Required Columns: MinimumAge, MaximumAge, LocationCity, LocationState
    Output Column: AvgLocalAge (sum of the matching population numbers over all locations of a trial, 0 if nothing matches)

    Parameters
    --------

        countrylist : series or list
            Names of the areas for which regional data is available.
        df_regional : dataframe
            Regional data with the columns "Area", "Youth", "Working" and "Elderly".
        debug : boolean
            Stored for compatibility; not used by the transformer itself.

    Notes
    --------
        - Missing minimum ages count as 0 years, missing maximum ages as 100 years.
        - The input data is not modified; the four required columns are dropped from the result.
    '''

    #Conversion factors from the unit of an age string to years (plural forms first, they contain the singular forms)
    _CONVERSION_FACTORS = {
                            'Years': 1,
                            'Year': 1,
                            'Months': 1/12,
                            'Month': 1/12,
                            'Weeks': 1/52,
                            'Week': 1/52,
                            'Days': 1/365 ,
                            'Day': 1/365,
                            'Hours': 1/8760,
                            'Hour': 1/8760
                            }

    def __init__( self, countrylist, df_regional, debug=False):
        # Stored under the parameter names so that get_params()/set_params()/clone() work.
        self.countrylist = countrylist
        self.df_regional = df_regional
        self.debug = debug
        # Former private names, kept for backward compatibility.
        self._debug = debug
        self._countrylist = countrylist
        self._df_regional = df_regional

    def fit( self, X, y = None ):
        return self 

    @classmethod
    def _age_in_years(cls, value, default):
        """ Converts an age string such as "18 Years" or "6 Months" to years; missing values yield default """
        if value is None or (isinstance(value, float) and value != value):
            return float(default)
        text = str(value)
        for unit, factor in cls._CONVERSION_FACTORS.items():
            if unit in text:
                return float(text.replace(unit, '').strip()) * factor
        # No known unit: the value has to be a plain number (raises ValueError otherwise).
        return float(text)

    @staticmethod
    def _age_groups(minimum_age, maximum_age):
        """ Returns the age groups ("Youth", "Working", "Elderly") covered by the age range """
        groups = []
        if minimum_age <= 16:
            groups.append("Youth")
        if minimum_age < 65 and maximum_age > 16:
            groups.append("Working")
        if maximum_age >= 65:
            groups.append("Elderly")
        return groups
    
    def transform(self, X , y = None ):

        X_in = pd.DataFrame(X)

        #Areas with regional data and their population numbers, prepared once for fast lookups
        known_areas = set(self.countrylist)
        population = (self.df_regional.drop_duplicates(subset="Area")
                                      .set_index("Area")[["Youth", "Working", "Elderly"]]
                                      .to_dict('index'))

        local_population = []
        for cities, states, minimum_age, maximum_age in zip(X_in["LocationCity"], X_in["LocationState"], X_in["MinimumAge"], X_in["MaximumAge"]):
            #Fill empty values with minimal and maximal age and convert time string to float
            groups = self._age_groups(self._age_in_years(minimum_age, 0), self._age_in_years(maximum_age, 100))

            cities = cities if isinstance(cities, (list, tuple)) else []
            states = states if isinstance(states, (list, tuple)) else []

            #Check whether city is in regional data and match with Age, if not match with State
            matched = []
            for j, city in enumerate(cities):
                if city in known_areas:
                    area = city
                elif len(cities) == len(states) and states[j] in known_areas:
                    area = states[j]
                else:
                    continue
                numbers = population.get(area)
                if numbers is None:
                    # Listed as known area but without a row in the regional data.
                    continue
                matched.extend(int(numbers[group]) for group in groups)

            #Take the sum of all matching population
            local_population.append(sum(matched))

        #Drop all unnecessary features (drop() returns a new dataframe, the input stays untouched)
        todropfeatures = ["LocationCity", "LocationState", "MinimumAge", "MaximumAge", "Age", "LocalAge"]
        X_new = X_in.drop(columns=todropfeatures, errors='ignore')
        X_new["AvgLocalAge"] = local_population

        return X_new
        

In [None]:
#Test: take the columns required by the RegionalAgeTransformer for the first 100 trials
regional_age_Features = ['LocationCity', 'LocationState', "MinimumAge", "MaximumAge"]

df_test = df_raw[regional_age_Features].copy()[:100]
# Show the test data and the column types
display(df_test)
display(df_test.dtypes)

Unnamed: 0,LocationCity,LocationState,MinimumAge,MaximumAge
0,"[Irvine, La Jolla, Los Angeles, Los Angeles, S...","[California, California, California, Californi...",13 Years,
1,[Baltimore],[Maryland],,6 Years
2,"[Birmingham, San Diego, Jacksonville, Tampa, A...","[Alabama, California, Florida, Florida, Georgi...",60 Years,
15,[New York],[New York],18 Years,60 Years
16,"[New York, New York]","[New York, New York]",18 Years,45 Years
...,...,...,...,...
113,[Bethesda],[Maryland],,
114,[Bethesda],[Maryland],,
115,[Bethesda],[Maryland],18 Years,80 Years
116,[Bethesda],[Maryland],,


LocationCity     object
LocationState    object
MinimumAge       object
MaximumAge       object
dtype: object

In [None]:
# NOTE(review): RegionalAgeTransformer.__init__ requires the arguments `countrylist` and
# `df_regional`; calling it without arguments raises a TypeError. Pass the regional data here.
instance = RegionalAgeTransformer()
df_new = instance.transform(df_test)
df_new.head(10)


## Transformer to Assign a World Rank to Each Facility

In [None]:
# Make sure the NLTK stopword corpus is available and load the English stopwords as a list.
nltk.download('stopwords')
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
class WorldRankTransformer( BaseEstimator, TransformerMixin ):
    """
    Class that matches the LocationFacility with a data set that ranks Universities and Hospitals
    by their research activities.

    Every facility name is normalised with `clean_string` and compared for exact
    equality with the index labels of the world-rank data; the ranks of all matched
    facilities of a trial are averaged.

    Parameters
    --------

        worldrankData : dataframe
            A dataframe containing information about the worldrank of the facilities.
            Its index labels are compared with the cleaned facility names and its
            `WorldRank` column supplies the rank.
            (Presumably the index was cleaned the same way as `clean_string` does; verify against the code that builds it.)

        stopwords: set
            Set of stopwords that should be excluded when preprocessing the facility names.

        debug: boolean
            A boolean value defining whether or not the transformer execution should be debugged.
            It is stored on the instance but not read by any method of this class.

    Notes
    --------
        - Required columns = LocationFacility
        - Required information = Worldrank data set
        - Output columns = AvgFacilityRank (float mean rank of the matched facilities, 0.0 if none matched)
        - The input column LocationFacility is removed from the output
        - The input frame itself is not modified

        """


    def __init__( self, 
                 worldrankData = pd.DataFrame(),
                 stopwords = set(stopwords.words('english')),
                 debug=False):
        # scikit-learn's get_params()/clone() read every constructor argument back
        # from an attribute of the same name, so keep the unmodified arguments here.
        self.worldrankData = worldrankData
        self.stopwords = stopwords
        self.debug = debug

        # Working state used by the methods below.
        self._worldrankData = worldrankData.copy()
        self._debug = debug
        # A set gives constant-time membership tests even if a list was passed in.
        self._stopwords = set(stopwords)

    def fit( self, X, y = None ):
        """Nothing is learned from the data; returns the transformer unchanged."""
        return self 
    
    #Define Preprocessing Function
    def clean_string(self, text):
        """Strip punctuation characters, lower-case the text and drop stopwords."""
        text = ''.join([char for char in text if char not in string.punctuation])
        text = text.lower()
        text = ' '.join([word for word in text.split() if word not in self._stopwords])
        return text
    
    def transform(self, X , y = None ):
        """Return a copy of X with LocationFacility replaced by AvgFacilityRank."""
        # pd.DataFrame(X) alone does not copy an existing frame, so copy explicitly
        # to keep the caller's data untouched.
        X_new = pd.DataFrame(X).copy()

        # Map every facility name of the rank table to its rank(s) once, instead of
        # scanning the whole index for each facility of each trial.
        rank_lookup = {}
        if "WorldRank" in self._worldrankData.columns:
            for facility_name, rank in self._worldrankData["WorldRank"].items():
                rank_lookup.setdefault(facility_name, []).append(rank)

        avg_ranks = []
        for facilities in X_new["LocationFacility"]:
            matched_ranks = []
            for facility in facilities:
                matched_ranks.extend(rank_lookup.get(self.clean_string(facility), []))

            # Mean world rank of all facilities a rank could be identified for.
            # Kept as float so that a fractional mean is not truncated.
            if matched_ranks:
                avg_ranks.append(sum(matched_ranks) / len(matched_ranks))
            else:
                avg_ranks.append(0.0)

        X_new["AvgFacilityRank"] = avg_ranks
        X_new = X_new.drop(columns=["LocationFacility"])

        return X_new

In [None]:
# Test: run WorldRankTransformer on the facility column of the raw data.
hospital_features = ["LocationFacility"]

# Work on a copy of the selected column.
df_test = df_raw[hospital_features].copy()
display(df_test)
display(df_test.dtypes)

# Pass the English stop-word list and the world-rank table explicitly.
nltk.download('stopwords')
stop = stopwords.words('english')
instance = WorldRankTransformer(worldrankData = df_worldrank, stopwords = stop)
df_new = instance.transform(df_test)
df_new.head(50)

Unnamed: 0,LocationFacility
0,"[Department of Ophthalmology, University of Ca..."
1,[Wilmer Eye Institute]
2,"[University of Alabama, Birmingham, University..."
15,[Research Foundation for Mental Hygiene]
16,"[Columbia University, New York State Psychiatr..."
...,...
995,"[MBCCOP - Gulf Coast, CCOP - Greater Phoenix, ..."
996,[Toronto Sunnybrook Regional Cancer Centre]
997,[Children's Oncology Group]
998,[University of Alabama at Birmingham Comprehen...


LocationFacility    object
dtype: object

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,AvgFacilityRank
0,26
1,0
2,945
15,0
16,0
19,0
21,0
24,0
26,0
27,0


## Transformer for MeshID

In [None]:
class MeshIDTransformer( TransformerMixin, BaseEstimator ):
    """
    Turn the list column ConditionMeshId into three integer columns.

    The first three ids of every row are written to MeshID1, MeshID2 and MeshID3
    after removing the leading/trailing "D" characters and converting to int.
    Missing positions are filled with 0.

    Parameters
    --------
        debug: boolean
            Stored on the instance; not read by any method of this class.

    Notes
    --------
        - Required columns = ConditionMeshId
        - Output columns = MeshID1, MeshID2, MeshID3 (ConditionMeshId is removed)
        - The input frame itself is not modified
    """
    def __init__( self, debug=False):
        # Public attribute with the parameter's name so that scikit-learn's
        # get_params()/clone() can read the constructor argument back.
        self.debug = debug
        self._debug = debug

    def fit( self, X, y = None ):
        """Nothing is learned from the data; returns the transformer unchanged."""
        return self 
    
    def transform(self, X , y = None ):
        """Return a copy of X with ConditionMeshId replaced by MeshID1..MeshID3."""
        # pd.DataFrame(X) alone does not copy an existing frame, so copy explicitly.
        X_new = pd.DataFrame(X).copy()

        mesh_columns = ["MeshID1", "MeshID2", "MeshID3"]

        encoded_rows = []
        for mesh_ids in X_new["ConditionMeshId"]:
            # Anything that is not a list (e.g. a missing value) counts as "no ids".
            if not isinstance(mesh_ids, (list, tuple)):
                mesh_ids = []
            # Rows with more than three ids keep their first three instead of
            # ending up with all-zero features.
            numeric_ids = [int(mesh_id.strip("D")) for mesh_id in mesh_ids[:len(mesh_columns)]]
            padding = [0] * (len(mesh_columns) - len(numeric_ids))
            encoded_rows.append(numeric_ids + padding)

        for position, column in enumerate(mesh_columns):
            X_new[column] = pd.Series([encoded[position] for encoded in encoded_rows],
                                      index=X_new.index, dtype="int64")

        X_new = X_new.drop("ConditionMeshId", axis=1)
        return X_new

In [None]:
# Test: build a five-row input frame for MeshIDTransformer (applied in the next cell).
special_field_features_meshID = ['ConditionMeshId']

# Work on a copy of the selected column and keep only the first 5 rows.
df_test = df_raw[special_field_features_meshID].copy()[:5]
display(df_test)
display(df_test.dtypes)

In [None]:
# Run the transformer on the test frame and preview the result.
instance = MeshIDTransformer()
df_new = instance.transform(df_test)
df_new.head(10)

## Transformers for feature selection

### Feature selection by forward and backward selection

In [None]:
#Custom transformer that selects features that boost performance most  
class FeatureSelectorTransformerModels( BaseEstimator, TransformerMixin ):
    """
    Transformer for performing feature selection for machine learning or data preprocessing based on model performance

    Features are selected stepwise using the p-values of a statsmodels regression
    and, optionally, an information criterion that decides when to stop.

    Parameters
    --------
        strategy : String, default = backward
            A string value defining the strategy of the feature selection = forward or backward
        model_type : String, default = linear
            A string value defining the model that should be used: "linear" (sm.OLS)
            or "logistic" (sm.Logit). Any other value falls back to "linear".
        elimination_criteria : String, default = aic
            A string value defining the elimination criteria based on which the feature selection is done.
            Examples:
            - aic = Akaike information criterion
            - bic = Bayesian information criterion
            - adjr2 = Adjusted R squared (only applied for the linear model type)
            - r2 = R squared (only applied for the linear model type)
            Any other value disables the criterion check; only p-values are used then.
        varchar_process : String, default = dummy_dropfirst
            Possible values:
            - drop
            - dummy
            - dummy_dropfirst
        sl : float, default = 0.05
            Significance level the p-values are compared with.
        targetColumn : Series
            A Series defining the target column

    Notes
    --------
        - `transform` raises ValueError for a strategy other than "forward"/"backward".
        - A textual log of all fitted models is accumulated in `_iterations_log`.
    """
    #Class constructor method that takes in a list of values as its argument
    def __init__(self, 
                 strategy = "backward", 
                 model_type = "linear",
                 elimination_criteria = "aic",
                 varchar_process = "dummy_dropfirst",
                 sl = 0.05,
                 targetColumn = pd.Series()):
        # scikit-learn's get_params()/clone() read every constructor argument back
        # from an attribute of the same name, so keep the unmodified arguments here.
        self.strategy = strategy
        self.model_type = model_type
        self.elimination_criteria = elimination_criteria
        self.varchar_process = varchar_process
        self.sl = sl
        self.targetColumn = targetColumn

        # Working state used by the methods below.
        self._strategy = strategy
        self._model_type = model_type
        self._elimination_criteria = elimination_criteria
        self._sl = sl
        self._iterations_log = ""
        self._last_eliminated = ""
        self._varchar_process = varchar_process
        self._targetColumn = targetColumn
 
        
    def fit(self, X, y = None): 
        """Nothing is learned here; the selection itself happens in `transform`."""
        return self
    
    def regressor(self, y, X):
        """Fit and return the statsmodels model selected by the model type.

        An unknown model type is replaced by "linear" (also on the instance).
        """
        if self._model_type =="linear":
            regressor = sm.OLS(y, X).fit()
        elif self._model_type == "logistic":
            regressor = sm.Logit(y, X).fit()
        else:
            self._model_type = "linear"
            regressor = sm.OLS(y, X).fit()
        return regressor 
    
    def varcharProcessing(self, X, varchar_process = "dummy_dropfirst"):   
        """Handle object-typed columns and prepend a constant "intercept" column.

        "drop" removes object columns, "dummy" one-hot encodes them, every other
        value one-hot encodes them and drops the first dummy of each feature.
        """
        if varchar_process == "drop":   
            dtypes = X.dtypes
            # `object` replaces the alias np.object, which no longer exists in current NumPy.
            X = X.drop(columns = dtypes[dtypes == object].index.tolist())
        elif varchar_process == "dummy":
            X = pd.get_dummies(X,drop_first=False)
        else: 
            # "dummy_dropfirst" and any unknown value
            X = pd.get_dummies(X,drop_first=True)

        # Add the intercept as last column, then rotate it to the front.
        X["intercept"] = 1
        cols = X.columns.tolist()
        cols = cols[-1:] + cols[:-1]
        X = X[cols]

        return X

    def _summary_text(self, model):
        """Text block (summary, AIC, BIC) written to the iteration log for a model."""
        return str(model.summary())+"\nAIC: "+ str(model.aic) + "\nBIC: "+ str(model.bic)

    def _score(self, model):
        """Configured criterion of `model` as a lower-is-better number, or None if no criterion applies."""
        if self._elimination_criteria == "aic":
            return model.aic
        if self._elimination_criteria == "bic":
            return model.bic
        # R squared based criteria are only used for linear models; they are
        # negated so that "smaller is better" holds for every criterion.
        if self._model_type == "linear":
            if self._elimination_criteria == "adjr2":
                return -model.rsquared_adj
            if self._elimination_criteria == "r2":
                return -model.rsquared
        return None

    def _backward_select(self, X_new, y):
        """Backward elimination on X_new (modified in place); returns the kept column names."""
        selected_cols = X_new.columns.tolist()
        model = None

        for _ in range(X_new.shape[1]):
            candidate = self.regressor(y = y, X = X_new)

            if model is not None:
                previous_score = self._score(model)
                candidate_score = self._score(candidate)
                # The last elimination made the criterion worse: stop and keep the
                # columns from before that elimination (still held in selected_cols).
                if previous_score is not None and previous_score < candidate_score:
                    self._iterations_log += "\n"+self._summary_text(candidate)+"\n"
                    self._iterations_log += str("\n\nRegained : "+ self._last_eliminated + "\n\n")
                    break

            model = candidate
            self._iterations_log += "\n"+self._summary_text(model)+"\n"

            maxPval = max(model.pvalues)
            selected_cols = X_new.columns.tolist()
            if not (maxPval > self._sl):
                # Every remaining column is significant.
                break

            # Remove the least significant column(s).
            for j in selected_cols:
                if (model.pvalues[j] == maxPval):
                    self._iterations_log += str("\n\nEliminated : "+ j + "\n\n")
                    del X_new[j]
                    self._last_eliminated = j

        self._iterations_log += "\n"+self._summary_text(model)+"\n"
        return selected_cols

    def _forward_select(self, X_new, y):
        """Forward selection starting from the intercept; returns the chosen column names."""
        selected_cols = ["intercept"]
        other_cols = self._cols.copy()
        other_cols.remove("intercept")

        # Baseline: the intercept-only model. Its criterion is what the first
        # candidate has to beat, so it must exist before the loop starts.
        model = self.regressor(y, X_new[selected_cols])
        criteria = self._score(model)

        for _ in range(X_new.shape[1]):
            # Find the not yet selected column with the smallest significant p-value.
            best_col = None
            best_pval = None
            for j in other_cols:
                model = self.regressor(y, X_new[selected_cols+[j]])
                pval = model.pvalues[j]
                if pval <= self._sl and (best_pval is None or pval < best_pval):
                    best_col = j
                    best_pval = pval

            if best_col is None:
                # No remaining column reaches the significance level.
                break

            model = self.regressor(y, X_new[selected_cols+[best_col]])
            self._iterations_log += str("\nEntered : "+best_col + "\n")    
            self._iterations_log += "\n\n"+self._summary_text(model)+"\n\n"

            new_criteria = self._score(model)
            # With a criterion configured, the column only enters if it strictly improves it.
            if new_criteria is not None and not (new_criteria < criteria):
                break

            selected_cols.append(best_col)
            other_cols.remove(best_col)
            if new_criteria is not None:
                criteria = new_criteria

        return selected_cols

    
    def transform(self, X):
        """Return the (encoded) columns of X chosen by the configured stepwise strategy.

        The "intercept" helper column is never part of the result.

        Raises
        --------
        ValueError
            If the strategy is neither "forward" nor "backward".
        """
        if self._strategy not in ("backward", "forward"):
            raise ValueError("Wrong Strategy type: " + str(self._strategy) +
                             ". Choose 'forward' or 'backward'.")

        X = self.varcharProcessing(X, varchar_process = self._varchar_process)
        y = self._targetColumn
        self._cols = X.columns.tolist()
        X_new = X.copy()
        
        if self._strategy == "backward":   
            selected_cols = self._backward_select(X_new, y)
        else:
            selected_cols = self._forward_select(X_new, y)

        # Backward elimination may already have removed the intercept, so filter
        # instead of calling list.remove().
        selected_cols = [col for col in selected_cols if col != 'intercept']

        return X[selected_cols]



In [None]:
# df_test = new_data.copy()
# print(len(df_test.columns))
# target = df_raw["EnrollmentDuration"]

# instance = FeatureSelectorTransformerModels(targetColumn = y)
# new = instance.fit_transform(X = df_test)
# print(len(new.columns))
# new.head(10)


### Feature selection by feature characteristics

In [None]:
class FeatureSelectorTransformerAttributes( BaseEstimator, TransformerMixin ):
    """
    Class for performing feature selection for machine learning or data preprocessing.
    
    Implements five different methods to identify features for removal 
    
        1. Find columns with a missing percentage greater than a specified threshold
        2. Find columns with a single unique value
        3. Find collinear variables with a correlation greater than a specified correlation coefficient
        4. Find features with 0.0 feature importance from a gradient boosting machine (gbm)
        5. Find low importance features that do not contribute to a specified cumulative feature importance from the gbm
        
    Parameters
    --------
    
        target : array or series, default = None
            Array of target labels for training the machine learning model to find feature importances. 
            These can be either binary labels (if ml_task is 'classification') or 
            continuous targets (if ml_task is 'regression').
            If no labels are provided, then the feature importance based methods are not available.
        missing_threshold : float between 0 and 1
            Percentage of missing values of a feature above which the feature is nominated for exclusion.
            Default value = 0.6
        correlation_threshold : float between 0 and 1
            Value of the Pearson correlation cofficient for identifying correlation features
            Percentage of feature correlation above which a feature is eligible for exclusion.
            Default value = 0.98
        ml_task : string
            The machine learning task, either 'classification' or 'regression'
            Default value = "regression"
        cumulative_importance : float between 0 and 1
            The fraction of cumulative importance to account for
            

        
    Attributes
    --------
    
    ops : dict
        Dictionary of operations run and features identified for removal
        
    missing_stats : dataframe
        The fraction of missing values for all features
    
    record_missing : dataframe
        The fraction of missing values for features with missing fraction above threshold
        
    unique_stats : dataframe
        Number of unique values for all features
    
    record_single_unique : dataframe
        Records the features that have a single unique value
        
    corr_matrix : dataframe
        All correlations between all features in the data
    
    record_collinear : dataframe
        Records the pairs of collinear variables with a correlation coefficient above the threshold
        
    feature_importances : dataframe
        All feature importances from the gradient boosting machine
    
    record_zero_importance : dataframe
        Records the zero importance features in the data according to the gbm
    
    record_low_importance : dataframe
        Records the lowest importance features not needed to reach the threshold of cumulative importance according to the gbm
    
    
    Notes
    --------
    
        - All 5 operations can be run with the `identify_all` method.
        - If using feature importances, one-hot encoding is used for categorical variables which creates new columns

      https://towardsdatascience.com/a-feature-selection-tool-for-machine-learning-in-python-b64dd23710f0
    
    """
    
    #Class constructor method that takes in a list of values as its argument
    def __init__(self, 
                 target, 
                 missing_threshold = 0.6,
                 correlation_threshold = 0.98,
                 ml_task = "regression",
                 cumulative_importance = 0.99
                ):
        """Store the configuration and reset all result attributes to their empty state."""

        # Configuration handed in by the caller
        self.target = target
        self.ml_task = ml_task
        self.missing_threshold = missing_threshold
        self.correlation_threshold = correlation_threshold
        self.cumulative_importance = cumulative_importance

        # One-hot encoding bookkeeping
        self.one_hot_features = None
        self.one_hot_correlated = False

        # Records of the features nominated for removal and the statistics
        # they are based on; filled by the identify_* methods
        for result_attribute in ("record_missing",
                                 "record_single_unique",
                                 "record_collinear",
                                 "record_zero_importance",
                                 "record_low_importance",
                                 "missing_stats",
                                 "unique_stats",
                                 "corr_matrix",
                                 "feature_importances"):
            setattr(self, result_attribute, None)

        # Maps the name of each executed check to the features it wants to drop
        self.ops = {}

        
    def fit( self, X, y = None ):
        """Remember the column names of X as `base_features` and return the transformer itself."""
        self.base_features = [*X.columns]
        return self

    def identify_missing(self, X):
        """Record the features whose fraction of missing values exceeds `missing_threshold`.

        Fills `missing_stats` (all features, highest fraction first),
        `record_missing` (features above the threshold) and `ops['missing']`.
        """
        column_labels = {'index': 'feature', 0: 'missing_fraction'}

        # Fraction of null entries per column
        null_fraction = X.isnull().sum() / X.shape[0]

        all_stats = pd.DataFrame(null_fraction).rename(columns = column_labels)
        self.missing_stats = all_stats.sort_values('missing_fraction', ascending = False)

        # Keep only the columns above the threshold, with the feature name as a regular column
        above_threshold = null_fraction[null_fraction > self.missing_threshold]
        flagged = pd.DataFrame(above_threshold).reset_index().rename(columns = column_labels)

        self.record_missing = flagged
        self.ops['missing'] = list(flagged['feature'])
        
        # print('%d features with greater than %0.2f missing values.\n' % (len(self.ops['missing']), self.missing_threshold))
   

    def identify_single_unique(self, X):
        """Record the features that hold exactly one distinct value (NaNs are not counted as a value).

        Fills `unique_stats` (all features, fewest distinct values first),
        `record_single_unique` and `ops['single_unique']`.
        """
        column_labels = {'index': 'feature', 0: 'nunique'}

        # Number of distinct values per column
        distinct_counts = X.nunique()

        all_stats = pd.DataFrame(distinct_counts).rename(columns = column_labels)
        self.unique_stats = all_stats.sort_values('nunique', ascending = True)

        # Keep only the constant columns, with the feature name as a regular column
        constant_columns = distinct_counts[distinct_counts == 1]
        flagged = pd.DataFrame(constant_columns).reset_index().rename(columns = column_labels)

        self.record_single_unique = flagged
        self.ops['single_unique'] = list(flagged['feature'])
        
        # print('%d features with a single unique value.\n' % len(self.ops['single_unique']))
        
    
    def identify_collinear(self, X, one_hot=False):
        """
        Finds collinear features based on the correlation coefficient between features. 
        For each pair of features with a correlation coefficient greater than `correlation_threshold`
        (in absolute value), only one of the pair is identified for removal. 
        Using code adapted from: https://chrisalbon.com/machine_learning/feature_selection/drop_highly_correlated_features/

        Only numeric and boolean columns enter the correlation matrix.
        Fills `corr_matrix`, `record_collinear` and `ops['collinear']`.
        With `one_hot=True`, `fit` must have been called before because
        `base_features` is read.
        
        Parameters
        --------
        one_hot : boolean, default = False
            Whether to one-hot encode the features before calculating the correlation coefficients
        """
        self.one_hot_correlated = one_hot
        
        # Calculate the correlations between every column
        if one_hot:
            
            # One hot encoding
            features = pd.get_dummies(X)
            self.one_hot_features = [column for column in features.columns if column not in self.base_features]

            # Add one hot encoded data to original data
            self.data_all = pd.concat([features[self.one_hot_features], X], axis = 1)
            
            correlation_input = features

        else:
            correlation_input = X

        # Restrict to numeric/boolean columns explicitly: newer pandas versions no
        # longer drop other columns silently inside corr().
        corr_matrix = correlation_input.select_dtypes(include = ["number", "bool"]).corr()
        
        self.corr_matrix = corr_matrix
    
        # Extract the upper triangle of the correlation matrix.
        # The builtin bool replaces np.bool, which no longer exists in current NumPy.
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k = 1).astype(bool))
        
        # Select the features with correlations above the threshold
        # Need to use the absolute value
        to_drop = [column for column in upper.columns if any(upper[column].abs() > self.correlation_threshold)]

        # Collect one record per correlated pair and build the dataframe once at
        # the end (DataFrame.append is gone from current pandas).
        pair_records = []
        for column in to_drop:
            above_threshold = upper[column].abs() > self.correlation_threshold
            for corr_feature, corr_value in upper.loc[above_threshold, column].items():
                pair_records.append({'drop_feature': column,
                                     'corr_feature': corr_feature,
                                     'corr_value': corr_value})

        record_collinear = pd.DataFrame(pair_records,
                                        columns = ['drop_feature', 'corr_feature', 'corr_value'])

        self.record_collinear = record_collinear
        self.ops['collinear'] = to_drop
        
        # print('%d features with a correlation magnitude greater than %0.2f.\n' % (len(self.ops['collinear']), self.correlation_threshold))

        
    def identify_zero_importance(self, X, eval_metric="l2", 
                                 n_iterations=10, early_stopping = True):
        """
        
        Identify the features with zero importance according to a gradient boosting machine.
        The gbm can be trained with early stopping using a validation set to prevent overfitting. 
        The feature importances are averaged over `n_iterations` to reduce variance. 
        
        Uses the LightGBM implementation (http://lightgbm.readthedocs.io/en/latest/index.html)

        Fills `feature_importances`, `record_zero_importance` and
        `ops['zero_importance']`. `fit` must have been called before because
        `base_features` is read.

        Parameters 
        --------
        eval_metric : string
            Evaluation metric to use for the gradient boosting machine for early stopping. Must be
            provided if `early_stopping` is True
        n_iterations : int, default = 10
            Number of iterations to train the gradient boosting machine
            
        early_stopping : boolean, default = True
            Whether or not to use early stopping with a validation set when training

        Raises
        --------
        ValueError
            If early stopping is requested without `eval_metric`, if no target was
            provided, or if `ml_task` is neither "classification" nor "regression".
        
        
        Notes
        --------
        
        - Features are one-hot encoded to handle the categorical variables before training.
        - The gbm is not optimized for any particular task and might need some hyperparameter tuning
        - With early stopping, iteration k uses the train/validation split seeded with 42 + k,
          so results are reproducible while the iterations still differ from each other
        """

        if early_stopping and eval_metric is None:
            raise ValueError("""eval metric must be provided with early stopping. Examples include "auc" for classification or
                             "l2" for regression.""")
            
        if self.target is None:
            raise ValueError("No training labels provided.")

        # Reject an unknown task before any state is changed or any model is trained
        if self.ml_task == 'classification':
            estimator_class = lgb.LGBMClassifier
        elif self.ml_task == 'regression':
            estimator_class = lgb.LGBMRegressor
        else:
            raise ValueError('Task must be either "classification" or "regression"')
        
        # One hot encoding
        features = pd.get_dummies(X)
        self.one_hot_features = [column for column in features.columns if column not in self.base_features]

        # Add one hot encoded data to original data
        self.data_all = pd.concat([features[self.one_hot_features], X], axis = 1)

        # Extract feature names
        feature_names = list(features.columns)

        # Convert to np array
        features = np.array(features)
        target = np.array(self.target).reshape((-1, ))

        # Running average of the feature importances over all iterations
        feature_importance_values = np.zeros(len(feature_names))
        
        for iteration in range(n_iterations):

            model = estimator_class(n_estimators=1000, learning_rate = 0.05, verbose = -1)
                
            # If training using early stopping need a validation set
            if early_stopping:
                
                # A different (but reproducible) split per iteration; with one fixed
                # seed every iteration would train on the same split and the
                # averaging would not reduce any variance.
                train_features, valid_features, train_labels, valid_labels = train_test_split(features, 
                                                                                              target, 
                                                                                              test_size = 0.33,
                                                                                              random_state = 42 + iteration)

                # Train the model with early stopping
                # NOTE(review): newer LightGBM releases appear to have dropped the
                # `early_stopping_rounds`/`verbose` fit arguments in favour of callbacks;
                # confirm against the installed version before upgrading.
                model.fit(train_features, train_labels, eval_metric = eval_metric,
                          eval_set = [(valid_features, valid_labels)],
                          early_stopping_rounds = 100, verbose = -1)
                
            else:
                model.fit(features, target)

            # Record the feature importances
            feature_importance_values += model.feature_importances_ / n_iterations

        feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})

        # Sort features according to importance
        feature_importances = feature_importances.sort_values('importance', ascending = False).reset_index(drop = True)

        # Normalize the feature importances to add up to one
        feature_importances['normalized_importance'] = feature_importances['importance'] / feature_importances['importance'].sum()
        feature_importances['cumulative_importance'] = np.cumsum(feature_importances['normalized_importance'])

        # Extract the features with zero importance
        record_zero_importance = feature_importances[feature_importances['importance'] == 0.0]
        
        to_drop = list(record_zero_importance['feature'])

        self.feature_importances = feature_importances
        self.record_zero_importance = record_zero_importance
        self.ops['zero_importance'] = to_drop
        
        # print('\n%d features with zero importance after one-hot encoding.\n' % len(self.ops['zero_importance']))
                     
            
    def identify_low_importance(self, X):
        """
        Flag the tail of the importance ranking that is not required to reach
        the `cumulative_importance` fraction of the total feature importance.

        With a threshold of 0.95, for example, every feature whose running
        (cumulative) importance exceeds 0.95 is recorded for removal.

        `X` is accepted for symmetry with the other identification methods but
        is not read here; the method works purely on `self.feature_importances`.

        Side effects: re-sorts `self.feature_importances` by cumulative
        importance, stores the flagged rows in `self.record_low_importance` and
        their names in `self.ops['low_importance']`.

        Raises NotImplementedError when no feature importances are available.
        """
        importances = self.feature_importances

        # Nothing can be ranked until the importances have been computed
        if importances is None:
            raise NotImplementedError("""Feature importances have not yet been determined. 
                                         Call the `identify_zero_importance` method first.""")

        # Ascending cumulative importance puts the most important features first
        ranked = importances.sort_values('cumulative_importance')
        self.feature_importances = ranked

        # Rows beyond the configured threshold are the ones not needed
        beyond_threshold = ranked['cumulative_importance'] > self.cumulative_importance
        surplus = ranked[beyond_threshold]

        self.record_low_importance = surplus
        self.ops['low_importance'] = list(surplus['feature'])
        
    def remove(self, X, methods, keep_one_hot = True):
        """
        Drop the features flagged by the chosen identification methods.

        Parameters
        --------
            X : dataframe
                Original (not one-hot encoded) data. Only used when `methods`
                is a list without importance-based methods and
                `self.one_hot_correlated` is falsy.
            methods : 'all' or list of methods
                'all' uses every method recorded in `self.ops`; otherwise only
                the listed methods are applied. Valid names are
                ['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance']
            keep_one_hot : boolean, default = True
                When False, the one-hot encoded columns are dropped as well.

        Return
        --------
            data : dataframe
                Dataframe with the identified features removed

        Raises
        --------
            NotImplementedError
                If a requested method has no entry in `self.ops`.

        Notes
        --------
            - Importance-based methods operate on `self.data_all`, i.e. the data
              including the one-hot encoded columns.
            - The dropped column names are stored in `self.removed_features`.
        """
        if methods == 'all':
            # Importance-based results refer to the one-hot encoded frame
            data = self.data_all

            # Union of everything flagged by any method that has been run
            flagged = set(chain.from_iterable(self.ops.values()))

        else:
            # The one-hot encoded frame is needed as soon as one-hot columns may be flagged
            needs_one_hot = ('zero_importance' in methods
                             or 'low_importance' in methods
                             or self.one_hot_correlated)
            data = self.data_all if needs_one_hot else X

            collected = []
            for method in methods:
                # A method can only contribute if it has been run before
                if method not in self.ops:
                    raise NotImplementedError('%s method has not been run' % method)
                collected.extend(self.ops[method])

            # De-duplicate features flagged by several methods
            flagged = set(collected)

        features_to_drop = list(flagged)

        if not keep_one_hot:
            if self.one_hot_features is None:
                print('Data has not been one-hot encoded')
            else:
                features_to_drop = list(set(features_to_drop) | set(self.one_hot_features))

        # Drop first so `removed_features` is only updated when the drop succeeded
        trimmed = data.drop(columns = features_to_drop)
        self.removed_features = features_to_drop

        return trimmed
        
    def reset_plot(self):
        """
        Restore matplotlib's runtime configuration to its defaults.

        The previous implementation assigned `plt.rcParamsDefault` to the name
        `plt.rcParams`. That only rebinds an attribute of the pyplot module: the
        active configuration object is left untouched, and later writes through
        `plt.rcParams[...]` would modify the defaults dictionary itself.
        `plt.rcdefaults()` resets the active configuration in place instead.
        """
        plt.rcdefaults()

    
    def transform(self, X , y = None ):
        """
        Run every identification method on a copy of `X` and return the data
        with all flagged features (and the one-hot encoded columns) removed.

        Parameters
        --------
            X : dataframe
                Input features; the caller's frame is not modified.
            y : ignored
                Present for scikit-learn API compatibility only.

        Return
        --------
            dataframe
                Result of `remove(methods='all', keep_one_hot=False)`.

        Notes
        --------
            The gradient boosting step runs 10 iterations with early stopping.
            Its evaluation metric follows `self.ml_task`: 'auc' for
            classification (as before) and 'l2' otherwise, because AUC is a
            binary-classification metric and is not meaningful for a regression
            target.
        """
        X_new = X.copy()

        # 1. Columns with a missing percentage greater than the configured threshold
        self.identify_missing(X_new)

        # 2. Columns with a single unique value
        self.identify_single_unique(X_new)

        # 3. Collinear variables with a correlation above the configured coefficient
        self.identify_collinear(X = X_new)

        # 4. Features with 0.0 importance in a gradient boosting machine (gbm)
        eval_metric = 'auc' if self.ml_task == 'classification' else 'l2'
        self.identify_zero_importance(X = X_new,
                                      eval_metric = eval_metric,
                                      n_iterations = 10,
                                      early_stopping = True)

        # 5. Low importance features not needed for the configured cumulative importance
        self.identify_low_importance(X = X_new)

        # Remove the features flagged by all methods; keep_one_hot=False also drops
        # the one-hot encoded columns that were created for the gbm
        return self.remove(X = X_new, methods = 'all', keep_one_hot=False)

In [None]:
# df_test = new_data.copy()

# display(df_test.describe())

# target = df_raw["EnrollmentDuration"]
# print(target)
# instance = FeatureSelectorTransformerAttributes(target = target)
# new = instance.fit_transform(X = df_test)
# new.head(10)


## Standard Scaler for numerical values

In [None]:
class StandardScalerTransformer(TransformerMixin):
    """
    DataFrame-preserving wrapper around `StandardScaler`.

    After `fit`, the fitted scaler is available as `ss`, and its `mean_` and
    `scale_` are exposed as pandas Series indexed by the column names.
    `transform` returns a DataFrame with the same index and columns as its input.
    """

    def __init__(self):
        # Populated by fit()
        self.ss = None
        self.mean_ = None
        self.scale_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on `X` (a DataFrame) and record its statistics per column."""
        self.ss = StandardScaler()
        self.ss.fit(X)

        column_labels = X.columns
        self.mean_ = pd.Series(self.ss.mean_, index=column_labels)
        self.scale_ = pd.Series(self.ss.scale_, index=column_labels)
        return self

    def transform(self, X):
        """Scale `X` with the fitted scaler and re-attach its index and column labels."""
        scaled_values = self.ss.transform(X)
        return pd.DataFrame(scaled_values, index=X.index, columns=X.columns)

## Target Transformers

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

class LabelEncoder(TransformerMixin):
    """
    Discretise a numeric target column into ordinal bin codes.

    Only the first column of the frame passed to `fit` / `transform` is used.

    Strategy:
        equal_interval - same bin size (needs frequency=interval size)
        equal_frequency - same number of occurrences in each bin (needs n_bins and labels)

    The bin boundaries are learned in `fit` (stored in `bins_`) and applied to
    whatever data is passed to `transform`, so the encoder can be used on data
    other than the frame it was fitted on.

    Attributes set by `fit`:
        bins_         - IntervalIndex (equal_interval) or array of edges (equal_frequency)
        results       - Series with the bin codes of the fitted data
        results_table - DataFrame describing the bins (displayed during fit)
        groups        - equal_interval only: interval assigned to each fitted value

    NOTE(review): the class shares its name with sklearn.preprocessing.LabelEncoder;
    make sure the intended one is in scope where it is used.
    """
    def __init__(self, strategy='equal_frequency', n_bins = 5, labels = ['Very Short', 'Short', 'Medium', 'Long', 'Very Long'], frequency = 12):
        self.strategy = strategy
        self.n_bins = n_bins
        # Copy so that instances never share (or accidentally mutate) the default list
        self.labels = list(labels)
        self.frequency = frequency

    def fit(self, X, y=None):
        """
        Learn the bin boundaries from the first column of `X`.

        `y` is ignored; it is accepted so the encoder can be called with the
        usual `fit(X, y)` signature.
        """
        values = X[X.columns[0]]

        if self.strategy == 'equal_interval':
            # Bins of width `frequency` starting at 0 and reaching past the largest value
            upper = int(values.max()) + self.frequency
            self.bins_ = pd.interval_range(start=0, freq=self.frequency, end=upper)
            self.groups = pd.cut(values, bins=self.bins_)
            self.results_table = pd.DataFrame(list(zip(self.bins_, range(0, len(self.bins_)))),
                                              columns=['Group', 'Label'])
        else:
            # Quantile edges; qcut raises if the edges are not unique
            _, self.bins_ = pd.qcut(values, q=self.n_bins, labels=False, retbins=True)
            self.results_table = pd.DataFrame(list(zip(self.bins_, self.labels, range(0, len(self.labels)))),
                                              columns=['Start Value', 'Label', 'Code'])

        self.results = self._encode(values)
        display(self.results_table)
        return self

    def _encode(self, values):
        """Map a Series of raw values to bin codes using the boundaries learned in fit."""
        if self.strategy == 'equal_interval':
            # With IntervalIndex bins, pd.cut yields interval categories; their
            # codes are the bin numbers and -1 marks values outside every interval
            codes = pd.cut(values, bins=self.bins_).cat.codes.astype('int64')
            if (codes < 0).any():
                # Keep the row (and its alignment) but mark it as missing
                codes = codes.where(codes >= 0)
            return codes

        # Values outside the fitted range are assigned to the first / last bin
        clipped = values.clip(lower=self.bins_[0], upper=self.bins_[-1])
        return pd.cut(clipped, bins=self.bins_, labels=False, include_lowest=True)

    def transform(self, X):
        """
        Encode the first column of `X` with the fitted bins.

        Returns a single-column DataFrame with the index of `X` and the name of
        its first column. For `equal_interval`, values that fall into no
        interval are returned as NaN.
        """
        target_column = X.columns[0]
        codes = self._encode(X[target_column])
        return codes.to_frame(name=target_column)

In [None]:
# Try the target encoder on the raw enrolment durations (single-column copy of df_raw).
df_test = df_raw.loc[:, ['EnrollmentDuration']].copy()

# Summary statistics of the un-binned target, for comparison with the bins shown below.
display(df_test.describe())

# Fit the encoder and encode the same data in one step, using the 'equal_frequency' strategy.
instance = LabelEncoder(strategy='equal_frequency')
new = instance.fit_transform(df_test)
display(new)
#display(new['EnrollmentDuration'].value_counts())

## Training, Test, and Validation Sets




In [None]:
#Write Transformer to integrate split into our pipeline
class DataSplit( BaseEstimator, TransformerMixin ):
    """
    Randomly split a preprocessed frame into train, test and validation parts.

    The frame must contain the target column "EnrollmentDuration".
    10% of the rows are held out as test set; the remaining training rows are
    split again (80/20) into the validation-train and validation-test parts.
    Both splits use `random_state=42`, so the result is reproducible.

    Note: a later notebook cell defines a class with the same name (adding
    several sampling strategies); once that cell has run, it replaces this one.
    """
    def __init__( self, debug=False ):
        # Stored under the parameter name as well so BaseEstimator.get_params() works
        self.debug = debug
        self._debug = debug

    def fit( self, X, y = None ):
        """Nothing to learn; returns self."""
        return self

    def transform( self, X, y = None ):
        """
        Returns
        -------
        tuple
            (X_train, y_train, X_test, y_test, V_train, V_test) where
            X_train/y_train and X_test/y_test are the 90/10 split and
            V_train/V_test are the 80/20 split of X_train.
        """
        # Separate the target column from the features
        y_new = X["EnrollmentDuration"]
        X_new = X.drop(columns="EnrollmentDuration")

        # Split into train and test
        X_train, X_test, y_train, y_test = train_test_split(X_new, y_new, test_size=0.1, random_state=42)

        # Split the training part further for validation. The validation labels get
        # their own names so that y_train / y_test stay aligned with X_train / X_test.
        V_train, V_test, v_train, v_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

        return X_train, y_train, X_test, y_test, V_train, V_test

In [None]:
class DataSplit( BaseEstimator, TransformerMixin ):
    """
    Split a preprocessed frame into train, test and validation parts.

    The frame must contain the target column "EnrollmentDuration" (and
    "StartYear" for the time series strategy).

    Strategy:
        random_sampling - randomly put trials into train and test
        stratified_sampling - make sure trials are evenly distributed between the
                              train and test set with respect to the binned
                              enrolment duration
        timeseries_sampling - avoid using future data to train the model: train
                              and test are split by StartYear

    In every strategy 10% of the rows form the test set and the remaining
    training rows are split again 80/20 into V_train / V_test.
    """

    # Lower edges of the duration bins used for stratification: <13, 13-22, 23-36, 37-59, >=60
    _DURATION_BIN_EDGES = [float("-inf"), 13, 23, 37, 60, float("inf")]

    def __init__( self, strategy = "random_sampling", debug=False ):
        self.strategy = strategy
        # Stored under the parameter name as well so BaseEstimator.get_params() works
        self.debug = debug
        self._debug = debug

    def fit( self, X, y = None ):
        """Nothing to learn; returns self."""
        return self

    def transform( self, X, y = None ):
        """
        Returns
        -------
        tuple
            (X_train, y_train, X_test, y_test, V_train, V_test); y_train and
            y_test are the labels of X_train and X_test for every strategy.

        Raises
        ------
        ValueError
            If `strategy` is not one of the three supported names.
        """
        #1 Random Sampling
        if self.strategy == "random_sampling":
            y_new = X["EnrollmentDuration"]
            X_new = X.drop(columns="EnrollmentDuration")

            # Split into train and test
            X_train, X_test, y_train, y_test = train_test_split(X_new, y_new, test_size=0.1, random_state=42)

            # Split the training part further for validation; separate label names keep
            # y_train / y_test aligned with X_train / X_test
            V_train, V_test, v_train, v_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

        #2 Stratified Sampling
        elif self.strategy == "stratified_sampling":
            y_new = X["EnrollmentDuration"]
            X_new = X.drop(columns="EnrollmentDuration")

            # Left-closed bins, so a duration of exactly 60 lands in the last bin.
            # The bins are only used for stratification and are not added to the
            # features, since they are derived from the target.
            duration_bins = pd.cut(y_new, bins=self._DURATION_BIN_EDGES, right=False, labels=False)

            # Split the bins alongside the data so the second split can be stratified too
            X_train, X_test, y_train, y_test, bins_train, bins_test = train_test_split(
                X_new, y_new, duration_bins, stratify=duration_bins, test_size=0.1, random_state=42)

            V_train, V_test, v_train, v_test = train_test_split(
                X_train, y_train, stratify=bins_train, test_size=0.2, random_state=42)

        #3 Time Series Sampling
        elif self.strategy == "timeseries_sampling":
            X_sorted = X.sort_values(by="StartYear")

            y_new = X_sorted["EnrollmentDuration"]
            X_new = X_sorted.drop(columns="EnrollmentDuration")

            # Oldest 90% of the trials for training, newest 10% for testing
            train_size = int(len(X_new) * 0.9)
            X_train, X_test = X_new.iloc[:train_size], X_new.iloc[train_size:]
            y_train, y_test = y_new.iloc[:train_size], y_new.iloc[train_size:]

            # Same idea within the training part for validation
            validation_size = int(len(X_train) * 0.8)
            V_train, V_test = X_train.iloc[:validation_size], X_train.iloc[validation_size:]

        else:
            raise ValueError(
                'strategy must be one of "random_sampling", "stratified_sampling" or '
                '"timeseries_sampling", got %r' % (self.strategy,))

        return X_train, y_train, X_test, y_test, V_train, V_test

In [None]:
# Preview the first rows of the raw trial data.
df_raw.head()

# Final Pipeline

In [None]:
# Column groups: each list names the raw columns that one branch of the FeatureUnion below extracts.
TARGET = ['EnrollmentDuration']
UNIQUE_FEATS = ['ConditionMeshId']
CAT_SINGLE_FEATS = ['HealthyVolunteers', 'Gender', 'IsFDARegulatedDrug', 'IsFDARegulatedDevice', 'DesignPrimaryPurpose', 'EnrollmentType', 'OrgClass','DesignAllocation','DesignInterventionModel']
CAT_MULTIPLE_FEATS = ['Phase', 'StdAge', 'CollaboratorClass', 'LeadSponsorClass', 'ConditionBrowseBranchAbbrev','InterventionType','ArmGroupType','BaselineDenomCountGroupId','BaselineMeasureDispersionType']
CAT_MULTIPLE_TOP_FEATS1 = ['LocationCountry', 'LocationCity', 'LocationFacility']
CAT_MULTIPLE_TOP_FEATS2 = ['Condition', 'ConditionAncestorTerm','Keyword']
NUM_FEATS = ['EnrollmentCount']
AGE_FEATS =['MaximumAge','MinimumAge']
TO_COUNT_FEATS = ['Condition', 'ConditionAncestorTerm', 'CollaboratorClass', 'EligibilityCriteria', 'LocationFacility', 'LocationCity', 'LocationCountry', 'ArmGroupLabel']
TIME_FEAT = ['StartDate']
TEXTUAL_FEATS1 = ['InterventionName', 'OrgFullName','LeadSponsorName', 'CollaboratorName', 'EligibilityCriteria', 'ArmGroupDescription', 'ArmGroupInterventionName', 'ArmGroupLabel', 'OfficialTitle', 'BriefSummary']
TEXTUAL_FEATS2 = ['EventsTimeFrame', 'FlowDropWithdrawType', 'FlowGroupDescription', 'FlowGroupTitle', 'FlowMilestoneType', 'FlowPeriodTitle', 'FlowRecruitmentDetails', 'BaselineCategoryTitle', 'BaselineClassTitle', 'BaselineGroupDescription', 'BaselineGroupTitle', 
                    'BaselineMeasureTitle', 'BaselineMeasureUnitOfMeasure']
WORLDRANK_FEATS = ['LocationFacility']
REGIONAL_FEATS = ['LocationCity', 'LocationState', 'MaximumAge', 'MinimumAge']
POPULATION_FEAT = ['LocationPopulationDensity']

# De-duplicated union of all input columns (several columns appear in more than one group above).
# Used for the pass-through branch 'addFeatures' and for the final 'excluder' step.
ALL_FEATURES = list(set(UNIQUE_FEATS + CAT_SINGLE_FEATS + CAT_MULTIPLE_FEATS + CAT_MULTIPLE_TOP_FEATS1 +  CAT_MULTIPLE_TOP_FEATS2 + NUM_FEATS + TO_COUNT_FEATS + TEXTUAL_FEATS1 + TEXTUAL_FEATS2 + AGE_FEATS  + TIME_FEAT + POPULATION_FEAT + WORLDRANK_FEATS + REGIONAL_FEATS))

# display(df_raw.head(5))

# Preprocessing pipeline: a FeatureUnion of per-group branches, followed by steps that work on the combined frame.
# Most branches follow the same pattern: extract the group's columns, transform them, then exclude the raw columns again.
pipeline = Pipeline([
    ('features', FeatureUnion([
        # The target column is only extracted, not transformed
        ('target', Pipeline([
            ('extract', FeatureSelector(TARGET))
        ])),
        ('categoricals_single', Pipeline([                 
           ('extract', FeatureSelector(CAT_SINGLE_FEATS)),
           ('cat_fill', MissingStringsTransformer(strategy='most_frequent')),
           ('single_one_hot_encoding', SingleOneHotEncoder()),
           ('excluder', FeatureExcluder(CAT_SINGLE_FEATS))
        ])),
        ('categoricals_multiple', Pipeline([
            ('extract', FeatureSelector(CAT_MULTIPLE_FEATS)),
            ('multiple_one_hot_encoding', MultipleOneHotEncoder()),
            ('excluder', FeatureExcluder(CAT_MULTIPLE_FEATS))
        ])),
        # Location columns: encoder configured with strategie="top", top=50
        ('categoricals_top1', Pipeline([
            ('extract', FeatureSelector(CAT_MULTIPLE_TOP_FEATS1)),
            ('multiple_one_hot_encoding', MultipleTopOneHotEncoder(strategie="top", top=50)),
            ('excluder', FeatureExcluder(CAT_MULTIPLE_TOP_FEATS1))
        ])),
        # Condition / keyword columns: same encoder with top=100
        ('categoricals_top2', Pipeline([
            ('extract', FeatureSelector(CAT_MULTIPLE_TOP_FEATS2)),
            ('multiple_one_hot_encoding', MultipleTopOneHotEncoder(strategie="top", top=100)),
            ('excluder', FeatureExcluder(CAT_MULTIPLE_TOP_FEATS2))
        ])),
        # StartDate feeds two separate branches (start month and start year)
        ('startMonth', Pipeline([
            ('extract', FeatureSelector(TIME_FEAT)),      
            ('startMonth', StartMonthTransformer()),
            ('excluder', FeatureExcluder(TIME_FEAT))
        ])),
        ('startYear', Pipeline([
            ('extract', FeatureSelector(TIME_FEAT)),      
            ('startYear', StartYearTransformer()),
            ('excluder', FeatureExcluder(TIME_FEAT))
        ])),
        ('Age_features',Pipeline([
            ('extract', FeatureSelector(AGE_FEATS)),
            ('cat_fill', MissingStringsTransformer(strategy='most_frequent')),
            ('toYear', ToYearTransformer()),
            ('excluder', FeatureExcluder(AGE_FEATS))                      
        ])),
        ('counting_features', Pipeline([
            ('extract', FeatureSelector(TO_COUNT_FEATS)),
            ('counter', DistinctCounter()),
            ('excluder', FeatureExcluder(TO_COUNT_FEATS))
        ])),
        ('textual_features', Pipeline([
            ('extract', FeatureSelector(TEXTUAL_FEATS1)),
            ('counter', TextualFeatureTransformer( n_keywords = 35)),
            ('excluder', FeatureExcluder(TEXTUAL_FEATS1))
        ])),
        # NOTE(review): this branch reuses the step name 'textual_features' of the branch above.
        # Step names are normally expected to be unique -- confirm whether anything relies on the
        # name before renaming one of them.
        ('textual_features', Pipeline([
            ('extract', FeatureSelector(TEXTUAL_FEATS2)),
            ('counter', TextualFeatureTransformer( n_keywords = 15)),
            ('excluder', FeatureExcluder(TEXTUAL_FEATS2))
        ])),
        ('numerics', Pipeline([
            ('extract', FeatureSelector(NUM_FEATS)),
            ('nr_fill', MissingValuesTransformer(imputer = 'KNNImputer', n_neighbors = 5, weights = "uniform")),
            ('excluder', FeatureExcluder(NUM_FEATS))
        ])),
        ('special', Pipeline([
            ('extract', FeatureSelector(UNIQUE_FEATS)),
            ('meshid_transformer', MeshIDTransformer()),
            ('excluder', FeatureExcluder(UNIQUE_FEATS))
        ])),
         ('worldrank', Pipeline([
             ('extract', FeatureSelector(WORLDRANK_FEATS)),
             ('worldrank_transformer', WorldRankTransformer(worldrankData = df_worldrank)),
             ('excluder', FeatureExcluder(WORLDRANK_FEATS))
         ])),
        ('regional', Pipeline([
            ('extract', FeatureSelector(REGIONAL_FEATS)),
            ('regional_transformer', RegionalAgeTransformer(countrylist=countrylist, df_regional=df_regional)),
            ('excluder', FeatureExcluder(REGIONAL_FEATS))
        ])),
        # Pass the raw input columns through unchanged; presumably the steps after the
        # FeatureUnion read them before the final 'excluder' is applied -- TODO confirm
        ('addFeatures', Pipeline([
            ('extract', FeatureSelector(ALL_FEATURES))
        ]))
        
    ])),
    ('patients_distribution', PatientsDistributionTransformer()),
    # Below is the combination of all alternative location transformers. Parameters:
    # - "transformer", one of three: 'totalCombine', 'perCity', 'perCountry'
    # - "strategy": 'simple' or 'weighted'
    # - "mean": 'trials' or 'worldwide'
    # "strategy" and "mean" are only needed if transformer = 'totalCombine'
    ('location_transformation', LocationDataTransformer(df_dbcountry, transformer='totalCombine', strategy='weighted', mean='worldwide')),
    ('excluder', FeatureExcluder(ALL_FEATURES))
])

# Fit and apply the pipeline on a copy of the raw data, restricted to the target and the input columns.
df_copy = df_raw.copy()
new_data = pipeline.fit_transform(df_copy[list(set(TARGET + ALL_FEATURES))])
# Save the pipeline output as a semicolon-separated CSV (the index is written as the first column).
new_data.to_csv(f"pipeline_output.csv", sep=";")

# Quick sanity check of the result: row count, column count and the first rows.
print(f"Length of new_data: {len(new_data)}")
print(f"Number of features: {len(new_data.columns)}")
display(new_data.head(5))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Length of new_data: 982
Number of features: 1043


Unnamed: 0,EnrollmentDuration,DesignAllocation=Non-Randomized,DesignAllocation=Randomized,DesignInterventionModel=Crossover Assignment,DesignInterventionModel=Factorial Assignment,DesignInterventionModel=Parallel Assignment,DesignInterventionModel=Sequential Assignment,DesignInterventionModel=Single Group Assignment,DesignPrimaryPurpose=Diagnostic,DesignPrimaryPurpose=Prevention,DesignPrimaryPurpose=Supportive Care,DesignPrimaryPurpose=Treatment,EnrollmentType=Actual,EnrollmentType=Anticipated,Gender=All,Gender=Female,Gender=Male,HealthyVolunteers=Accepts Healthy Volunteers,HealthyVolunteers=No,IsFDARegulatedDevice=No,IsFDARegulatedDevice=Yes,IsFDARegulatedDrug=No,IsFDARegulatedDrug=Yes,OrgClass=FED,OrgClass=INDUSTRY,OrgClass=NETWORK,OrgClass=NIH,OrgClass=OTHER,Phase=Phase 2,Phase=Phase 3,Phase=Phase 1,StdAge=Child,StdAge=Older Adult,StdAge=Adult,CollaboratorClass=NIH,CollaboratorClass=NETWORK,CollaboratorClass=UNKNOWN,CollaboratorClass=FED,CollaboratorClass=OTHER,CollaboratorClass=INDUSTRY,...,main_country_unemploymentRate,avg_hospitalBed,min_hospitalBed,max_hospitalBed,main_country_hospitalBed,avg_healthExpenditure,min_healthExpenditure,max_healthExpenditure,main_country_healthExpenditure,avg_density,min_density,max_density,main_country_density,avg_fertilityRate,min_fertilityRate,max_fertilityRate,main_country_fertilityRate,avg_medianAge,min_medianAge,max_medianAge,main_country_medianAge,avg_migrantsNet,min_migrantsNet,max_migrantsNet,main_country_migrantsNet,avg_sizeInKm2,min_sizeInKm2,max_sizeInKm2,main_country_sizeInKm2,avg_urbanPopulation,min_urbanPopulation,max_urbanPopulation,main_country_urbanPopulation,avg_worldshare,min_worldshare,max_worldshare,main_country_worldshare,avg_city_population,min_city_population,max_city_population
0,37,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,2.0,2.0,2.0,2.0,17.0,17.0,17.0,17.0,36.0,36.0,36.0,36.0,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,954806.0,954806.0,954806.0,954806.0,9147420.0,9147420.0,9147420.0,9147420.0,83.0,83.0,83.0,83.0,4.25,4.25,4.25,4.25,3588.069664,158.630005,11377.091797
1,172,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,3.0,2.0,2.0,2.0,2.0,17.0,17.0,17.0,17.0,36.0,36.0,36.0,36.0,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,954806.0,954806.0,954806.0,954806.0,9147420.0,9147420.0,9147420.0,9147420.0,83.0,83.0,83.0,83.0,4.25,4.25,4.25,4.25,1717.821167,1717.821167,1717.821167
2,39,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,2.0,2.0,2.0,2.0,17.0,17.0,17.0,17.0,36.0,36.0,36.0,36.0,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,954806.0,954806.0,954806.0,954806.0,9147420.0,9147420.0,9147420.0,9147420.0,83.0,83.0,83.0,83.0,4.25,4.25,4.25,4.25,2541.599437,61.538631,11377.091797
15,93,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,3.0,2.0,2.0,2.0,2.0,17.0,17.0,17.0,17.0,36.0,36.0,36.0,36.0,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,954806.0,954806.0,954806.0,954806.0,9147420.0,9147420.0,9147420.0,9147420.0,83.0,83.0,83.0,83.0,4.25,4.25,4.25,4.25,241.777069,241.777069,241.777069
16,123,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,3.0,2.0,2.0,2.0,2.0,17.0,17.0,17.0,17.0,36.0,36.0,36.0,36.0,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,954806.0,954806.0,954806.0,954806.0,9147420.0,9147420.0,9147420.0,9147420.0,83.0,83.0,83.0,83.0,4.25,4.25,4.25,4.25,7901.219238,7901.219238,7901.219238


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')
# %cd /content/drive/My\ Drive/Team\ Project\ (Sovanta)/Google\ Collab/

In [None]:
# Save the preprocessed frame as CSV; index=False leaves the DataFrame index out of the file.
new_data.to_csv("dataframe_after_preprocessing.csv", index=False)

# Please also run this code to save the pipeline settings (needed for the backend of our web application).
# joblib.dump serialises the pipeline object (fitted in the cell above) to 'final_pipeline.joblib'.
import joblib
joblib.dump(pipeline, 'final_pipeline.joblib')

# Final Pipeline (to use in target and categorical encoding)








In [None]:
class TargetConditionTransformer( TransformerMixin, BaseEstimator ):
    """Split the list-valued ``Condition`` column into two scalar columns.

    ``transform`` returns a new DataFrame holding the input with its index
    reset (the previous index becomes a regular column, named ``index`` when
    the index is unnamed), plus:

    * ``Condition1`` - the first entry of each row's ``Condition`` sequence.
    * ``Condition2`` - the second entry, or the literal
      ``"No second Condition"`` when the row has only one entry.

    Parameters
    ----------
    debug : bool, default=False
        Stored on the instance; not read anywhere in this class.
    """

    def __init__( self, debug=False):
        # BaseEstimator.get_params()/clone()/repr look up every constructor
        # argument as an attribute with the same name, so expose it publicly.
        self.debug = debug
        # Keep the original private attribute for code that may still read it.
        self._debug = debug

    def fit( self, X, y = None ):
        """Stateless transformer: nothing is learned, ``self`` is returned."""
        return self

    def transform(self, X , y = None ):
        """Return ``X`` with ``Condition1`` / ``Condition2`` columns appended.

        Parameters
        ----------
        X : DataFrame or Series
            Must provide a ``Condition`` column (or be a Series named
            ``Condition``) whose values are indexable sequences.
        y : ignored

        Raises
        ------
        KeyError
            If there is no ``Condition`` column.
        IndexError
            If a row's ``Condition`` sequence is empty.
        """
        # reset_index() returns a new frame, so the caller's data is not modified.
        X_new = pd.DataFrame(X).reset_index()
        conditions = X_new["Condition"]

        # Assign whole columns at once. The previous per-cell chained assignment
        # (X_new[col].iloc[i] = value) may write to a temporary copy and is a
        # silent no-op under pandas Copy-on-Write.
        X_new["Condition1"] = [entry[0] for entry in conditions]
        X_new["Condition2"] = [
            entry[1] if len(entry) > 1 else "No second Condition"
            for entry in conditions
        ]
        return X_new

In [None]:
# Column groups of the raw trials DataFrame. Each group is consumed by the
# pipeline branch that selects it via FeatureSelector(<GROUP>).
TARGET = ['EnrollmentDuration']
UNIQUE_FEATS = ['ConditionMeshId']
CAT_SINGLE_FEATS = ['HealthyVolunteers', 'Gender', 'IsFDARegulatedDrug', 'IsFDARegulatedDevice', 'DesignPrimaryPurpose', 'EnrollmentType', 'OrgClass','DesignAllocation','DesignInterventionModel']
CAT_MULTIPLE_FEATS = ['Phase', 'StdAge', 'CollaboratorClass', 'LeadSponsorClass', 'ConditionBrowseBranchAbbrev','InterventionType','ArmGroupType','BaselineDenomCountGroupId','BaselineMeasureDispersionType']
CAT_MULTIPLE_TOP_FEATS1 = ['LocationCountry', 'LocationCity', 'LocationFacility']
CAT_MULTIPLE_TOP_FEATS2 = ['Condition', 'ConditionAncestorTerm','Keyword']
NUM_FEATS = ['EnrollmentCount']
AGE_FEATS =['MaximumAge','MinimumAge']
TO_COUNT_FEATS = ['Condition', 'ConditionAncestorTerm', 'CollaboratorClass', 'EligibilityCriteria', 'LocationFacility', 'LocationCity', 'LocationCountry', 'ArmGroupLabel']
TIME_FEAT = ['StartDate']
TEXTUAL_FEATS1 = ['InterventionName', 'OrgFullName','LeadSponsorName', 'CollaboratorName', 'EligibilityCriteria', 'ArmGroupDescription', 'ArmGroupInterventionName', 'ArmGroupLabel', 'OfficialTitle', 'BriefSummary']
TEXTUAL_FEATS2 = ['EventsTimeFrame', 'FlowDropWithdrawType', 'FlowGroupDescription', 'FlowGroupTitle', 'FlowMilestoneType', 'FlowPeriodTitle', 'FlowRecruitmentDetails', 'BaselineCategoryTitle', 'BaselineClassTitle', 'BaselineGroupDescription', 'BaselineGroupTitle', 
                    'BaselineMeasureTitle', 'BaselineMeasureUnitOfMeasure']
WORLDRANK_FEATS = ['LocationFacility']
REGIONAL_FEATS = ['LocationCity', 'LocationState', 'MaximumAge', 'MinimumAge']
POPULATION_FEAT = ['LocationPopulationDensity']

# Deduplicated union of the feature groups above. TARGET and CAT_SINGLE_FEATS
# are deliberately not part of it, exactly as before.
# Both textual groups defined in this cell are used; the former reference to
# TEXTUAL_FEATS relied on a name that this cell does not define.
# sorted() makes the column order reproducible across runs (set order is not).
ALL_FEATURES = sorted(set(
    UNIQUE_FEATS
    + CAT_MULTIPLE_FEATS
    + CAT_MULTIPLE_TOP_FEATS1
    + CAT_MULTIPLE_TOP_FEATS2
    + NUM_FEATS
    + TO_COUNT_FEATS
    + TEXTUAL_FEATS1
    + TEXTUAL_FEATS2
    + AGE_FEATS
    + TIME_FEAT
    + POPULATION_FEAT
    + WORLDRANK_FEATS
    + REGIONAL_FEATS
))

# display(df_raw.head(5))

# Preprocessing pipeline: a FeatureUnion with one branch per feature group,
# followed by the patient-distribution and location transformers and a final
# FeatureExcluder over ALL_FEATURES.
# Every FeatureUnion branch has a unique name: scikit-learn rejects duplicate
# step names, and the two textual branches previously shared one.
pipeline = Pipeline([
    ('features', FeatureUnion([
        ('target', Pipeline([
            ('extract', FeatureSelector(TARGET))
        ])),
        ('targetcondition', Pipeline([
            # A bare column name (not a list) is passed here, unlike the other branches.
            ('extract', FeatureSelector('Condition')),
            ('target_Condition', TargetConditionTransformer())
        ])),
        ('categoricals_single', Pipeline([
            ('extract', FeatureSelector(CAT_SINGLE_FEATS)),
            ('cat_fill', MissingStringsTransformer(strategy='most_frequent'))
        ])),
        ('categoricals_multiple', Pipeline([
            ('extract', FeatureSelector(CAT_MULTIPLE_FEATS)),
            ('multiple_one_hot_encoding', MultipleOneHotEncoder()),
            ('excluder', FeatureExcluder(CAT_MULTIPLE_FEATS))
        ])),
        ('categoricals_top1', Pipeline([
            ('extract', FeatureSelector(CAT_MULTIPLE_TOP_FEATS1)),
            ('multiple_one_hot_encoding', MultipleTopOneHotEncoder(strategie="top", top=50)),
            ('excluder', FeatureExcluder(CAT_MULTIPLE_TOP_FEATS1))
        ])),
        ('categoricals_top2', Pipeline([
            ('extract', FeatureSelector(CAT_MULTIPLE_TOP_FEATS2)),
            ('multiple_one_hot_encoding', MultipleTopOneHotEncoder(strategie="top", top=100)),
            ('excluder', FeatureExcluder(CAT_MULTIPLE_TOP_FEATS2))
        ])),
        ('startMonth', Pipeline([
            ('extract', FeatureSelector(TIME_FEAT)),
            ('startMonth', StartMonthTransformer()),
            ('excluder', FeatureExcluder(TIME_FEAT))
        ])),
        ('startYear', Pipeline([
            ('extract', FeatureSelector(TIME_FEAT)),
            ('startYear', StartYearTransformer()),
            ('excluder', FeatureExcluder(TIME_FEAT))
        ])),
        ('Age_features', Pipeline([
            ('extract', FeatureSelector(AGE_FEATS)),
            ('cat_fill', MissingStringsTransformer(strategy='most_frequent')),
            ('toYear', ToYearTransformer()),
            ('excluder', FeatureExcluder(AGE_FEATS))
        ])),
        ('counting_features', Pipeline([
            ('extract', FeatureSelector(TO_COUNT_FEATS)),
            ('counter', DistinctCounter()),
            ('excluder', FeatureExcluder(TO_COUNT_FEATS))
        ])),
        ('textual_features1', Pipeline([
            ('extract', FeatureSelector(TEXTUAL_FEATS1)),
            ('counter', TextualFeatureTransformer( n_keywords = 35)),
            ('excluder', FeatureExcluder(TEXTUAL_FEATS1))
        ])),
        ('textual_features2', Pipeline([
            ('extract', FeatureSelector(TEXTUAL_FEATS2)),
            ('counter', TextualFeatureTransformer( n_keywords = 15)),
            ('excluder', FeatureExcluder(TEXTUAL_FEATS2))
        ])),
        ('numerics', Pipeline([
            ('extract', FeatureSelector(NUM_FEATS)),
            ('nr_fill', MissingValuesTransformer(imputer = 'KNNImputer', n_neighbors = 5, weights = "uniform")),
            ('excluder', FeatureExcluder(NUM_FEATS))
        ])),
        ('special', Pipeline([
            ('extract', FeatureSelector(UNIQUE_FEATS)),
            ('meshid_transformer', MeshIDTransformer()),
            ('excluder', FeatureExcluder(UNIQUE_FEATS))
        ])),
        ('worldrank', Pipeline([
            ('extract', FeatureSelector(WORLDRANK_FEATS)),
            ('worldrank_transformer', WorldRankTransformer(worldrankData = df_worldrank)),
            ('excluder', FeatureExcluder(WORLDRANK_FEATS))
        ])),
        ('regional', Pipeline([
            ('extract', FeatureSelector(REGIONAL_FEATS)),
            ('regional_transformer', RegionalAgeTransformer()),
            ('excluder', FeatureExcluder(REGIONAL_FEATS))
        ])),
        # Passes the raw ALL_FEATURES columns through unchanged; the two
        # transformers after the union and the final excluder follow this branch.
        ('addFeatures', Pipeline([
            ('extract', FeatureSelector(ALL_FEATURES))
        ]))
    ])),
    ('patients_distribution', PatientsDistributionTransformer()),
    # LocationDataTransformer bundles all alternative location transformers. Parameters:
    # - "transformer": one of 'totalCombine', 'perCity', 'perCountry'
    # - "strategy": 'simple' or 'weighted'
    # - "mean": 'trials' or 'worldwide'
    # "strategy" and "mean" are only needed if transformer = 'totalCombine'.
    ('location_transformation', LocationDataTransformer(df_dbcountry, transformer='totalCombine', strategy='weighted', mean='worldwide')),
    ('excluder', FeatureExcluder(ALL_FEATURES))
])

# Run the pipeline on a copy, leaving df_raw itself as it was.
df_copy = df_raw.copy()

# Only the columns named in one of the feature groups are handed to the pipeline.
target_data = pipeline.fit_transform(
    df_copy[list(set(TARGET + ALL_FEATURES + CAT_SINGLE_FEATS ))]
)

# Remove the helper 'index' column (presumably the one created by reset_index()
# in TargetConditionTransformer) before exporting with ';' as the separator.
target_data = target_data.drop(columns = ['index'])
target_data.to_csv(f"pipeline_target_output.csv", sep=";")

# Short summary plus a preview of the first rows.
print(
    f"Length of target_data: {len(target_data)}",
    f"Number of features: {len(target_data.columns)}",
    sep="\n",
)
display(target_data.head(5))