In [21]:
import pandas as pd
import numpy as np
import importlib

from sklearn.datasets import fetch_california_housing

import sys
# Make the project's `src` directory importable so `transformers.*` can be imported below.
# NOTE(review): this is a machine-specific absolute path; the notebook will not run elsewhere
# as-is — consider a path relative to the repository instead.
sys.path.append('c:/Users/User/Documents/User/Work/Admiral/Pipeline_Builder/pipelineconstruction/src')

# Numeric Encoding

The following code is useful for numeric encodings.

In [17]:
# Load the control (metadata) sheet that drives pipeline construction
control_sheet = pd.read_csv(
    filepath_or_buffer="../controls/california_housing_control_sheet.csv"
)

Get the scikit-learn dataset

In [18]:
# Fetch the scikit-learn California housing dataset (return_X_y=False returns the full dataset object)
california_housing_data = fetch_california_housing(return_X_y=False)

# Target vector, and the features as a DataFrame labelled with the dataset's feature names
y = california_housing_data.target
X_df = pd.DataFrame(
    data=california_housing_data.data,
    columns=california_housing_data.feature_names,
)


In [19]:
# Preview the first rows of the raw feature frame
X_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


# Create pipeline from Metadata

In [22]:
# Import standard functions
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.impute import MissingIndicator

In [23]:
# Import custom functions
from transformers.IdentifyUnknowns import IdentifyUnknowns
from transformers.NumericCapping import NumericCapping


In [24]:
# Re-import the modules that define the custom transformers so that edits to their
# source files are picked up without restarting the kernel.
importlib.reload(sys.modules[IdentifyUnknowns.__module__])
importlib.reload(sys.modules[NumericCapping.__module__])

<module 'transformers.NumericCapping' from 'c:/Users/User/Documents/User/Work/Admiral/Pipeline_Builder/pipelineconstruction/src\\transformers\\NumericCapping.py'>

In [25]:

# Build the list of (name, transformer, columns) entries for a ColumnTransformer from the
# control sheet: one entry per row flagged Include == "Y" and Raw == "Y".

# Check that every control-sheet column read below exists.
# "Shadow_Col" is listed too: the loop reads it, so a missing column should fail here with a clear message.
needed_cols = ["FeatureName", "TransformedName", "Include", "Raw", "Shadow_Col", "Missing_Values",
               "Numeric_Unk_Max", "Numeric_Cap_Max", "Numeric_Cap_Min", "Numeric_Unk_Min",
               "Impute_Strategy", "Impute_Value"]
for need_col in needed_cols:
    if need_col not in control_sheet.columns:
        raise ValueError(f"{need_col} must be in control_sheet.columns")

# Output list; created before the loop so it exists even when the control sheet has no rows
col_transormer_list = []

# Loop over all rows of the control sheet (one row per feature)
for ii in range(control_sheet.shape[0]):
    row = control_sheet.iloc[ii]

    # Read the names first so that the validation errors below refer to the current feature
    in_feat_ii = row["FeatureName"]
    out_feat_ii = row["TransformedName"]

    # Check values for boolean columns.
    # pandas reads blank CSV cells as NaN by default, so map NaN to "" before validating.
    flags = {}
    for check_col in ["Include", "Raw", "Shadow_Col"]:
        flag = "" if pd.isna(row[check_col]) else row[check_col]
        if flag not in ["Y", "N", ""]:
            raise ValueError(f'for feature-{in_feat_ii} column-{check_col} must be in ["Y", "N", ""]. Is  {flag}')
        flags[check_col] = flag

    # Only build a transformer for raw features that are flagged for inclusion
    if flags["Include"] != "Y" or flags["Raw"] != "Y":
        continue

    # Start with an empty list (no transformations)
    feature_union_list = []

    # Removing known missings ---------------------------------------------------
    # Parse this row's ";"-separated list of known-missing codes (e.g. "-1; -2").
    # A blank cell (NaN) or empty tokens mean "no known-missing codes".
    raw_unks = row["Missing_Values"]
    if pd.isna(raw_unks):
        str_unks = []
    else:
        str_unks = [tok for tok in str(raw_unks).split(";") if tok.strip()]
    num_unks = [float(unk) for unk in str_unks]

    # If there are some values of known missings (e.g. -1) add transformer to replace them with NAs
    if len(num_unks) > 0:
        feature_union_list.append(("unk_levels", IdentifyUnknowns(unk_levels=[num_unks])))

    # Applying caps and collars to values ---------------------------------------------------
    # Read in caps
    unk_max = row["Numeric_Unk_Max"]
    cap_max = row["Numeric_Cap_Max"]
    cap_min = row["Numeric_Cap_Min"]
    unk_min = row["Numeric_Unk_Min"]
    # Add the capping transformer if any of the four limits is not NA
    if any(pd.notna(limit) for limit in (unk_max, cap_max, cap_min, unk_min)):
        feature_union_list.append(("capping", NumericCapping(unk_max=[unk_max], cap_max=[cap_max], cap_min=[cap_min], unk_min=[unk_min])))

    # Impute missing values -------------------------------------------------------
    Impute_Strategy = row["Impute_Strategy"]
    impute_value = float(row["Impute_Value"])

    # Check value of Impute_Strategy
    if Impute_Strategy not in ["mean", "median", "most_frequent", "constant"]:
        raise ValueError(f'For feature-{in_feat_ii} Impute Stratergy must be in ["mean", "median", "most_frequent", "constant"] is {Impute_Strategy}')
    # Check constant value given if needed (NaN != NaN, so use an explicit NA test)
    if Impute_Strategy == "constant" and pd.isna(impute_value):
        raise ValueError(f'For feature-{in_feat_ii} if Impute Stratergy is "constant", impute_value can not be NA.')

    # Steps before imputation are kept separately: the shadow column must see the NAs
    feature_union_list_no_impute = feature_union_list.copy()
    feature_union_list.append(("impute", SimpleImputer(missing_values=np.nan, strategy=Impute_Strategy, fill_value=impute_value)))

    # Combine all features -------------------------------------------------------
    if flags["Shadow_Col"] == "Y":
        # Second pipeline producing the extra missing-indicator column.
        # The pre-imputation steps are only wrapped in a Pipeline when there are any,
        # because a Pipeline with no steps is not valid.
        shadow_steps = []
        if feature_union_list_no_impute:
            shadow_steps.append((out_feat_ii + "_imp", Pipeline(feature_union_list_no_impute)))
        shadow_steps.append((out_feat_ii + "_shadow", MissingIndicator(missing_values=np.nan, features="all")))

        pre_col_ii = FeatureUnion([(out_feat_ii, Pipeline(feature_union_list)),
                                   (out_feat_ii + "_NA", Pipeline(shadow_steps))])
    else:
        # If shadow column not needed use existing pipeline
        pre_col_ii = Pipeline(feature_union_list)

    # Add column to transformation list
    col_transormer_list.append((out_feat_ii, pre_col_ii, [in_feat_ii]))

In [26]:
# Preview the raw feature frame again before it is passed to the ColumnTransformer
X_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [27]:
# Assemble the per-feature transformers into one ColumnTransformer, then fit it on the
# raw features and transform them in a single call.
col_trans = ColumnTransformer(transformers=col_transormer_list)
X_out = col_trans.fit_transform(X_df)

In [29]:
from transformers.names_from_ColumnTransformer import names_from_ColumnTransformer  

# Wrap the transformed array in a DataFrame, labelling the columns with the names that
# names_from_ColumnTransformer derives from the ColumnTransformer.
X_out_df = pd.DataFrame(X_out,columns=names_from_ColumnTransformer(column_transformer=col_trans))
X_out_df.head()


Unnamed: 0,MedInc,HouseAge,HouseAge_NA,AveRooms,AveRooms_NA,AveBedrms,AveBedrms_NA,Population,Population_NA,AveOccup,AveOccup_NA,Latitude,Longitude
0,8.0,41.0,0.0,3.0,1.0,1.02381,0.0,500.0,0.0,2.555556,0.0,37.88,-122.23
1,8.0,30.0,0.0,3.0,1.0,0.97188,0.0,2000.0,0.0,2.109842,0.0,37.86,-122.22
2,7.2574,33.0,1.0,3.0,1.0,1.073446,0.0,500.0,0.0,2.80226,0.0,37.85,-122.24
3,5.6431,33.0,1.0,3.0,1.0,1.073059,0.0,558.0,0.0,2.547945,0.0,37.85,-122.25
4,4.0,33.0,1.0,3.0,1.0,1.081081,0.0,565.0,0.0,2.181467,0.0,37.85,-122.25


In [257]:
import types

In [263]:
def get_feature_names(self):
    """Return the names produced by ``names_from_ColumnTransformer`` for ``self``.

    Written with a ``self`` first argument so it can be bound to an object as a method;
    ``self`` is forwarded as the ``column_transformer`` argument.
    """
    feature_names = names_from_ColumnTransformer(column_transformer=self)
    return feature_names

In [264]:
# Bind get_feature_names as a method on this one col_trans instance (not on the class);
# it takes precedence over any class-level method of the same name for this object.
col_trans.get_feature_names = types.MethodType(get_feature_names, col_trans)

In [265]:
# Call the instance-bound helper to list the output column names
col_trans.get_feature_names()

['MedInc',
 'HouseAge',
 'HouseAge_NA',
 'AveRooms',
 'AveRooms_NA',
 'AveBedrms',
 'AveBedrms_NA',
 'Population',
 'Population_NA',
 'AveOccup',
 'AveOccup_NA',
 'Latitude',
 'Longitude']

# Get Column Names

In [211]:
from names_from_ColumnTransformer import names_from_ColumnTransformer

In [210]:
# Reload the module that defines names_from_ColumnTransformer so source edits are picked up.
# NOTE(review): the import directly above uses a top-level `names_from_ColumnTransformer`
# module, while an earlier cell imports it from `transformers.`; the execution counts here
# are also out of order — confirm which module path is current.
importlib.reload(sys.modules[names_from_ColumnTransformer.__module__])

<module 'names_from_ColumnTransformer' from 'c:\\Users\\User\\Documents\\User\\Work\\Admiral\\Pipeline_Builder\\names_from_ColumnTransformer.py'>

In [212]:
# List the output column names derived from col_trans.
# NOTE(review): the saved output below differs from the names shown earlier in the notebook
# and looks stale — re-run to confirm.
names_from_ColumnTransformer(column_transformer=col_trans)

['MedInc', 'HouseAge_IMP', 'HouseAge_NA']

In [193]:
# Debug probe: length of the first (name, transformer) entry in the fitted "HouseAge"
# transformer's transformer_list.
len(col_trans.named_transformers_["HouseAge"].transformer_list[0])

2

# Working Functions

In [54]:
# Display the control sheet.
# NOTE(review): the saved output lists a column `Impute_Stratergy`, whereas the
# pipeline-construction cell requires `Impute_Strategy` — the output appears stale; re-run.
control_sheet

Unnamed: 0,FeatureName,TransformedName,Include,Raw,Numeric,Numeric_Unk_Max,Numeric_Cap_Max,Numeric_Cap_Min,Numeric_Unk_Min,Missing_Values,Shadow_Col,Impute_Stratergy,Impute_Value
0,MedInc,MedInc,Y,Y,Y,9.0,8.0,4.0,3.0,-1; -2,N,median,
1,HouseAge,HouseAge,Y,Y,Y,50.0,45.0,30.0,20.0,,Y,most_frequent,
2,AveRooms,AveRooms,N,Y,Y,,,,,,,,
3,AveBedrms,AveBedrms,N,Y,Y,,,,,,,,
4,Population,Population,N,Y,Y,,,,,,,,
5,AveOccup,AveOccup,N,Y,Y,,,,,,,,
6,Latitude,Latitude,N,Y,Y,,,,,,,,
7,Longitude,Longitude,N,Y,Y,,,,,,,,


In [None]:
# Categorical Encodings

Here we will build some functions that are useful for categorical and ordinal factors.

In [184]:
# Build a small demo frame with two string columns, one integer column and one float column.
# A seeded Generator is used so that re-running the notebook reproduces the same 20 rows.
demo_rng = np.random.default_rng(seed=42)
cat_df = pd.DataFrame(demo_rng.choice(np.array(["huge", "big", "small", "tiny"]), size=20), columns=["size"])
cat_df["fruit"] = demo_rng.choice(np.array(["apple", "bannana", "cucumber", "dill"]), size=20)
cat_df["value_int"] = demo_rng.choice(np.array([1, 2, 3, 4, 5, 6, 7, 9]), size=20)
cat_df["value_float"] = demo_rng.choice(np.array([1., 2., 3., 4., 5., 6., 7., 0.9]), size=20)
cat_df

Unnamed: 0,size,fruit,value_int,value_float
0,tiny,bannana,9,6.0
1,small,cucumber,6,5.0
2,big,cucumber,1,3.0
3,big,cucumber,2,1.0
4,big,cucumber,5,1.0
5,small,dill,4,3.0
6,tiny,apple,7,4.0
7,huge,dill,5,3.0
8,big,apple,1,0.9
9,huge,bannana,9,7.0


In [189]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import _deprecate_positional_args

class CategoricalCatLimit(BaseEstimator, TransformerMixin):
    """Keep only the most frequent levels of each column; replace all other values.

    Parameters
    ----------
    cat_num : list of int, optional
        Number of most-frequent levels to keep, one entry per column of X.
        When ``None``, 10 levels are kept for every column.
    other_value : list, optional
        Replacement for values that are not retained, one entry per column of X.
        When ``None``, ``"other"`` is used for object/string columns and ``-1`` for
        integer/float columns.

    Attributes
    ----------
    allowed_value : list of list
        Retained levels per column, most frequent first (ties in ascending level
        order). Empty until ``fit`` is called; rebuilt on every ``fit``.
    cat_num_ : list of int
        Per-column level limits resolved during ``fit``.
    other_value_ : list
        Per-column replacement values resolved during ``fit``.
    n_features_in_ : int
        Number of columns seen during ``fit``.

    Notes
    -----
    ``fit`` does not modify ``cat_num`` / ``other_value``; resolved values are stored in
    the trailing-underscore attributes so that the estimator can be refitted on
    different data.
    """

    def __init__(self, cat_num=None, other_value=None):
        # Kept from the original interface: fail early when both lists are given
        # but disagree in length.
        if cat_num is not None and other_value is not None:
            if len(cat_num) != len(other_value):
                raise ValueError("length of cat_num and other_value must be the same")

        self.cat_num = cat_num
        self.other_value = other_value
        self.allowed_value = []

    @staticmethod
    def _as_frame(X):
        """Return X as a DataFrame (ndarray input is wrapped; other types are rejected)."""
        if isinstance(X, pd.DataFrame):
            return X
        if isinstance(X, np.ndarray):
            if X.ndim != 2:
                raise ValueError("X must be 2-dimensional")
            return pd.DataFrame(X)
        raise TypeError("X must be a pandas DataFrame or a numpy ndarray")

    @staticmethod
    def _default_other_value(dtype):
        """Pick the default replacement value for a column of the given dtype."""
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            return "other"
        # Any integer/float width is accepted (e.g. both int32 and int64)
        if pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_float_dtype(dtype):
            return -1
        raise ValueError(f"Data type of {dtype} not know")

    def _replace_mask(self, values, col_ii):
        """Boolean mask of the entries of ``values`` that are not retained for column ``col_ii``."""
        allowed = self.allowed_value[col_ii]
        return np.fromiter((v not in allowed for v in values), dtype=bool, count=len(values))

    def fit(self, X, y=None):
        """Learn the retained levels of every column of X.

        Parameters
        ----------
        X : pandas.DataFrame or 2-D numpy.ndarray
        y : ignored

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``cat_num`` or ``other_value`` does not have one entry per column of X,
            or if a default replacement is needed for a column whose dtype is neither
            object/string nor integer/float.
        TypeError
            If X is neither a DataFrame nor an ndarray.
        """
        frame = self._as_frame(X)
        n_cols = frame.shape[1]

        # Resolve the per-column limits without touching the constructor parameter
        if self.cat_num is None:
            cat_num = [10] * n_cols
        else:
            cat_num = list(self.cat_num)
            if len(cat_num) != n_cols:
                raise ValueError("Number of columns in X must match the length of cat_num")

        # Resolve the per-column replacement values
        if self.other_value is None:
            other_value = [self._default_other_value(frame.iloc[:, ii].dtype) for ii in range(n_cols)]
        else:
            other_value = list(self.other_value)
            if len(other_value) != n_cols:
                raise ValueError("Number of columns in X must match the length of other_value")

        allowed_value = []
        for ii in range(n_cols):
            # Missing values are never retained as a level (transform never matches NaN
            # anyway), and dropping them keeps np.unique from failing on mixed str/NaN.
            column = frame.iloc[:, ii].dropna()
            levels, counts = np.unique(column.to_numpy(), return_counts=True)
            # Stable sort on descending count: ties stay in np.unique's ascending order
            order = np.argsort(-counts, kind="stable")
            allowed_value.append(levels[order][: cat_num[ii]].tolist())

        # Assign (not append) so that refitting replaces the previous state
        self.cat_num_ = cat_num
        self.other_value_ = other_value
        self.allowed_value = allowed_value
        self.n_features_in_ = n_cols
        return self

    def transform(self, X, y=None):
        """Replace every value that is not a retained level with the column's replacement value.

        Parameters
        ----------
        X : pandas.DataFrame or 2-D numpy.ndarray
            Must have the same number of columns as the data passed to ``fit``.
        y : ignored

        Returns
        -------
        pandas.DataFrame or numpy.ndarray
            A modified copy of X, of the same container type. Fixed-width string arrays
            are returned with object dtype so the replacement value cannot be truncated.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called.
        ValueError
            If X is not 2-dimensional or its column count differs from ``fit``.
        TypeError
            If X is neither a DataFrame nor an ndarray.
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "other_value_")

        if isinstance(X, np.ndarray):
            if X.ndim != 2:
                raise ValueError("X must be 2-dimensional")
            # astype returns a copy; object dtype avoids silently truncating e.g. "other"
            X_ = X.astype(object) if X.dtype.kind in "US" else X.copy()
        elif isinstance(X, pd.DataFrame):
            X_ = X.copy()
        else:
            raise TypeError("X must be a pandas DataFrame or a numpy ndarray")

        if X_.shape[1] != self.n_features_in_:
            raise ValueError("Number of columns in X must match the number of columns seen in fit")

        for col_ii in range(X_.shape[1]):
            if isinstance(X_, np.ndarray):
                replace = self._replace_mask(X_[:, col_ii], col_ii)
                X_[replace, col_ii] = self.other_value_[col_ii]
            else:
                replace = self._replace_mask(X_.iloc[:, col_ii].to_numpy(), col_ii)
                X_.iloc[replace, col_ii] = self.other_value_[col_ii]

        return X_




In [190]:
# Keep the 2 / 3 / 4 / 4 most frequent levels of the four cat_df columns respectively
cat_limit = CategoricalCatLimit(cat_num=[2,3,4,4])
cat_limit.fit(cat_df)
# Levels retained for each column, as learned by fit
print(cat_limit.allowed_value)
# Show cat_df with every non-retained value replaced by the column's replacement value
cat_limit.transform(cat_df)

[['huge', 'small'], ['cucumber', 'bannana', 'apple'], [5, 1, 6, 9], [3.0, 5.0, 2.0, 6.0]]


Unnamed: 0,size,fruit,value_int,value_float
0,other,bannana,9,6.0
1,small,cucumber,6,5.0
2,other,cucumber,1,3.0
3,other,cucumber,-1,-1.0
4,other,cucumber,5,-1.0
5,small,other,-1,3.0
6,other,apple,-1,-1.0
7,huge,other,5,3.0
8,other,apple,1,-1.0
9,huge,bannana,9,-1.0


# Main wrapped up function

In [1]:
from construction.BuildNumericPipeline import BuildNumericPipeline

ModuleNotFoundError: No module named 'construction'

In [321]:
# Reload the module that defines BuildNumericPipeline so source edits are picked up.
# NOTE(review): the saved output of the import cell above is a ModuleNotFoundError, so on a
# fresh kernel BuildNumericPipeline would be undefined here — fix the import path first.
importlib.reload(sys.modules[BuildNumericPipeline.__module__])

<module 'BuildNumericPipeline' from 'c:\\Users\\User\\Documents\\User\\Work\\Admiral\\Pipeline_Builder\\BuildNumericPipeline.py'>

In [322]:
# Build the transformer from the control sheet via the packaged BuildNumericPipeline function
# (presumably the wrapped-up equivalent of the manual construction loop earlier — TODO confirm).
col_tran_main = BuildNumericPipeline(control_sheet)

In [331]:
# Fit and apply the packaged transformer, then label the resulting array with the names
# derived by names_from_ColumnTransformer and preview the first rows.
X_out = col_tran_main.fit_transform(X_df)
X_df_out = pd.DataFrame(X_out, columns=names_from_ColumnTransformer(col_tran_main))
X_df_out.head()

Unnamed: 0,MedInc,HouseAge,HouseAge_NA,AveRooms,AveRooms_NA,AveBedrms,AveBedrms_NA,Population,Population_NA,AveOccup,AveOccup_NA,Latitude,Longitude
0,8.0,41.0,0.0,3.0,1.0,1.02381,0.0,500.0,0.0,2.555556,0.0,37.88,-122.23
1,8.0,30.0,0.0,3.0,1.0,0.97188,0.0,2000.0,0.0,2.109842,0.0,37.86,-122.22
2,7.2574,33.0,1.0,3.0,1.0,1.073446,0.0,500.0,0.0,2.80226,0.0,37.85,-122.24
3,5.6431,33.0,1.0,3.0,1.0,1.073059,0.0,558.0,0.0,2.547945,0.0,37.85,-122.25
4,4.0,33.0,1.0,3.0,1.0,1.081081,0.0,565.0,0.0,2.181467,0.0,37.85,-122.25


In [333]:
# NOTE(review): per the saved output this call raises AttributeError because the Pipeline
# transformers do not provide get_feature_names; names_from_ColumnTransformer(col_tran_main)
# is what the previous cell uses to obtain the names instead.
col_tran_main.get_feature_names()

AttributeError: Transformer MedInc (type Pipeline) does not provide get_feature_names.