In [1]:
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
#Custom Transformer that converts only the 'MS SubClass' column to string
class MSSubClassConvert(BaseEstimator, TransformerMixin):
    """Cast the 'MS SubClass' column of a DataFrame to ``str``.

    Parameters
    ----------
    drop : list, default=[]
        Stored on the instance but not read by this transformer; kept so
        existing callers that pass it keep working.
    """

    #Class Constructor
    def __init__(self, drop=[]):
        self.drop = drop

    #Return self, nothing is learned from the data
    def fit(self, X, y=None):
        return self

    #Method that describes what we need this transformer to do
    def transform(self, X, y=None):
        """Return a copy of ``X`` whose 'MS SubClass' column holds strings."""
        # Work on a copy so the caller's DataFrame is not modified as a
        # side effect of running the pipeline.
        X = X.copy()
        X['MS SubClass'] = X['MS SubClass'].astype(str)
        return X

In [3]:
#Custom Transformer that drops the columns passed to its constructor
class DropCorrelated(BaseEstimator, TransformerMixin):
    """Remove the columns listed in ``drop`` from a DataFrame.

    Parameters
    ----------
    drop : list, default=[]
        Column labels to remove in ``transform``.
    """

    #Class Constructor
    def __init__(self, drop=[]):
        self.drop = drop

    #Nothing to learn, so fitting just hands back the transformer
    def fit(self, X, y=None):
        return self

    #Method that describes what we need this transformer to do
    def transform(self, X, y=None):
        """Return ``X`` without the columns named in ``self.drop``."""
        columns_to_remove = self.drop
        return X.drop(columns=columns_to_remove)

In [4]:
#Custom Transformer that extracts columns passed as argument to its constructor
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Select columns either by explicit name or by dtype family.

    Parameters
    ----------
    feature_names : list or None, default=None
        Columns to pick. If ``feature_names`` is not None, ``by`` is ignored.
    by : {'categorical', 'numerical'} or None, default=None
        Type of features to pick. Only used if ``feature_names`` is None.
        'numerical' selects columns of dtype ``number``; 'categorical'
        selects every other column.
    """

    #Class Constructor
    def __init__(self, feature_names=None, by=None):
        self.feature_names = feature_names
        self.by = by

    #Return self, nothing else to do here
    def fit(self, X, y=None):
        return self

    #Method that describes what we need this transformer to do
    def transform(self, X, y=None):
        """Return the selected columns of ``X``.

        Raises
        ------
        ValueError
            If ``feature_names`` is None and ``by`` is neither
            'categorical' nor 'numerical'.
        """
        if self.feature_names is not None:
            return X[self.feature_names]
        if self.by == "categorical":
            return X.select_dtypes(exclude='number')
        if self.by == "numerical":
            return X.select_dtypes(include='number')
        # Fail loudly: returning None here would only surface later as an
        # unrelated error in whichever pipeline step runs next.
        raise ValueError(
            f"Expected by to be 'categorical' or 'numerical', but got {self.by!r}."
        )

In [5]:
#Custom Transformer that imputes with "None", 0 or the column mode
class StandardImpute(BaseEstimator, TransformerMixin):
    """Fill missing values in selected columns with a fixed strategy.

    Columns named in a list but absent from ``X`` are skipped.

    Parameters
    ----------
    none : list, default=[]
        Columns whose missing values are filled with the string "None".
    zero : list, default=[]
        Columns whose missing values are filled with 0.
    mode : list, default=[]
        Columns whose missing values are filled with the most frequent
        value of that column in the frame passed to ``transform``.
    """

    #Class Constructor
    def __init__(self, none=[], zero=[], mode=[]):
        self.zero = zero
        self.none = none
        self.mode = mode

    #Return self, nothing else to do here
    def fit(self, X, y=None):
        return self

    #Method that describes what we need this transformer to do
    def transform(self, X, y=None):
        """Return a copy of ``X`` with the configured columns imputed."""
        # Work on a copy so the caller's DataFrame is left untouched.
        X = X.copy()

        for col in self.none:
            if col in X.columns:
                X[col] = X[col].fillna("None")

        for col in self.zero:
            if col in X.columns:
                X[col] = X[col].fillna(0)

        for col in self.mode:
            if col in X.columns:
                modes = X[col].mode()
                # mode() is empty when the column has no non-null values;
                # there is nothing to impute with, so leave the column as is.
                if not modes.empty:
                    X[col] = X[col].fillna(modes.iloc[0])

        return X

In [6]:
#Custom Transformer that imputes every missing value with 0
class ImputeZero(BaseEstimator, TransformerMixin):
    """Replace all missing values in the input with 0."""

    #Class Constructor (no parameters)
    def __init__(self):
        pass

    #Nothing to learn, so fitting just hands back the transformer
    def fit(self, X, y=None):
        return self

    #Method that describes what we need this transformer to do
    def transform(self, X, y=None):
        """Return the result of ``X.fillna(0)``."""
        zero_filled = X.fillna(0)
        return zero_filled

In [7]:
#Custom Transformer that imputes Lot Frontage
class LotFrontageImpute(BaseEstimator, TransformerMixin):
    """Fill missing 'Lot Frontage' values from the row's 'Neighborhood'.

    A missing value is replaced by the median 'Lot Frontage' of the rows
    sharing its 'Neighborhood'. When that median is itself missing, the
    median 'Lot Frontage' of the whole frame passed to ``transform`` is
    used instead.
    """

    #Class Constructor
    def __init__(self):
        pass

    #Return self, nothing else to do here
    def fit(self, X, y=None):
        return self

    #Method that describes what we need this transformer to do
    def transform(self, X, y=None):
        """Return a copy of ``X`` with 'Lot Frontage' imputed."""
        # Work on a copy so the caller's DataFrame is left untouched.
        X = X.copy()
        frontage = X['Lot Frontage']

        # Median of each row's neighbourhood, aligned to the rows of X.
        # It is NaN where a neighbourhood has no known frontage at all.
        neighborhood_median = frontage.groupby(X['Neighborhood']).transform('median')

        # The fallback uses the frame being transformed rather than a
        # notebook-level global, so the step works on a fresh kernel.
        overall_median = frontage.median()

        X['Lot Frontage'] = frontage.fillna(neighborhood_median).fillna(overall_median)
        return X

In [8]:
#Custom Transformer that transforms ordinal features to numeric
class OrdinalToNumeric(BaseEstimator, TransformerMixin):
    """Apply ``ordinal_to_rank`` to the input.

    ``ordinal_to_rank`` is defined outside this section.
    NOTE(review): presumably it maps ordinal categories to numeric ranks;
    confirm against its definition.
    """

    #Class Constructor (no parameters)
    def __init__(self):
        pass

    #Nothing to learn, so fitting just hands back the transformer
    def fit(self, X, y=None):
        return self

    #Method that describes what we need this transformer to do
    def transform(self, X, y=None):
        """Return whatever ``ordinal_to_rank(X)`` returns."""
        return ordinal_to_rank(X)

In [9]:
#Custom Transformer that aligns predict-time features with those trained on
class AlignTrainPredict(BaseEstimator, TransformerMixin):
    """Force a DataFrame to have exactly the columns in ``feature_names``.

    Parameters
    ----------
    feature_names : list
        Column labels the output must have, in the order they must appear.
    """

    #Class Constructor
    def __init__(self, feature_names):
        self.feature_names = feature_names

    #Return self, nothing else to do here
    def fit(self, X, y=None):
        return self

    #Method that describes what we need this transformer to do
    def transform(self, X, y=None):
        """Return a new frame with the columns of ``feature_names``.

        Columns of ``X`` not in ``feature_names`` are dropped, columns in
        ``feature_names`` missing from ``X`` are added and filled with 0,
        and the result follows the order of ``feature_names``.
        """
        # A single reindex does the drop, the insert and the reorder at once,
        # and never assigns into a slice of the caller's frame.
        # Assumes the column labels of X are unique (reindex rejects duplicates).
        return X.reindex(columns=self.feature_names, fill_value=0)

In [10]:
#Custom Transformer that one-hot encodes the non-numeric features
class OneHotEncode(BaseEstimator, TransformerMixin):
    """One-hot encode non-numeric columns and keep numeric ones unchanged."""

    #Class Constructor (no parameters)
    def __init__(self):
        pass

    #Nothing to learn, so fitting just hands back the transformer
    def fit(self, X, y=None):
        return self

    #Method that describes what we need this transformer to do
    def transform(self, X, y=None):
        """Return the numeric columns followed by the dummy columns.

        Dummies are built with ``drop_first=True``, so the first level of
        each encoded column gets no indicator column.
        """
        numeric_part = X.select_dtypes(include='number')
        encoded_part = pd.get_dummies(
            X.select_dtypes(exclude='number'),
            drop_first=True,
        )
        return pd.concat([numeric_part, encoded_part], axis=1)

In [11]:
#Custom Transformer that changes nothing but exposes the data passing through the pipeline
class Passthrough(BaseEstimator, TransformerMixin):
    """Return the input unchanged while keeping a reference to it.

    The most recent input to ``transform`` is stored on ``self.X`` so it
    can be inspected after the pipeline has run.
    """

    #Class Constructor (no parameters)
    def __init__(self):
        pass

    #Nothing to learn, so fitting just hands back the transformer
    def fit(self, X, y=None):
        return self

    #Method that describes what we need this transformer to do
    def transform(self, X, y=None):
        """Remember ``X`` on the instance and return it as is."""
        self.X = X
        return self.X

    def get_feature_names(self):
        """Return the columns of the frame last seen by ``transform``.

        ``self.X`` is only set inside ``transform``, so this needs
        ``transform`` to have been called first.
        """
        remembered = self.X
        return remembered.columns