## Generate Data

In [1]:
import pandas as pd

# Toy data set: one feature column "X" and one target column "y" (five rows).
df = pd.DataFrame(
    {
        "X": [1, 2, 3, 4, 5],
        "y": [10, 20, 30, 40, 50],
    }
)

## Build a Pipeline
Reference: <https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines>

### Create Transforms

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

class multiplier(BaseEstimator,TransformerMixin):
    """Stateless transformer that multiplies its input by a fixed ``weight``."""

    def __init__(self, weight):
        # Kept under the same name as the constructor argument.
        self.weight = weight

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the transformer can be chained."""
        return self

    def transform(self, X):
        """Return ``X * weight``."""
        scaled = X * self.weight
        return scaled
    
class divider(BaseEstimator,TransformerMixin):
    """Stateless transformer that divides its input by a fixed ``divisor``."""

    def __init__(self, divisor):
        # Kept under the same name as the constructor argument.
        self.divisor = divisor

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the transformer can be chained."""
        return self

    def transform(self, X):
        """Return ``X / divisor``."""
        quotient = X / self.divisor
        return quotient


class concat(BaseEstimator,TransformerMixin):
    """Transformer that prefixes the string form of each value with ``sentence``."""

    def __init__(self, sentence):
        self.sentence = sentence

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the transformer can be chained."""
        return self

    def transform(self, X):
        """Apply the prefixing to ``X`` through ``X.apply``.

        Whatever ``X.apply`` hands to the callback is iterated, and every item
        is converted with ``str`` and appended to ``self.sentence``.
        NOTE(review): this presumably expects a DataFrame, so that the callback
        receives whole columns -- confirm against the caller.
        """
        def _prefix_all(values):
            return [self.sentence + str(item) for item in values]

        return X.apply(_prefix_all)
    
    
class proper(BaseEstimator,TransformerMixin):
    """Transformer that title-cases the string form of each value.

    Takes no constructor parameters.
    """

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the transformer can be chained."""
        return self

    def transform(self, X):
        """Apply ``str(item).title()`` to the items reached through ``X.apply``.

        NOTE(review): this presumably expects a DataFrame, so that the callback
        receives whole columns -- confirm against the caller.
        """
        def _title_all(values):
            return [str(item).title() for item in values]

        return X.apply(_title_all)
    
    

### Custom Algo

http://danielhnyk.cz/creating-your-own-estimator-scikit-learn/

https://stackoverflow.com/questions/49017257/custom-scoring-on-gridsearchcv-with-fold-dependent-parameter

In [11]:
from sklearn.base import ClassifierMixin

class CustAlgo(BaseEstimator, ClassifierMixin):
    """Toy threshold classifier.

    ``fit`` computes a threshold as ``mean(X) + intValue``; ``predict`` then
    labels each value ``True`` when it is greater than or equal to that
    threshold and ``False`` otherwise.

    Parameters
    ----------
    intValue : int, default=0
        Offset added to the mean of the training values.
    stringParam : str, default="defaultValue"
        Only type-checked in ``fit``; not used in any computation.
    otherParam : object, default=None
        Stored as given; not used in any computation.

    Attributes
    ----------
    treshold_ : number
        Fitted threshold, set by ``fit`` (the attribute name keeps its
        original spelling so existing readers of it keep working).
    """

    def __init__(self, intValue=0, stringParam="defaultValue", otherParam=None):
        # Every constructor argument is stored under its own name:
        # BaseEstimator.get_params() reads parameters back via
        # getattr(self, <argument name>), so a differently named attribute
        # breaks get_params()/clone() and therefore grid search.
        self.intValue = intValue
        self.stringParam = stringParam
        self.otherParam = otherParam

    @property
    def differentParam(self):
        """Backward-compatible alias for ``otherParam``."""
        return self.otherParam

    @differentParam.setter
    def differentParam(self, value):
        self.otherParam = value

    def fit(self, X, y=None):
        """Compute the decision threshold from the training values.

        Parameters
        ----------
        X : sized iterable of numbers
            Training values; must be non-empty.
        y : ignored

        Returns
        -------
        self

        Raises
        ------
        AssertionError
            If ``intValue`` is not an int, ``stringParam`` is not a str, or
            ``X`` is empty.

        Note: asserts are kept so callers see the same exception type as
        before; they are stripped under ``python -O``.
        """
        assert isinstance(self.intValue, int), "intValue parameter must be integer"
        assert isinstance(self.stringParam, str), "stringParam parameter must be string"
        # Any non-empty input is accepted; an empty one would divide by zero below.
        assert len(X) > 0, "X must be a non-empty sequence of numerical values."

        self.treshold_ = (sum(X) / len(X)) + self.intValue  # mean + intValue

        return self

    def _meaning(self, x):
        """Return ``True`` when ``x`` reaches the fitted threshold, else ``False``."""
        return bool(x >= self.treshold_)

    def predict(self, X, y=None):
        """Return a list with one boolean per element of ``X``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if not hasattr(self, "treshold_"):
            raise RuntimeError("You must train classifer before predicting data!")

        return [self._meaning(x) for x in X]

    def score(self, X, y=None):
        """Return how many values in ``X`` are at or above the threshold."""
        return sum(self.predict(X))

### Assemble Pipeline

In [32]:
from sklearn.pipeline import Pipeline

# Two feature branches: a numeric one (multiply, then divide) and a string
# one (prefix a sentence, then title-case).
numeric_pipe = Pipeline(steps=[
    ('multiplier', multiplier(weight=10)),
    ('divider', divider(divisor=100)),
])
string_pipe = Pipeline(steps=[
    ("concat", concat(sentence='i transformed the following = ')),
    ("proper", proper()),
])

# Combine the two feature pipelines with a FeatureUnion.
from sklearn.pipeline import FeatureUnion
feats = FeatureUnion(transformer_list=[
    ('Numeric', numeric_pipe),
    ('String', string_pipe),
])

# Main pipeline: the combined features feed the custom algorithm.
main_pipe = Pipeline(steps=[
    ('features', feats),
    ('AlgoName', CustAlgo()),
])

### Create Custom Scorer

https://github.com/EpistasisLab/tpot/issues/301#issuecomment-258236010

In [53]:
from sklearn.metrics import make_scorer
from numpy import mean
def string_scorer(y_true,y_pred):
    """Mean signed difference in string length between truth and prediction.

    Every value is converted with ``str`` and measured with ``len``; the
    score is the mean of ``len(str(true)) - len(str(pred))`` over the pairs
    produced by ``zip(y_true, y_pred)``.

    Parameters
    ----------
    y_true : iterable
        True labels.
    y_pred : iterable
        Labels predicted by the estimator.

    Returns
    -------
    float
        Mean length difference. Positive and negative gaps cancel out, so
        0.0 does not imply that the values are equal.
    """
    length_gaps = []
    for actual, predicted in zip(y_true, y_pred):
        length_gaps.append(len(str(actual)) - len(str(predicted)))
    return mean(length_gaps)

# Sanity check on the plain function: every pair has equal string length,
# so the expected result is 0.0.
print(
    "Test:",
    string_scorer(y_true=[43422, 3243, 432, 1], y_pred=[12342, 2344, 343, 1]),
)

# Wrap the function as a scikit-learn scorer; greater_is_better=False
# declares it a loss. NOTE: this rebinds the name ``string_scorer`` from the
# plain function to the scorer object.
string_scorer = make_scorer(string_scorer, greater_is_better=False)

Test: 0.0


## Set up Grid Search

In [36]:
# List every tunable parameter path of the pipeline ("<step>__<param>").
list(main_pipe.get_params())

['memory',
 'steps',
 'features',
 'AlgoName',
 'features__n_jobs',
 'features__transformer_list',
 'features__transformer_weights',
 'features__Numeric',
 'features__String',
 'features__Numeric__memory',
 'features__Numeric__steps',
 'features__Numeric__multiplier',
 'features__Numeric__divider',
 'features__Numeric__multiplier__weight',
 'features__Numeric__divider__divisor',
 'features__String__memory',
 'features__String__steps',
 'features__String__concat',
 'features__String__proper',
 'features__String__concat__sentence',
 'AlgoName__intValue',
 'AlgoName__otherParam',
 'AlgoName__stringParam']

In [40]:
from sklearn.model_selection import GridSearchCV

# Search space for the grid search; keys are "<step>__<param>" paths into
# main_pipe.
hyperparameters = {
    # FeatureUnion.transformer_weights must be a dict keyed by transformer
    # name (or None), not a bare number. Each candidate weights only the
    # 'Numeric' branch: the 'String' branch emits text, which cannot be
    # meaningfully scaled.
    'features__transformer_weights': [
        {'Numeric': 3214},
        {'Numeric': 234},
        {'Numeric': 324},
    ],
    'features__Numeric__divider__divisor': [10, 20, 30],
    'features__String__concat__sentence': [
        "i transformed the following: ",
        "i did: ",
        "this is it: ",
    ],
}

## Execute Grid Search

In [37]:

# Exhaustive search over `hyperparameters` with 2-fold cross-validation,
# scored by the custom scorer.
clf = GridSearchCV(
    estimator=main_pipe,
    param_grid=hyperparameters,
    cv=2,
    scoring=string_scorer,
)
# Fitting is left disabled. NOTE: the frame's column is "X" (upper-case).
# clf.fit(df['X'])