## K-Means and Multiple Estimators
### Examine the use of K-Means clustering followed by fitting each cluster with its own estimator
#### Dylan H. Ross

In [137]:
# setup...
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from itertools import product

from matplotlib import pyplot as plt
from numpy import argwhere, array, asarray, empty
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

from C3SData.data import C3SD

# initial pRNG seed; the stated plan is to increment it for each individual trial
# NOTE(review): none of the cells visible in this notebook read pRNGs -- they pass the literal 1234
pRNGs = 1234

### KMCMulti estimator
An `sklearn`-compatible regressor that uses K-Means clustering to partition the dataset without using the target values, then trains a separate estimator on the samples of each cluster.

In [139]:
class KMCMulti(BaseEstimator, RegressorMixin):
    """
KMCMulti
    description:
        An sklearn-compatible regressor that first partitions the samples with K-Means clustering, then
        fits an independent clone of a base estimator on the samples of each cluster. At prediction time
        every sample is assigned to a cluster by the fitted K-Means model and predicted by the estimator
        that was trained on that cluster.
    fitted attributes:
        kmeans_ (sklearn.cluster.KMeans) -- the fitted clustering model
        cluster_sizes_ (list(int)) -- number of training samples assigned to each cluster
        estimators_ (list(sklearn Regressor)) -- one fitted estimator per cluster, indexed by cluster label
"""

    def __init__(self, seed=69, n_clusters=3, use_estimator=None, estimator_params=None):
        """
KMCMulti.__init__
    description:
        Stores the hyperparameters without validating or modifying them, so that get_params, set_params
        and clone (and therefore GridSearchCV) work as expected. All validation happens in fit.
    parameters:
        [seed (int)] -- pRNG seed [optional, default=69]
        [n_clusters (int)] -- the number of clusters to fit [optional, default=3]
        [use_estimator (sklearn Regressor)] -- instance of individual estimator to use on each cluster 
                                               [optional, default=None]
        [estimator_params (list(dict(...)))] -- parameters to initialize each estimator with, one dict per
                                                cluster; None means every cluster uses use_estimator's own
                                                parameters unchanged [optional, default=None]
"""
        self.seed = seed
        self.n_clusters = n_clusters
        self.use_estimator = use_estimator
        self.estimator_params = estimator_params

    def fit(self, X, y):
        """
KMCMulti.fit
    description:
        Fits the K-Means model on X, splits X and y by cluster label, then fits one clone of use_estimator
        (with that cluster's parameter overrides applied) on each cluster's samples.
    parameters:
        X (array-like) -- features 
        y (array-like) -- targets
    returns: 
        (KMCMulti) -- the fitted estimator (self)
    raises:
        TypeError -- use_estimator is None
        ValueError -- the number of entries in estimator_params differs from n_clusters
"""
        if self.use_estimator is None:
            raise TypeError('KMCMulti.fit: use_estimator must be an sklearn regressor instance, not None')

        # None (the constructor default) means "no per-cluster overrides"
        if self.estimator_params is None:
            per_cluster_params = [{} for _ in range(self.n_clusters)]
        else:
            per_cluster_params = list(self.estimator_params)
        # zip() would silently truncate a mismatch and leave some clusters on default parameters
        if len(per_cluster_params) != self.n_clusters:
            msg = 'KMCMulti.fit: estimator_params has {} entries but n_clusters is {}'
            raise ValueError(msg.format(len(per_cluster_params), self.n_clusters))

        # boolean-mask indexing below needs real arrays (lists and DataFrames index differently)
        X, y = asarray(X), asarray(y)

        # first fit the KMeans clustering model
        self.kmeans_ = KMeans(n_clusters=self.n_clusters, random_state=self.seed)
        self.kmeans_.fit(X)
        labels = self.kmeans_.labels_

        # fit one estimator per cluster, recording how many samples each cluster received
        self.cluster_sizes_ = []
        self.estimators_ = []
        for label, params in enumerate(per_cluster_params):
            in_cluster = labels == label
            self.cluster_sizes_.append(int(in_cluster.sum()))
            est = clone(self.use_estimator)
            est.set_params(**params)
            est.fit(X[in_cluster], y[in_cluster])
            self.estimators_.append(est)

        # return the fitted regressor
        return self

    def predict(self, X):
        """
KMCMulti.predict
    description:
        Assigns every sample to a cluster with the fitted K-Means model, then predicts each cluster's
        samples in a single batch with that cluster's estimator (one call per cluster rather than one
        call per sample). Predictions are returned in the same order as the rows of X.
    parameters:
        X (array-like) -- features
    returns:
        (numpy.ndarray) -- predictions 
"""
        X = asarray(X)
        if X.shape[0] == 0:
            # nothing to assign to clusters; match the empty result of predicting zero samples
            return array([])

        labels = self.kmeans_.predict(X)
        y_pred = None
        for label, est in enumerate(self.estimators_):
            in_cluster = labels == label
            if not in_cluster.any():
                continue
            cluster_pred = est.predict(X[in_cluster])
            if y_pred is None:
                # allocate lazily so the output takes its dtype and any trailing dimensions from the estimator
                y_pred = empty((X.shape[0],) + cluster_pred.shape[1:], dtype=cluster_pred.dtype)
            y_pred[in_cluster] = cluster_pred
        return y_pred
    

In [129]:
# three clusters, each fit with a clone of LinearRegression; the empty dicts apply no per-cluster overrides
kmcm = KMCMulti(seed=1234, n_clusters=3, 
                use_estimator=LinearRegression(n_jobs=-1), 
                estimator_params=[{}, {}, {}])

In [130]:
# show the estimator's repr to confirm the constructor parameters were stored as given
kmcm

KMCMulti(estimator_params=[{}, {}, {}], n_clusters=3, seed=1234,
     use_estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False))

In [16]:
# build the dataset object from the C3S.db file with a fixed seed
# NOTE(review): C3SD is project-local; the descriptions below are inferred from the method and
# attribute names used in this notebook -- confirm against C3SData.data
d = C3SD('C3S.db', seed=1234)
# presumably generates the feature matrix
d.featurize()
# presumably splits into training/test sets; the role of the 'ccs' argument is not visible here
d.train_test_split('ccs')
# presumably standardizes the features (later cells read d.X_train_ss_)
d.center_and_scale()

In [23]:
# sample counts -- by their names: total, training set, test set (the recorded output sums consistently)
print(d.N_)
print(d.N_train_)
print(d.N_test_)

6042
4833
1209


In [131]:
# fit the clustering model and the per-cluster estimators on the training data
kmcm.fit(d.X_train_ss_, d.y_train_)

KMCMulti(estimator_params=[{}, {}, {}], n_clusters=3, seed=1234,
     use_estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False))

In [132]:
# smoke test: predict back on the same features used for fitting (not a held-out evaluation)
kmcm.predict(d.X_train_ss_)

array([167.24608459, 171.32420959, 119.27733459, ..., 213.84179688,
       316.67503738, 173.68358459])

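A helper function is needed to make it easier to set up the parameter grid for grid-search hyperparameter optimization. Each entry of the grid pairs one value of `n_clusters` with every possible assignment of estimator parameters to that many clusters, so the final parameter grid should look something like this: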
```
p_grid = [
    {
        'n_clusters': [2],
        'estimator_params': [
                        [{...}, {...}], 
                        [{...}, {...}], 
                        ... 
                      ]
    },
    {
        'n_clusters': [3],
        'estimator_params': [
                        [{...}, {...}, {...}], 
                        [{...}, {...}, {...}], 
                        ... 
                      ]
    },
    ...
]
```

In [157]:
def gen_kmcm_p_grid(n_clusters, est_params):
    """
gen_kmcm_p_grid
    description:
        generates a parameter grid that can be used with GridSearchCV for hyperparameter tuning of a
        KMCMulti estimator. For each cluster count the grid contains every way of assigning one of the
        individual estimator parameter combinations to each cluster, so the number of candidates for a
        cluster count nc is (number of parameter combinations) ** nc
    parameters:
        n_clusters (list(int)) -- list of values to try for n_clusters
        est_params (dict(str:list(...))) -- values to try for individual estimator parameters, in
                                            the style of the parameter grid used for GridSearchCV.
                                            An empty dict produces the single combination {} (no
                                            parameter overrides)
    returns:
        (list(dict(...))) -- parameter grid for use with GridSearchCV, one sub-grid per cluster count
                             with the keys 'n_clusters' and 'estimator_params'
"""
    # all combinations of the individual estimator parameters, one dict per combination
    keys = list(est_params)
    perms = [dict(zip(keys, vals)) for vals in product(*(est_params[k] for k in keys))]

    # one sub-grid per cluster count, so that n_clusters always matches the length of every candidate
    # estimator_params list; the dicts are copied so that no two candidates share the same objects
    return [
        {
            'n_clusters': [nc],
            'estimator_params': [[dict(p) for p in combo] for combo in product(perms, repeat=nc)]
        }
        for nc in n_clusters
    ]
    

In [158]:
# Lasso alpha values to try per cluster, for 2, 3 and 4 clusters
# (3**2 + 3**3 + 3**4 = 117 candidate settings in total)
param_grid = gen_kmcm_p_grid([2, 3, 4], {'alpha': [0.1, 0.3, 0.5]})
# NOTE(review): displaying the full grid produces very long output
param_grid

[{'alpha': 0.1}, {'alpha': 0.3}, {'alpha': 0.5}]


[{'n_clusters': [2],
  'estimator_params': [[{'alpha': 0.1}, {'alpha': 0.1}],
   [{'alpha': 0.1}, {'alpha': 0.3}],
   [{'alpha': 0.1}, {'alpha': 0.5}],
   [{'alpha': 0.3}, {'alpha': 0.1}],
   [{'alpha': 0.3}, {'alpha': 0.3}],
   [{'alpha': 0.3}, {'alpha': 0.5}],
   [{'alpha': 0.5}, {'alpha': 0.1}],
   [{'alpha': 0.5}, {'alpha': 0.3}],
   [{'alpha': 0.5}, {'alpha': 0.5}]]},
 {'n_clusters': [3],
  'estimator_params': [[{'alpha': 0.1}, {'alpha': 0.1}, {'alpha': 0.1}],
   [{'alpha': 0.1}, {'alpha': 0.1}, {'alpha': 0.3}],
   [{'alpha': 0.1}, {'alpha': 0.1}, {'alpha': 0.5}],
   [{'alpha': 0.1}, {'alpha': 0.3}, {'alpha': 0.1}],
   [{'alpha': 0.1}, {'alpha': 0.3}, {'alpha': 0.3}],
   [{'alpha': 0.1}, {'alpha': 0.3}, {'alpha': 0.5}],
   [{'alpha': 0.1}, {'alpha': 0.5}, {'alpha': 0.1}],
   [{'alpha': 0.1}, {'alpha': 0.5}, {'alpha': 0.3}],
   [{'alpha': 0.1}, {'alpha': 0.5}, {'alpha': 0.5}],
   [{'alpha': 0.3}, {'alpha': 0.1}, {'alpha': 0.1}],
   [{'alpha': 0.3}, {'alpha': 0.1}, {'alpha': 0.3}],


In [159]:
# 5-fold cross-validated grid search over the KMCMulti parameter grid, scored by negative MSE
# NOTE(review): the iid argument was removed from GridSearchCV in later scikit-learn releases;
# it must be dropped if this is run on a newer version -- confirm against the installed version
# estimator_params is left at None on the base estimator because every grid candidate sets it
gs = GridSearchCV(cv=5, 
                  iid=False, 
                  n_jobs=-1, 
                  scoring='neg_mean_squared_error', 
                  param_grid=param_grid,
                  estimator=KMCMulti(seed=1234, use_estimator=Lasso(random_state=1234))
                 )

gs.fit(d.X_train_ss_, d.y_train_)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=KMCMulti(estimator_params=None, n_clusters=3, seed=1234,
     use_estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=1234,
   selection='cyclic', tol=0.0001, warm_start=False)),
       iid=False, n_jobs=-1,
       param_grid=[{'n_clusters': [2], 'estimator_params': [[{'alpha': 0.1}, {'alpha': 0.1}], [{'alpha': 0.1}, {'alpha': 0.3}], [{'alpha': 0.1}, {'alpha': 0.5}], [{'alpha': 0.3}, {'alpha': 0.1}], [{'alpha': 0.3}, {'alpha': 0.3}], [{'alpha': 0.3}, {'alpha': 0.5}], [{'alpha': 0.5}, {'alpha': 0.1}], [{'alpha'...{'alpha': 0.5}, {'alpha': 0.3}], [{'alpha': 0.5}, {'alpha': 0.5}, {'alpha': 0.5}, {'alpha': 0.5}]]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='neg_mean_squared_error', verbose=0)

In [160]:
# parameter setting with the best mean cross-validated score
gs.best_params_

{'estimator_params': [{'alpha': 0.1}, {'alpha': 0.1}, {'alpha': 0.1}],
 'n_clusters': 3}

Nice, seems to be working well... Now I will just move it to its own module so that I can use it in the prediction performance work.