In [1]:
!pip install catboost
!pip install tqdm
!pip install deap
!pip install sklearn

Collecting deap
[?25l  Downloading https://files.pythonhosted.org/packages/af/29/e7f2ecbe02997b16a768baed076f5fc4781d7057cd5d9adf7c94027845ba/deap-1.2.2.tar.gz (936kB)
[K    100% |████████████████████████████████| 942kB 7.2MB/s 
[?25hBuilding wheels for collected packages: deap
  Running setup.py bdist_wheel for deap ... [?25l- \ | / - \ | / - \ | done
[?25h  Stored in directory: /root/.cache/pip/wheels/22/ea/bf/dc7c8a2262025a0ab5da9ef02282c198be88902791ca0c6658
Successfully built deap
Installing collected packages: deap
Successfully installed deap-1.2.2


In [15]:
# sklearn-genetic - Genetic feature selection module for scikit-learn
# Copyright (C) 2016  Manuel Calzolari
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""Genetic algorithm for feature selection"""

import multiprocessing
import random
import numpy as np
from sklearn.utils import check_X_y
from sklearn.utils.metaestimators import if_delegate_has_method
from sklearn.base import BaseEstimator
from sklearn.base import MetaEstimatorMixin
from sklearn.base import clone
from sklearn.base import is_classifier
from sklearn.model_selection import check_cv
from sklearn.model_selection._validation import _fit_and_score
from sklearn.metrics.scorer import check_scoring
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection.base import SelectorMixin
from sklearn.externals.joblib import cpu_count
from deap import algorithms
from deap import base
from deap import creator
from deap import tools
from catboost import CatBoostRegressor

# The essentials
import pandas as pd
import numpy as np

# Plotting
%matplotlib inline
import matplotlib.pyplot as plt

# Progress bars
from tqdm import tqdm_notebook

# Access our Google Drive
from google.colab import drive


# Two-objective fitness registered in DEAP's global `creator` namespace.
# DEAP treats a positive weight as "maximize" and a negative weight as
# "minimize", so the first fitness value must be higher-is-better and the
# second lower-is-better. The tuple lines up with what `_evalFunction`
# returns: (score, number of selected features).
creator.create("Fitness", base.Fitness, weights=(1.0, -1.0))
# An individual is a plain list of genes (one 0/1 flag per feature, filled in
# by the toolbox inside GeneticSelectionCV._fit) carrying a Fitness instance.
creator.create("Individual", list, fitness=creator.Fitness)

def custom_mape(approxes, targets, min_denominator=290000):
    """Mean absolute percentage error with a floor on the denominator.

    Each absolute error is divided by ``max(|target|, min_denominator)``, so
    targets whose magnitude is below the floor cannot inflate the metric.

    Parameters
    ----------
    approxes : array-like
        Predicted values.
    targets : array-like
        True values, broadcastable against ``approxes``.
    min_denominator : float, default=290000
        Lower bound applied to ``|targets|`` before dividing. The default
        keeps the value that was previously hard-coded.

    Returns
    -------
    float
        Mean of the floored percentage errors.
    """
    approxes = np.asarray(approxes, dtype=float)
    targets = np.asarray(targets, dtype=float)
    denominators = np.maximum(np.abs(targets), min_denominator)
    return np.mean(np.abs(approxes - targets) / denominators)

def _evalFunction(individual, X, y, cv):

    individual_sum = np.sum(individual, axis=0)
    if individual_sum == 0:
        return -10000, individual_sum

    X_selected = X.loc[:, np.array(individual, dtype=np.bool)]
    
    scores = []
    for train, test in cv.split(X, y):
        estimator = CatBoostRegressor(iterations=10000, od_type='Iter', 
                                      od_wait=25, objective='MAPE', 
                                      eval_metric='MAPE', task_type='GPU', 
                                      random_seed=2018)
        X_train = X_selected.iloc[train, :]
        X_test = X_selected.iloc[test, :]
        y_train = y.iloc[train]
        y_test = y.iloc[test]

        train_idx = np.random.choice(X_train.index, replace=False, size=int(0.9 * len(X_train)))
        val_idx = list(set(X_train.index) - set(train_idx))

        X_val = X_train.loc[val_idx, :]
        y_val = y_train.loc[val_idx]
        X_train = X_train.loc[train_idx, :]
        y_train = y_train.loc[train_idx]

        estimator.fit(X_train, y_train, eval_set=(X_val, y_val), logging_level='Silent')

        scores.append(custom_mape(np.exp(estimator.predict(X_test)), np.exp(y_test.values).flatten()))
        
        del estimator

    scores_mean = np.mean(scores)
    
    
    return scores_mean, individual_sum


class GeneticSelectionCV(BaseEstimator, MetaEstimatorMixin, SelectorMixin):
    """Feature selection with genetic algorithm.

    A population of binary feature masks is evolved with DEAP's ``eaSimple``.
    Every mask is evaluated by the module-level ``_evalFunction``, which
    returns a ``(score, number of selected features)`` pair; the best mask
    found is exposed as ``support_``.

    Parameters
    ----------
    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use scikit-learn's default number of folds,
        - integer, to specify the number of folds.
        - An object to be used as a cross-validation generator.
        - An iterable yielding train/test splits.

        For integer/None inputs, :class:`StratifiedKFold` is used if ``y`` is
        binary or multiclass; otherwise :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    scoring : string, callable or None, optional, default: None
        Stored on the instance but not read by ``_fit``; the evaluation
        metric is fixed inside ``_evalFunction``.

    fit_params : dict, optional
        Stored on the instance but not read by ``_fit``.

    verbose : int, default=0
        Controls verbosity of output. Any value > 0 prints a start message
        and the per-generation statistics of the genetic algorithm.

    n_jobs : int, default 1
        Number of worker processes used to evaluate individuals.
        Values > 1 use that many processes; negative values use
        ``cpu_count() + 1 + n_jobs`` processes (so ``-1`` means all cores).
        ``1`` (and ``0``) evaluate serially in the current process.

    n_population : int, default=300
        Number of population for the genetic algorithm.

    crossover_proba : float, default=0.5
        Probability of crossover for the genetic algorithm.

    mutation_proba : float, default=0.2
        Probability of mutation for the genetic algorithm.

    n_generations : int, default=40
        Number of generations for the genetic algorithm.

    crossover_independent_proba : float, default=0.1
        Independent probability of crossover for the genetic algorithm.

    mutation_independent_proba : float, default=0.05
        Independent probability of mutation for the genetic algorithm.

    tournament_size : int, default=3
        Tournament size for the genetic algorithm.

    caching : boolean, default=False
        Stored on the instance together with an empty ``scores_cache`` dict;
        neither is read by ``_fit``.

    Attributes
    ----------
    n_features_ : int
        The number of selected features with cross-validation.

    support_ : array of shape [n_features]
        The mask of selected features.

    generation_scores_ : array of shape [n_generations + 1]
        The maximum first-objective fitness recorded for the initial
        population and for each generation.

    Examples
    --------
    ``X`` must be a pandas DataFrame (and ``y`` a pandas object) because the
    evaluation function indexes them with ``.loc`` / ``.iloc``.

    >>> selector = GeneticSelectionCV(cv=2, n_population=10, n_generations=25)
    >>> selector = selector.fit(X, y)  # doctest: +SKIP
    >>> selected_columns = X.columns[selector.support_]  # doctest: +SKIP
    """
    def __init__(self, cv=None, scoring=None, fit_params=None, verbose=0, n_jobs=1,
                 n_population=300, crossover_proba=0.5, mutation_proba=0.2, n_generations=40,
                 crossover_independent_proba=0.1, mutation_independent_proba=0.05,
                 tournament_size=3, caching=False):
        self.cv = cv
        self.scoring = scoring
        self.fit_params = fit_params
        self.verbose = verbose
        self.n_jobs = n_jobs
        self.n_population = n_population
        self.crossover_proba = crossover_proba
        self.mutation_proba = mutation_proba
        self.n_generations = n_generations
        self.crossover_independent_proba = crossover_independent_proba
        self.mutation_independent_proba = mutation_independent_proba
        self.tournament_size = tournament_size
        self.caching = caching
        self.scores_cache = {}

    def fit(self, X, y):
        """Run the genetic search and store the best feature mask.

        Parameters
        ----------
        X : pandas.DataFrame, shape = [n_samples, n_features]
            The training input samples.

        y : pandas.Series or pandas.DataFrame, shape = [n_samples]
            The target values.

        Returns
        -------
        self : GeneticSelectionCV
            The fitted selector.
        """
        return self._fit(X, y)

    def _fit(self, X, y):
        """Evolve feature masks with DEAP and set the fitted attributes."""
        # Input validation (check_X_y) is intentionally skipped: it would
        # convert X to an array, while _evalFunction needs the DataFrame API.
        #X, y = check_X_y(X, y, "csr")
        # Initialization
        n_features = X.shape[1]
        # `classifier` is passed by keyword; newer scikit-learn releases
        # reject it as a positional argument.
        cv = check_cv(self.cv, y, classifier=True)

        # Genetic Algorithm
        toolbox = base.Toolbox()

        toolbox.register("attr_bool", random.randint, 0, 1)
        toolbox.register("individual", tools.initRepeat,
                         creator.Individual, toolbox.attr_bool, n=n_features)
        toolbox.register("population", tools.initRepeat, list, toolbox.individual)
        toolbox.register("evaluate", _evalFunction, X=X, y=y, cv=cv)
        toolbox.register("mate", tools.cxUniform, indpb=self.crossover_independent_proba)
        toolbox.register("mutate", tools.mutFlipBit, indpb=self.mutation_independent_proba)
        toolbox.register("select", tools.selTournament, tournsize=self.tournament_size)

        # Only create a pool when parallelism was requested. Tracking it in a
        # variable that always exists avoids a NameError for n_jobs == 0.
        pool = None
        if self.n_jobs > 1:
            pool = multiprocessing.Pool(processes=self.n_jobs)
        elif self.n_jobs < 0:
            pool = multiprocessing.Pool(processes=max(cpu_count() + 1 + self.n_jobs, 1))

        try:
            if pool is not None:
                toolbox.register("map", pool.map)

            pop = toolbox.population(n=self.n_population)
            hof = tools.HallOfFame(1, similar=np.array_equal)
            stats = tools.Statistics(lambda ind: ind.fitness.values)
            stats.register("avg", np.mean, axis=0)
            stats.register("std", np.std, axis=0)
            stats.register("min", np.min, axis=0)
            stats.register("max", np.max, axis=0)

            if self.verbose > 0:
                print("Selecting features with genetic algorithm.")

            _, log = algorithms.eaSimple(pop, toolbox, cxpb=self.crossover_proba,
                                         mutpb=self.mutation_proba, ngen=self.n_generations,
                                         stats=stats, halloffame=hof,
                                         verbose=self.verbose > 0)
        finally:
            # Release the worker processes even if the evolution fails.
            if pool is not None:
                pool.close()
                pool.join()

        # Set final attributes
        # `np.bool` was removed from NumPy; the builtin `bool` is equivalent.
        support_ = np.array(hof, dtype=bool)[0]

        # Each "max" entry is a (score, n_features) pair; keep the score only.
        self.generation_scores_ = np.array([score for score, _ in log.select("max")])
        self.n_features_ = support_.sum()
        self.support_ = support_

        return self

    # NOTE(review): the methods below are only exposed when the instance has
    # an `estimator` attribute providing the same method, and they rely on
    # `self.estimator_`. Neither attribute is set anywhere in this class.
    @if_delegate_has_method(delegate='estimator')
    def predict(self, X):
        """Reduce X to the selected features and then predict using the
           underlying estimator.

        Parameters
        ----------
        X : array of shape [n_samples, n_features]
            The input samples.

        Returns
        -------
        y : array of shape [n_samples]
            The predicted target values.
        """
        return self.estimator_.predict(self.transform(X))

    @if_delegate_has_method(delegate='estimator')
    def score(self, X, y):
        """Reduce X to the selected features and then return the score of the
           underlying estimator.

        Parameters
        ----------
        X : array of shape [n_samples, n_features]
            The input samples.

        y : array of shape [n_samples]
            The target values.
        """
        return self.estimator_.score(self.transform(X), y)

    def _get_support_mask(self):
        """Return the boolean mask of selected features (used by SelectorMixin)."""
        return self.support_

    @if_delegate_has_method(delegate='estimator')
    def decision_function(self, X):
        """Reduce X to the selected features and call ``decision_function``
           on the underlying estimator."""
        return self.estimator_.decision_function(self.transform(X))

    @if_delegate_has_method(delegate='estimator')
    def predict_proba(self, X):
        """Reduce X to the selected features and call ``predict_proba``
           on the underlying estimator."""
        return self.estimator_.predict_proba(self.transform(X))

    @if_delegate_has_method(delegate='estimator')
    def predict_log_proba(self, X):
        """Reduce X to the selected features and call ``predict_log_proba``
           on the underlying estimator."""
        return self.estimator_.predict_log_proba(self.transform(X))




In [13]:
# Mount Google Drive under /content/drive; force_remount=True re-mounts even
# if a previous mount is still present.
drive.mount('/content/drive', force_remount=True)
# List the Drive folder that holds the CSV files read later in the notebook.
!ls "/content/drive/My Drive/Rinse Over Run"

Mounted at /content/drive
test_features_14.csv
test_features_15.csv
test_features_1.csv
test_features_2.csv
test_features_3.csv
test_features_6.csv
test_features_7.csv
test_features_8.csv
test_features_per_phase_14.csv
test_features_per_phase_15.csv
test_features_per_phase_1.csv
test_features_per_phase_2.csv
test_features_per_phase_3.csv
test_features_per_phase_6.csv
test_features_per_phase_7.csv
test_features_per_phase_8.csv
test_values.csv
train_features_14.csv
train_features_15.csv
train_features_1.csv
train_features_2.csv
train_features_3.csv
train_features_6.csv
train_features_7.csv
train_features_8.csv
train_features_adv_14.csv
train_features_adv_15.csv
train_features_adv_1.csv
train_features_adv_2.csv
train_features_adv_3.csv
train_features_adv_6.csv
train_features_adv_7.csv
train_features_adv_8.csv
train_features_per_phase_14.csv
train_features_per_phase_15.csv
train_features_per_phase_1.csv
train_features_per_phase_2.csv
train_features_per_phase_3.csv
train_features_per_phase_

In [0]:
from catboost import CatBoostRegressor
import os

# Folder on the mounted Drive that holds every CSV used below.
DATA_DIR = '/content/drive/My Drive/Rinse Over Run'

# Phases available for each process combination.
phases_per_comb = {
    15: ['pre_rinse', 'caustic', 'intermediate_rinse', 'acid'],
    3:  ['pre_rinse', 'caustic'],
    7:  ['pre_rinse', 'caustic', 'intermediate_rinse'],
    1:  ['pre_rinse'],
    8:  ['acid'],
    2:  ['caustic'],
    6:  ['caustic', 'intermediate_rinse'],
    14: ['caustic', 'intermediate_rinse', 'acid'],
}

# Per-combination feature counts (not consumed by this cell).
n_features = {
    15: 125,
    7:  165,
    14: 165,
    6:  250,
    3:  250,
    1:  500,
    8:  750,
    2:  500,
}

# The labels do not depend on the process combination: read them once.
all_labels = pd.read_csv(os.path.join(DATA_DIR, 'train_labels.csv'), index_col=['process_id'])

mapes = {}
prediction_df = None
process_combs = [15, 3, 7, 1, 8, 2, 6, 14]
augment_flags = [True, False, True, False, True, False, False, False]
for process_comb, augment in zip(process_combs, augment_flags):
    # --- Load the per-phase features and the train/validation split ---------
    test_features = pd.read_csv(os.path.join(DATA_DIR, 'test_features_per_phase_{}.csv'.format(process_comb)), index_col=['process_id'])
    train_features = pd.read_csv(os.path.join(DATA_DIR, 'train_features_per_phase_{}.csv'.format(process_comb)), index_col=['process_id'])
    # Only the indices of the "adv" files are used: they define which
    # processes go to the training and to the validation part.
    train_features_idx = pd.read_csv(os.path.join(DATA_DIR, 'train_features_adv_{}.csv'.format(process_comb)), index_col=['process_id']).index
    val_features_idx = pd.read_csv(os.path.join(DATA_DIR, 'val_features_adv_{}.csv'.format(process_comb)), index_col=['process_id']).index

    val_features = train_features.loc[val_features_idx]
    train_features = train_features.loc[train_features_idx]

    # --- Optionally add the tsfresh features (joined on the index) ----------
    tsfresh_path = os.path.join(DATA_DIR, 'tsfresh_features_{}.csv'.format(process_comb))
    if augment and os.path.exists(tsfresh_path):
        ts_fresh_features = pd.read_csv(tsfresh_path, index_col=['id'])
        train_features = train_features.merge(ts_fresh_features, left_index=True, right_index=True)
        val_features = val_features.merge(ts_fresh_features, left_index=True, right_index=True)
        test_features = test_features.merge(ts_fresh_features, left_index=True, right_index=True)

    phases = phases_per_comb[process_comb]

    # --- Align the columns of the three frames ------------------------------
    # Drop training columns that the test set does not have ...
    missing_in_test = set(train_features.columns) - set(test_features.columns)
    print(missing_in_test)
    train_features = train_features.drop(list(missing_in_test), axis=1)

    # ... and any column whose name contains 'process_id'.
    id_columns = [col for col in train_features.columns if 'process_id' in col]
    train_features = train_features.drop(id_columns, axis=1)

    test_features = test_features[train_features.columns]
    val_features = val_features[train_features.columns]

    print(list(train_features.columns))

    X_train = train_features
    X_val = val_features
    X_test = test_features

    print(X_train.shape, X_val.shape, X_test.shape)

    # Models are trained on log targets; predictions are mapped back with exp.
    y_train = np.log(all_labels.loc[X_train.index])
    y_val = np.log(all_labels.loc[X_val.index])

    # --- Genetic feature selection ------------------------------------------
    # `scoring` is not passed: the selector's evaluation metric is fixed in
    # _evalFunction, and "roc_auc" is meaningless for a regression target.
    selector = GeneticSelectionCV(cv=2,
                                  verbose=1,
                                  n_population=10,
                                  crossover_proba=0.66,
                                  mutation_proba=0.33,
                                  n_generations=25,
                                  crossover_independent_proba=0.5,
                                  mutation_independent_proba=0.05,
                                  tournament_size=5,
                                  caching=True,
                                  n_jobs=1)

    selector.fit(X_train, y_train)
    selected_features = X_train.columns[selector.support_]
    print(process_comb, selected_features)
    # Every frame handed to the model must contain exactly the selected
    # columns, including the validation set used for early stopping.
    X_train = X_train[selected_features]
    X_val = X_val[selected_features]
    X_test = X_test[selected_features]

    # --- Final model on the selected features -------------------------------
    # The built-in 'MAPE' metric is used for early stopping; no custom
    # `MAPEMetric` class is defined in this notebook.
    cat = CatBoostRegressor(iterations=100000, od_type='Iter', od_wait=100, learning_rate=0.33,
                            loss_function='MAPE', eval_metric='MAPE')
    cat.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=50)

    val_predictions = np.exp(cat.predict(X_val))
    predictions = np.exp(cat.predict(X_test))

    model_mape = custom_mape(val_predictions, np.exp(y_val.values).flatten())
    print(process_comb, model_mape)
    mapes[process_comb] = model_mape

    # --- Collect the test predictions ---------------------------------------
    sub_predictions_df = pd.DataFrame(predictions, columns=['final_rinse_total_turbidity_liter'])
    sub_predictions_df.index = X_test.index
    sub_predictions_df.index.name = X_test.index.name

    # Accumulated inside the loop so partial results survive an interrupted run.
    if prediction_df is None:
        prediction_df = sub_predictions_df
    else:
        prediction_df = pd.concat([prediction_df, sub_predictions_df])

{'pipeline_L12'}
['pipeline_L1', 'pipeline_L10', 'pipeline_L11', 'pipeline_L2', 'pipeline_L3', 'pipeline_L4', 'pipeline_L6', 'pipeline_L7', 'pipeline_L8', 'pipeline_L9', 'num_phases', "('supply_flow', 'min')", "('supply_flow', 'max')", "('supply_flow', 'mean')", "('supply_flow', 'std')", "('supply_flow', 'count')", "('supply_pressure', 'min')", "('supply_pressure', 'max')", "('supply_pressure', 'mean')", "('supply_pressure', 'std')", "('supply_pressure', 'count')", "('return_temperature', 'min')", "('return_temperature', 'max')", "('return_temperature', 'mean')", "('return_temperature', 'std')", "('return_temperature', 'count')", "('return_conductivity', 'min')", "('return_conductivity', 'max')", "('return_conductivity', 'mean')", "('return_conductivity', 'std')", "('return_conductivity', 'count')", "('return_turbidity', 'min')", "('return_turbidity', 'max')", "('return_turbidity', 'mean')", "('return_turbidity', 'std')", "('return_turbidity', 'count')", "('return_flow', 'min')", "('re