CatBoost Regression Model Class

In [1]:
# %pip install catboost==0.26  # run once; pinned to the version shown in the install log below

Collecting catboost
  Downloading catboost-0.26-cp38-none-win_amd64.whl (68.4 MB)
Collecting graphviz
  Using cached graphviz-0.16-py2.py3-none-any.whl (19 kB)
Collecting plotly
  Downloading plotly-4.14.3-py2.py3-none-any.whl (13.2 MB)
Collecting retrying>=1.3.3
  Downloading retrying-1.3.3.tar.gz (10 kB)
Building wheels for collected packages: retrying
  Building wheel for retrying (setup.py): started
  Building wheel for retrying (setup.py): finished with status 'done'
  Created wheel for retrying: filename=retrying-1.3.3-py3-none-any.whl size=11434 sha256=1aef1556e32127bc0d1a2cb334272641169bd5fd9ccd2af687a28640334d158c
  Stored in directory: c:\users\chris.dmello\appdata\local\pip\cache\wheels\c4\a7\48\0a434133f6d56e878ca511c0e6c38326907c0792f67b476e56
Successfully built retrying
Installing collected packages: graphviz, retrying, plotly, catboost
Successfully installed catboost-0.26 graphviz-0.16 plotly-4.14.3 retrying-1.3.3


In [2]:
import os

import mlflow.sklearn
import numpy as np
import pandas as pd

from catboost import CatBoostClassifier
from catboost import CatBoostRegressor

from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
class catboost_regressor():
    '''
    Convenience wrapper around ``CatBoostRegressor``.

    Every ``model_run*`` method takes a dataframe plus a ``var_dict`` naming the
    feature columns (``var_dict["independant"]``) and the target column
    (``var_dict["dependant"]``), fits the wrapped estimator (or a grid search
    over it) and returns the usual regression error measures.

    The ``*_log*`` variants train on ``log(target + 1)``. Their metrics,
    ``y_test`` and ``y_pred`` are all expressed on that log scale; predictions
    are not transformed back. The caller's dataframe is never modified.
    '''

    # Hold-out split settings shared by every method that evaluates on a test set.
    _TEST_SIZE = 0.25
    _RANDOM_STATE = 42

    def __init__(self, param=None):
        '''
        Build the wrapped CatBoostRegressor.

        param : dict of keyword arguments forwarded to ``CatBoostRegressor``
                (e.g. ``{"loss_function": "RMSE", "iterations": 100}``).
                ``None`` or an empty dict builds the estimator with its defaults.
        '''
        # Copy so that neither a shared default nor the caller's dict is aliased.
        self._param = dict(param) if param else {}
        self._rfr = CatBoostRegressor(**self._param)

    @classmethod
    def new_instance(cls, param=None):
        '''
        Alternative constructor, e.g.

            best = catboost_regressor.new_instance(model_cv.best_params_)

        param : optional dict passed straight to ``__init__``.
        '''
        return cls(param)

    @property
    def model(self):
        """
        Getter for the wrapped estimator.
        :return: the CatBoostRegressor instance held by this wrapper

        Being a property it is read as an attribute: ``wrapper.model``.
        """
        return self._rfr

    @property
    def params(self):
        """
        Getter for the constructor parameters the estimator was built with.
        """
        return self._param

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _features_and_target(df, var_dict):
        '''Return ``(X, y)`` selected from ``df`` using the ``var_dict`` column names.'''
        return df[var_dict["independant"]], df[var_dict["dependant"]]

    @staticmethod
    def _with_log_target(df, label):
        '''Return a copy of ``df`` whose ``label`` column(s) hold ``log(value + 1)``.'''
        # Work on a copy: transforming in place would change the caller's frame
        # and apply the log twice if a ``*_log*`` method were run again.
        logged = df.copy()
        logged[label] = np.log(logged[label] + 1)
        return logged

    @classmethod
    def _split(cls, X, y):
        '''Return ``X_train, X_test, y_train, y_test`` for the fixed, seeded hold-out split.'''
        return train_test_split(X, y,
                                test_size=cls._TEST_SIZE,
                                random_state=cls._RANDOM_STATE)

    def _fit(self, X, y, cat_features=None):
        '''Fit the wrapped estimator, forwarding ``cat_features`` only when some were given.'''
        if cat_features is not None and len(cat_features) > 0:
            self._rfr.fit(X, y, cat_features=list(cat_features))
        else:
            self._rfr.fit(X, y)

    @staticmethod
    def _error_metrics(y_true, y_pred, with_log_errors=False):
        '''
        Return a dict with ``mae``, ``mse``, ``rmse`` and ``r2``; when
        ``with_log_errors`` is true also ``msle`` and ``rmsle``.
        '''
        mse = metrics.mean_squared_error(y_true, y_pred)
        scores = {
            "mae": metrics.mean_absolute_error(y_true, y_pred),
            "mse": mse,
            "rmse": np.sqrt(mse),
            "r2": metrics.r2_score(y_true, y_pred),
        }
        if with_log_errors:
            try:
                msle = metrics.mean_squared_log_error(y_true, y_pred)
            except ValueError:
                # sklearn rejects negative targets/predictions for this metric.
                # Report NaN rather than throwing away an otherwise finished run.
                msle = float("nan")
            scores["msle"] = msle
            scores["rmsle"] = np.sqrt(msle)
        return scores

    @staticmethod
    def _param_grid(other_dict, *keys):
        '''Return the grid stored under the first of ``keys`` found in ``other_dict``.'''
        for key in keys:
            if key in other_dict:
                return other_dict[key]
        raise KeyError(keys[0])

    def _holdout_report(self, X, y, cat_features=None, with_log_errors=False):
        '''Split, fit on the train part, score on the test part and build the result dict.'''
        X_train, X_test, y_train, y_test = self._split(X, y)
        self._fit(X_train, y_train, cat_features)
        y_pred = self._rfr.predict(X_test)

        report = self._error_metrics(y_test, y_pred, with_log_errors)
        report["model"] = self.model
        report["y_test"] = y_test
        report["y_pred"] = y_pred
        # Whatever the estimator's own ``score`` method returns for the test part.
        report["model_score"] = self._rfr.score(X_test, y_test)
        return report

    def _grid_search_report(self, X, y, other_dict, grid_keys, verbose,
                            with_log_errors=False):
        '''Split, grid-search on the train part, score on the test part and build the result dict.'''
        X_train, X_test, y_train, y_test = self._split(X, y)

        grid_search_ad = GridSearchCV(estimator=self._rfr,
                                      param_grid=self._param_grid(other_dict, *grid_keys),
                                      scoring=other_dict["scoring"],   # scoring method
                                      cv=other_dict["cv"],             # number of folds
                                      n_jobs=-1,                       # use all cores
                                      verbose=verbose)
        grid_search_ad.fit(X_train, y_train)
        y_pred = grid_search_ad.predict(X_test)

        report = self._error_metrics(y_test, y_pred, with_log_errors)
        # GridSearchCV.score applies the configured ``scoring``, so this only
        # equals ``r2`` when the search was scored with R2.
        report["model_score"] = grid_search_ad.score(X_test, y_test)
        report["model"] = grid_search_ad
        report["y_test"] = y_test
        report["y_pred"] = y_pred
        return report

    # ------------------------------------------------------------- plain models

    def model_run(self, df, var_dict, cat_features=None, other_dict=None):
        '''
        Fit on a 75/25 hold-out split of the raw target and score on the test part.

        df           : dataframe
        var_dict     : var_dict["independant"] (features), var_dict["dependant"] (target)
        cat_features : optional categorical feature names/indices, forwarded to ``fit``
        other_dict   : unused, kept for signature compatibility

        Returns a dict with mae, mse, rmse, r2, msle, rmsle, model, y_test,
        y_pred and model_score. msle/rmsle are NaN when sklearn cannot compute
        them (negative targets or predictions).
        '''
        X, y = self._features_and_target(df, var_dict)
        return self._holdout_report(X, y, cat_features, with_log_errors=True)

    def model_run_log(self, df, var_dict, cat_features=None, other_dict=None):
        '''
        Same as ``model_run`` but trained on ``log(target + 1)``.

        df           : dataframe (left untouched)
        var_dict     : var_dict["independant"] (features), var_dict["dependant"] (target)
        cat_features : optional categorical feature names/indices, forwarded to ``fit``
        other_dict   : unused, kept for signature compatibility

        All returned metrics, ``y_test`` and ``y_pred`` are on the log scale.
        '''
        logged = self._with_log_target(df, var_dict["dependant"])
        X, y = self._features_and_target(logged, var_dict)
        return self._holdout_report(X, y, cat_features, with_log_errors=True)

    def model_run_log_complete(self, df, var_dict, cat_features=None, other_dict=None):
        '''
        Train on ALL rows with ``log(target + 1)`` as the target.

        A train/test split withholds part of the data from training; this
        variant uses every row instead. There is therefore no hold-out set:
        the reported metrics are computed on the training data itself and
        are not an estimate of generalisation error.

        df           : dataframe (left untouched)
        var_dict     : var_dict["independant"] (features), var_dict["dependant"] (target)
        cat_features : optional categorical feature names/indices, forwarded to ``fit``
        other_dict   : unused, kept for signature compatibility

        Returns a dict with mae, mse, rmse, r2, msle, rmsle, model, y_pred and
        model_score (no ``y_test``), all on the log scale.
        '''
        logged = self._with_log_target(df, var_dict["dependant"])
        X_train, y_train = self._features_and_target(logged, var_dict)

        self._fit(X_train, y_train, cat_features)
        y_pred = self._rfr.predict(X_train)

        report = self._error_metrics(y_train, y_pred, with_log_errors=True)
        report["model"] = self.model
        report["y_pred"] = y_pred
        report["model_score"] = self._rfr.score(X_train, y_train)
        return report

    # -------------------------------------------------------------- grid search

    def model_run_cv(self, df, var_dict, other_dict=None):
        '''
        Grid-search the estimator on the train part of a hold-out split.

        df         : dataframe
        var_dict   : var_dict["independant"] (features), var_dict["dependant"] (target)
        other_dict : required keys - other_dict["parameters"] (the grid;
                     "param_grid" is accepted as an alias), other_dict["scoring"],
                     other_dict["cv"]

        sklearn maximises the score, so error measures are passed negated,
        e.g. scoring="neg_mean_absolute_error".

        Returns a dict with mae, mse, rmse, r2, model_score, model (the fitted
        GridSearchCV), y_test and y_pred. Raises KeyError when a required
        ``other_dict`` key is missing.
        '''
        other_dict = other_dict or {}
        X, y = self._features_and_target(df, var_dict)
        return self._grid_search_report(X, y, other_dict,
                                        grid_keys=("parameters", "param_grid"),
                                        verbose=100)

    def model_run_cv_log(self, df, var_dict, other_dict=None):
        '''
        Same as ``model_run_cv`` but trained on ``log(target + 1)``.

        df         : dataframe (left untouched)
        var_dict   : var_dict["independant"] (features), var_dict["dependant"] (target)
        other_dict : required keys - other_dict["param_grid"] (the grid;
                     "parameters" is accepted as an alias), other_dict["scoring"],
                     other_dict["cv"]

        sklearn maximises the score, so error measures are passed negated,
        e.g. scoring="neg_mean_absolute_error".

        Failures seen with this method, per the original notes:
        - ValueError from mean_squared_log_error on negative values
          (now reported as NaN msle/rmsle instead)
        - ValueError: Input contains NaN, infinity or a value too large for dtype('float64')
        - CatBoostError: NaN values are not supported for target
        The per-column null check printed below is there to help diagnose the
        NaN cases.

        Returns the ``model_run_cv`` keys plus msle, rmsle and
        "grid_search_ad.score" (same value as model_score).
        '''
        other_dict = other_dict or {}
        logged = self._with_log_target(df, var_dict["dependant"])

        print(logged.isnull().any())

        X, y = self._features_and_target(logged, var_dict)
        report = self._grid_search_report(X, y, other_dict,
                                          grid_keys=("param_grid", "parameters"),
                                          verbose=0,
                                          with_log_errors=True)
        report["grid_search_ad.score"] = report["model_score"]
        return report

    @staticmethod
    def feature_importance(model, independant_col_list):
        '''
        Plot a horizontal bar chart of the 30 most important features.

        model                : fitted model exposing ``feature_importances_``, or a
                               fitted search object exposing ``best_estimator_``
        independant_col_list : feature names, in the order the model was trained on
        '''
        # The cv methods return the search object; the importances live on
        # its refitted best estimator.
        fitted = getattr(model, "best_estimator_", model)

        fea_imp = pd.DataFrame({'imp': fitted.feature_importances_,
                                'col': independant_col_list})
        fea_imp = fea_imp.sort_values(['imp', 'col'], ascending=[True, False]).iloc[-30:]

        # Label the Axes pandas returns instead of going through pyplot,
        # which this file never imports.
        ax = fea_imp.plot(kind='barh', x='col', y='imp', figsize=(10, 7), legend=None)
        ax.set_title('CatBoost - Feature Importance')
        ax.set_ylabel('Features')
        ax.set_xlabel('Importance')

#----------------------------------------- MLFLOW ----------------------------------------------------------#
    def model_run_mlfow(self, df, var_dict, other_dict=None):
        '''
        Fit on a 75/25 hold-out split of the raw target and score on the test part.

        NOTE(review): despite its name this method does not log anything to
        MLflow; it behaves like ``model_run`` without msle/rmsle.

        df         : dataframe
        var_dict   : var_dict["independant"] (features), var_dict["dependant"] (target)
        other_dict : unused, kept for signature compatibility

        Returns a dict with mae, mse, rmse, r2, model, y_test, y_pred and model_score.
        '''
        X, y = self._features_and_target(df, var_dict)
        return self._holdout_report(X, y)

    def model_run__log_mlfow(self, df, var_dict, other_dict=None):
        '''
        Train on ``log(target + 1)`` inside an MLflow run and log model,
        constructor params and metrics (mae, mse, rmse, r2 - log scale).

        df         : dataframe (left untouched)
        var_dict   : var_dict["independant"] (features), var_dict["dependant"] (target)
        other_dict : required key - other_dict["run_name"], the MLflow run name

        Returns the tuple ``(experiment_id, run_id)``. Raises KeyError when
        ``run_name`` is missing.
        '''
        other_dict = other_dict or {}
        r_name = other_dict["run_name"]
        with mlflow.start_run(run_name=r_name) as run:

            # get current run and experiment id
            runID = run.info.run_uuid
            experimentID = run.info.experiment_id

            logged = self._with_log_target(df, var_dict["dependant"])
            X, y = self._features_and_target(logged, var_dict)
            X_train, X_test, y_train, y_test = self._split(X, y)

            self._rfr.fit(X_train, y_train)
            y_pred = self._rfr.predict(X_test)

            ## self.model is a getter for the model
            mlflow.sklearn.log_model(self.model, "catboost-reg-model")
            mlflow.log_params(self.params)

            scores = self._error_metrics(y_test, y_pred)
            for metric_name, metric_value in scores.items():
                mlflow.log_metric(metric_name, metric_value)

            print("-" * 100)
            print("Inside MLflow Run with run_id {} and experiment_id {}".format(runID, experimentID))
            print('Mean Absolute Error    :', scores["mae"])
            print('Mean Squared Error     :', scores["mse"])
            print('Root Mean Squared Error:', scores["rmse"])
            print('R2                     :', scores["r2"])

            return (experimentID, runID)

    def model_run_cv_mlfow(self, df, var_dict, other_dict=None):
        '''
        Grid-search the estimator on the train part of a hold-out split.

        NOTE(review): despite its name this method does not log anything to
        MLflow; it behaves like ``model_run_cv`` with a quieter ``verbose=2``.

        df         : dataframe
        var_dict   : var_dict["independant"] (features), var_dict["dependant"] (target)
        other_dict : required keys - other_dict["parameters"] (the grid;
                     "param_grid" is accepted as an alias), other_dict["scoring"],
                     other_dict["cv"]

        sklearn maximises the score, so error measures are passed negated,
        e.g. scoring="neg_mean_absolute_error".

        Returns a dict with mae, mse, rmse, r2, model_score, model (the fitted
        GridSearchCV), y_test and y_pred.
        '''
        other_dict = other_dict or {}
        X, y = self._features_and_target(df, var_dict)
        return self._grid_search_report(X, y, other_dict,
                                        grid_keys=("parameters", "param_grid"),
                                        verbose=2)

    @staticmethod
    def train_with_all_data():
        '''
        Placeholder: once the best model is found, train it with all data.

        Not implemented yet (see ``model_run_log_complete`` for the log-target
        variant). Declared static so the stub can be called from both the
        class and an instance until it gets a real signature.
        '''
        pass