In [12]:
import pandas as pd
import numpy as np
%matplotlib inline

from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [3]:
# Split the data into training and testing sets
# Convert the date-time column into hour, day of week, day of year, and month
# Split the data into non-violent (NV) and violent (V) targets

In [1]:
class RandomForestModel:
    """
    Feature engineering, tuning and fitting of a random-forest regressor.

    The instance owns a pandas DataFrame (``data``) and the name(s) of the
    target column(s) (``targets``). Results of the last ``fit_model`` call are
    stored on the instance and exposed through ``output``.
    """

    # Rolling-window features: (column label, window length in rows).
    # The labels assume one row per hour -- TODO confirm against the data source.
    _ROLLING_WINDOWS = (
        ('6hour', 6),
        ('12hour', 12),
        ('24hour', 24),
        ('week', 24 * 7),
        ('30day', 24 * 30),
    )
    # Outer loop order of the rolling features (all means, then mins, ...).
    _ROLLING_STATS = ('mean', 'min', 'max', 'std')

    def __init__(self, data, targets):
        """
        Initialize the RandomForestModel.

        Parameters:
        - data: The dataset for modeling.
        - targets: The target variables to predict.
        """
        self.data = data
        self.targets = targets
        self.model = None
        self.wmape_output = None
        self.best_params = None
        self.best_score = None
        self.fit = None
        # Deliberately no ``self.wmape`` attribute: an instance attribute of
        # that name would shadow the ``wmape`` static method and break every
        # ``self.wmape(...)`` call. Baseline results live in ``wmape_output``.
        self.params = None
        self.score = None
        self.pred_wmape = None
        self.pred = None

    def update_targets(self, targets):
        """
        Update the target variables for modeling.

        Parameters:
        - targets: The new target variables to set.
        """
        self.targets = targets

    def update_data(self, data):
        """
        Update the dataset for modeling.

        Parameters:
        - data: The new dataset to set.
        """
        self.data = data

    @staticmethod
    def clean_data():
        """
        Clean the dataset (add your data cleaning logic here).
        """
        pass

    @staticmethod
    def _as_target_list(targets):
        """Return ``targets`` as a list, wrapping a single column name."""
        if isinstance(targets, str):
            return [targets]
        return list(targets)

    @staticmethod
    def _add_cycle_features(frame, target, prefix):
        """Add next/lag/diff columns for ``target`` per row, weekday and month."""
        series = frame[target]
        weekday_keys = frame['weekday']
        month_keys = frame['month']
        weekday_totals = frame.groupby('weekday')[target].agg('sum')
        month_totals = frame.groupby('month')[target].agg('sum')

        # The group totals are indexed by weekday/month value, not by row, so
        # they are mapped through the key column. Assigning them directly would
        # align on the row index and leave the columns almost entirely NaN.

        # Shift by Date Cycles
        frame[f'{prefix}next_hour'] = series.shift(-1)
        frame[f'{prefix}next_weekday'] = weekday_keys.map(weekday_totals.shift(-1))
        frame[f'{prefix}next_month'] = month_keys.map(month_totals.shift(-1))

        # Lag by Date Cycles
        frame[f'{prefix}lag_hour'] = series.shift(1)
        frame[f'{prefix}lag_weekday'] = weekday_keys.map(weekday_totals.shift(1))
        frame[f'{prefix}lag_month'] = month_keys.map(month_totals.shift(1))

        # Diff by Date Cycles
        frame[f'{prefix}hour_diff'] = series.diff(1)
        frame[f'{prefix}weekday_diff'] = weekday_keys.map(weekday_totals.diff(1))
        frame[f'{prefix}month_diff'] = month_keys.map(month_totals.diff(1))

    @classmethod
    def _add_rolling_features(cls, frame, target, prefix):
        """Add rolling mean/min/max/std columns for ``target`` to ``frame``."""
        series = frame[target]
        for stat in cls._ROLLING_STATS:
            for label, size in cls._ROLLING_WINDOWS:
                frame[f'{prefix}{label}_{stat}'] = getattr(series.rolling(size), stat)()

    def prepare_data(self, targets):
        """
        Prepare the dataset for modeling.

        For every target this adds next/lag/diff features (per row, per
        weekday total and per month total), drops rows containing NaN, and
        then adds rolling mean/min/max/std features. The rolling columns are
        added after the drop, so their leading NaN values remain in the data.

        With a single target the feature columns keep their plain names
        (``next_hour``, ``6hour_mean``, ...). With several targets each name
        is prefixed with ``<target>_`` so that the targets do not overwrite
        each other's features.

        The work is done on a copy, so the DataFrame originally passed to the
        model is not modified; ``self.data`` is rebound to the result.

        Parameters:
        - targets: The target variables to prepare the data for. The data must
          contain ``weekday`` and ``month`` columns.

        Returns:
        - data: The prepared dataset.
        """
        targets = self._as_target_list(targets)
        if not targets:
            return self.data

        use_prefix = len(targets) > 1
        prefixes = {target: (f'{target}_' if use_prefix else '') for target in targets}

        frame = self.data.copy()

        for target in targets:
            self._add_cycle_features(frame, target, prefixes[target])

        # NOTE(review): the first/last weekday and month groups have no
        # neighbour to shift from, so their rows are NaN and removed here --
        # confirm whether a wrap-around shift is wanted instead.
        frame = frame.dropna()

        for target in targets:
            self._add_rolling_features(frame, target, prefixes[target])

        self.data = frame
        return self.data

    @staticmethod
    def wmape(y_true, y_pred):
        """
        Calculate the Weighted Mean Absolute Percentage Error (WMAPE).

        Computed as sum(|y_true - y_pred|) / sum(|y_true|).

        Parameters:
        - y_true: The true target values.
        - y_pred: The predicted target values.

        Returns:
        - wmape: The WMAPE score.
        """
        return np.sum(np.abs(y_true - y_pred)) / np.sum(np.abs(y_true))

    def wmape_over_df(self, targets):
        """
        Calculate WMAPE scores for multiple target variables.

        Each target is scored against the value that follows it: the
        ``next_<target>`` column when the data has one, otherwise the target
        column shifted up by one row.

        Parameters:
        - targets: The target variables to calculate WMAPE scores for.

        Returns:
        - wmape_output: List of WMAPE scores and results.
        """
        wmape_output = []

        for target in self._as_target_list(targets):
            current = self.data[target]
            next_column = f'next_{target}'
            if next_column in self.data.columns:
                following = self.data[next_column]
            else:
                # No precomputed column: derive the following value in place.
                # The trailing NaN is skipped by the pandas sums in ``wmape``.
                following = current.shift(-1)

            wmape_stat = self.wmape(following, current)
            mean = current.mean()
            std = current.std()
            pred = mean * wmape_stat
            result = f'WMAPE Stat - {wmape_stat} | Relative WMAPE Stat - {(pred - mean) / std}'
            wmape_output.append({target: {'wmape_stat': wmape_stat, 'result': result}})

        self.wmape_output = wmape_output
        return wmape_output

    def tune_random_forest_hyperparameters(self, X, y):
        """
        Tune the hyperparameters of a Random Forest model using GridSearchCV.

        Parameters:
        - X: Input features.
        - y: Target variable.

        Returns:
        - best_params: The best hyperparameters found by GridSearchCV.
        - best_score: The best score achieved with the best hyperparameters.
        """
        param_grid = {
            'n_estimators': [50, 100, 150],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            # 1.0 (use every feature) replaces 'auto', which scikit-learn
            # deprecated in 1.1 and removed in 1.3; for regressors the two
            # are equivalent.
            'max_features': [1.0, 'sqrt', 'log2']
        }

        # NOTE(review): the forest (n_jobs=6) and the search (n_jobs=-1) both
        # parallelise; confirm this nesting is intended.
        rf = RandomForestRegressor(random_state=0, n_jobs=6)
        grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
        grid_search.fit(X, y)

        self.best_params = grid_search.best_params_
        self.best_score = grid_search.best_score_

        return self.best_params, self.best_score

    def compare_results(self, yval, features, imputer, mdl):
        """
        Score a fitted model against known target values.

        Parameters:
        - yval: The true target values for the rows of the current dataset.
        - features: The feature columns to read from the dataset.
        - imputer: A fitted imputer used to transform the feature columns.
        - mdl: A fitted model providing ``predict``.

        Returns:
        - wmape: The WMAPE between ``yval`` and the predictions.
        - p: The predictions.
        """
        Xval = imputer.transform(self.data[features])
        p = mdl.predict(Xval)
        return self.wmape(yval, p), p

    def fit_model(self, exempt):
        """
        Tune, fit and score a random forest on the current dataset.

        Every column not listed in ``exempt`` is used as a feature, so the
        target columns must be listed there to keep them out of the inputs.
        Missing feature values are filled with a default ``SimpleImputer``.

        Parameters:
        - exempt: Column names to leave out of the feature set.

        Returns:
        - output: The dictionary produced by ``output``.
        """
        features = [column for column in self.data.columns if column not in exempt]
        imputer = SimpleImputer()
        Xtr = imputer.fit_transform(self.data[features])

        ytr = self.data[self.targets]
        if isinstance(ytr, pd.DataFrame) and ytr.shape[1] == 1:
            # A one-element target list selects an (n, 1) frame, but the
            # forest predicts a 1-D array for a single output; use a Series so
            # the two can be compared.
            ytr = ytr.iloc[:, 0]

        best_params, best_score = self.tune_random_forest_hyperparameters(Xtr, ytr)

        # Fixed settings first, then whatever the search selected on top.
        settings = {'n_estimators': 100, 'random_state': 0, 'n_jobs': 6}
        settings.update(best_params or {})
        mdl = RandomForestRegressor(**settings)

        # The model has to be fitted before compare_results can call predict.
        fitted = mdl.fit(Xtr, ytr)
        pred_wmape, pred = self.compare_results(ytr, features, imputer, mdl)

        self.model = mdl
        self.fit = fitted
        self.wmape_over_df(self.targets)  # stores its result in wmape_output
        self.params = best_params
        self.score = best_score
        self.pred_wmape = pred_wmape
        self.pred = pred

        return self.output()

    def output(self):
        """
        Get the output of the RandomForestModel.

        All values are None until ``fit_model`` has run, except ``wmape``,
        which is also filled by a direct ``wmape_over_df`` call.

        Returns:
        - output: A dictionary containing model-related information.
        """
        return {
            'model': self.model,
            'fit': self.fit,
            'wmape': self.wmape_output,
            'params': self.params,
            'score': self.score,
            'pred_wmape': self.pred_wmape,
            'pred': self.pred
        }

In [28]:
# Datasets to model; one RandomForestModel is built per entry.
datasets = []
# Each model is given the same pair of target columns.
models = [RandomForestModel(frame, ['non-violent', 'violent']) for frame in datasets]