In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

In [6]:
from scipy.stats import norm
from scipy.stats import multivariate_normal as mn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from lightgbm import LGBMRegressor
from sklearn.ensemble import AdaBoostRegressor
from catboost import CatBoostRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [5]:
def clean_data(df, outlier_cols=('total_sales', 'total_revenue'), upper_quantile=0.95):
    """Index a raw frame by date, drop incomplete rows and trim high outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'date'`` column that ``pd.to_datetime`` can parse, plus
        every column named in ``outlier_cols``.
    outlier_cols : str or iterable of str, optional
        Columns to trim, processed in the given order. Defaults to the two
        columns the function originally hard-coded.
    upper_quantile : float, optional
        Rows whose value is greater than or equal to this quantile of the
        column are removed. Defaults to ``0.95``.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by a sorted ``DatetimeIndex``; the caller's ``df``
        is not modified.

    Raises
    ------
    KeyError
        If ``'date'`` or one of ``outlier_cols`` is missing from ``df``.
    """
    if isinstance(outlier_cols, str):
        # A bare column name would otherwise be iterated character by character.
        outlier_cols = (outlier_cols,)

    cleaned = df.set_index('date')
    cleaned.index = pd.to_datetime(cleaned.index)
    cleaned = cleaned.sort_index().dropna()

    # Trimming is sequential: each column's cut-off is computed on the rows that
    # survived the previous column, which is what the original code did.
    for col in outlier_cols:
        cleaned = cleaned[cleaned[col] < cleaned[col].quantile(upper_quantile)]

    return cleaned

In [4]:
# Calendar features derived from the frame's DatetimeIndex
def create_date_features(df):
    """Return a copy of ``df`` with calendar columns read off its index.

    Adds ``dayofweek``, ``quarter``, ``month``, ``year``, ``dayofyear``,
    ``dayofmonth`` and ``weekofyear`` (ISO week number, cast to ``int``).
    The index must expose the datetime accessors used below; the input frame
    is left unchanged.
    """
    stamps = df.index
    featured = df.copy()

    # Output column name -> index attribute it is read from.
    calendar_attrs = {
        'dayofweek': 'dayofweek',
        'quarter': 'quarter',
        'month': 'month',
        'year': 'year',
        'dayofyear': 'dayofyear',
        'dayofmonth': 'day',
    }
    for column, attr in calendar_attrs.items():
        featured[column] = getattr(stamps, attr)

    # isocalendar() yields an unsigned nullable dtype; store a plain int column.
    featured['weekofyear'] = stamps.isocalendar().week.astype(int)
    return featured

In [5]:
# Method for creating lag features (default lags: 7, 14, 28 and 364 rows)
# 364 was chosen instead of 365 so that, on gap-free daily data, the lag falls on the same day of week
def create_lag_features(df, col, lags=(7, 14, 28, 364)):
    """Return a copy of ``df`` with lagged versions of ``col`` appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is copied, so the caller's frame gains no columns.
    col : str
        Column to lag.
    lags : iterable of int, optional
        Row offsets passed to ``Series.shift``. The shift is positional (rows,
        not calendar days), so the "same weekday" property only holds when the
        frame has exactly one row per day.

    Returns
    -------
    pandas.DataFrame
        The copy with one ``f'{col}_lag_{lag}'`` column per lag. Every missing
        value in the frame -- not only those created by the shift -- is
        replaced by 0.
    """
    lagged = df.copy()
    for lag in lags:
        lagged[f'{col}_lag_{lag}'] = lagged[col].shift(lag)

    return lagged.fillna(0)

In [6]:
# Method for creating rolling statistics features
def create_rolling_features(df, target, windows=(7, 14, 28, 364)):
    """Return a copy of ``df`` with rolling statistics of ``target`` appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that receives the new columns; it is copied, so the caller's
        frame is not modified.
    target : pandas.Series
        Series the statistics are computed from. Each window ends at, and
        includes, the current row. If ``target`` is the value a model will
        predict, pass a shifted series (e.g. ``target.shift(1)``) so the
        features do not contain the answer.
    windows : iterable of int, optional
        Window lengths in rows.

    Returns
    -------
    pandas.DataFrame
        The copy with ``rolling_mean_{w}``, ``rolling_median_{w}``,
        ``rolling_min_{w}``, ``rolling_max_{w}`` and ``rolling_std_{w}`` for
        each window ``w``. Every missing value in the frame, including the
        incomplete leading windows, is replaced by 0.
    """
    rolled = df.copy()
    for w in windows:
        # Build the Rolling object once per window instead of once per statistic.
        window = target.rolling(window=w)
        rolled[f'rolling_mean_{w}'] = window.mean()
        rolled[f'rolling_median_{w}'] = window.median()
        rolled[f'rolling_min_{w}'] = window.min()
        rolled[f'rolling_max_{w}'] = window.max()
        rolled[f'rolling_std_{w}'] = window.std()

    return rolled.fillna(0)

In [11]:
def one_hot_encode(df, column):
    """Return ``df`` with one-hot indicator columns for ``column`` appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is not modified; in particular its index is kept
        (the previous implementation reset the caller's index in place).
    column : str
        Name of the categorical column to encode. The original column is kept.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``df`` and one extra
        ``f"{column}_{category}"`` column per category found by the encoder.
    """
    encoder = OneHotEncoder()
    # The encoder returns a SciPy sparse matrix by default; densify it here
    # rather than passing `sparse=False`, a keyword that scikit-learn renamed to
    # `sparse_output` in 1.2 and removed in 1.4.
    encoded = encoder.fit_transform(df[[column]]).toarray()

    column_names = [f"{column}_{category}" for category in encoder.categories_[0]]
    encoded_df = pd.DataFrame(encoded, columns=column_names)

    # Concatenate positionally on throw-away RangeIndexes (safe even when the
    # original index has duplicates), then restore the original index.
    combined = pd.concat([df.reset_index(drop=True), encoded_df], axis=1)
    combined.index = df.index
    return combined

In [6]:
# Random Forest Regressor Model
# Returns the fitted model, the test-set predictions and the test-set targets
def model_rf(X, Y, training_size):
    """Fit a ``RandomForestRegressor`` on the first ``training_size`` complete rows.

    A row is dropped from ``X`` and ``Y`` together when either holds a missing
    value, so features and targets stay paired row-for-row. (Dropping NaNs from
    each independently could pair a feature row with another row's target.)

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows, in chronological order.
    Y : pandas.Series or pandas.DataFrame
        Targets, positionally aligned with ``X``.
    training_size : int
        Number of leading complete rows used for fitting; the rest is the test set.

    Returns
    -------
    tuple
        ``(fitted_model, predictions_for_test_rows, Y_test)``.

    Raises
    ------
    ValueError
        If ``X`` and ``Y`` do not have the same number of rows.
    """
    if len(X) != len(Y):
        raise ValueError(
            f'X and Y must have the same number of rows, got {len(X)} and {len(Y)}'
        )
    # Positional mask of rows that are complete in both inputs.
    complete = (pd.DataFrame(X).notna().all(axis=1).to_numpy()
                & pd.DataFrame(Y).notna().all(axis=1).to_numpy())
    X, Y = X[complete], Y[complete]

    X_train, X_test = X.iloc[:training_size], X.iloc[training_size:]
    Y_train, Y_test = Y.iloc[:training_size], Y.iloc[training_size:]

    forest = RandomForestRegressor(random_state=42)
    fitted = forest.fit(X_train, Y_train)
    return fitted, fitted.predict(X_test), Y_test

In [7]:
# XGBoost Regressor Model
# Returns the fitted model, the test-set predictions and the test-set targets
def model_xgb_regressor(X, Y, training_size):
    """Fit an ``XGBRegressor`` on the first ``training_size`` complete rows.

    A row is dropped from ``X`` and ``Y`` together when either holds a missing
    value, so features and targets stay paired row-for-row. (Dropping NaNs from
    each independently could pair a feature row with another row's target.)

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows, in chronological order.
    Y : pandas.Series or pandas.DataFrame
        Targets, positionally aligned with ``X``.
    training_size : int
        Number of leading complete rows used for fitting; the rest is the test set.

    Returns
    -------
    tuple
        ``(fitted_model, predictions_for_test_rows, Y_test)``.

    Raises
    ------
    ValueError
        If ``X`` and ``Y`` do not have the same number of rows.
    """
    if len(X) != len(Y):
        raise ValueError(
            f'X and Y must have the same number of rows, got {len(X)} and {len(Y)}'
        )
    # Positional mask of rows that are complete in both inputs.
    complete = (pd.DataFrame(X).notna().all(axis=1).to_numpy()
                & pd.DataFrame(Y).notna().all(axis=1).to_numpy())
    X, Y = X[complete], Y[complete]

    X_train, X_test = X.iloc[:training_size], X.iloc[training_size:]
    Y_train, Y_test = Y.iloc[:training_size], Y.iloc[training_size:]

    booster = XGBRegressor(random_state=42)
    fitted = booster.fit(X_train, Y_train)
    return fitted, fitted.predict(X_test), Y_test

In [8]:
# LightGBM Regressor Model
# Returns the fitted model, the test-set predictions and the test-set targets
def model_lgbm_regressor(X, Y, training_size):
    """Fit an ``LGBMRegressor`` on the first ``training_size`` complete rows.

    A row is dropped from ``X`` and ``Y`` together when either holds a missing
    value, so features and targets stay paired row-for-row. (Dropping NaNs from
    each independently could pair a feature row with another row's target.)

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows, in chronological order.
    Y : pandas.Series or pandas.DataFrame
        Targets, positionally aligned with ``X``.
    training_size : int
        Number of leading complete rows used for fitting; the rest is the test set.

    Returns
    -------
    tuple
        ``(fitted_model, predictions_for_test_rows, Y_test)``.

    Raises
    ------
    ValueError
        If ``X`` and ``Y`` do not have the same number of rows.
    """
    if len(X) != len(Y):
        raise ValueError(
            f'X and Y must have the same number of rows, got {len(X)} and {len(Y)}'
        )
    # Positional mask of rows that are complete in both inputs.
    complete = (pd.DataFrame(X).notna().all(axis=1).to_numpy()
                & pd.DataFrame(Y).notna().all(axis=1).to_numpy())
    X, Y = X[complete], Y[complete]

    X_train, X_test = X.iloc[:training_size], X.iloc[training_size:]
    Y_train, Y_test = Y.iloc[:training_size], Y.iloc[training_size:]

    booster = LGBMRegressor(random_state=42)
    fitted = booster.fit(X_train, Y_train)
    return fitted, fitted.predict(X_test), Y_test

In [13]:
# AdaBoost Regressor Model
# Returns the fitted model, the test-set predictions and the test-set targets
def model_ab_regressor(X, Y, training_size):
    """Fit an ``AdaBoostRegressor`` on the first ``training_size`` complete rows.

    A row is dropped from ``X`` and ``Y`` together when either holds a missing
    value, so features and targets stay paired row-for-row. (Dropping NaNs from
    each independently could pair a feature row with another row's target.)

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows, in chronological order.
    Y : pandas.Series or pandas.DataFrame
        Targets, positionally aligned with ``X``.
    training_size : int
        Number of leading complete rows used for fitting; the rest is the test set.

    Returns
    -------
    tuple
        ``(fitted_model, predictions_for_test_rows, Y_test)``.

    Raises
    ------
    ValueError
        If ``X`` and ``Y`` do not have the same number of rows.
    """
    if len(X) != len(Y):
        raise ValueError(
            f'X and Y must have the same number of rows, got {len(X)} and {len(Y)}'
        )
    # Positional mask of rows that are complete in both inputs.
    complete = (pd.DataFrame(X).notna().all(axis=1).to_numpy()
                & pd.DataFrame(Y).notna().all(axis=1).to_numpy())
    X, Y = X[complete], Y[complete]

    X_train, X_test = X.iloc[:training_size], X.iloc[training_size:]
    Y_train, Y_test = Y.iloc[:training_size], Y.iloc[training_size:]

    booster = AdaBoostRegressor(random_state=42)
    fitted = booster.fit(X_train, Y_train)
    return fitted, fitted.predict(X_test), Y_test

In [15]:
# CatBoost Regressor Model
# Returns the fitted model, the test-set predictions and the test-set targets
def model_catb_regressor(X, Y, training_size):
    """Fit a ``CatBoostRegressor`` on the first ``training_size`` complete rows.

    A row is dropped from ``X`` and ``Y`` together when either holds a missing
    value, so features and targets stay paired row-for-row. (Dropping NaNs from
    each independently could pair a feature row with another row's target.)

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows, in chronological order.
    Y : pandas.Series or pandas.DataFrame
        Targets, positionally aligned with ``X``.
    training_size : int
        Number of leading complete rows used for fitting; the rest is the test set.

    Returns
    -------
    tuple
        ``(fitted_model, predictions_for_test_rows, Y_test)``.

    Raises
    ------
    ValueError
        If ``X`` and ``Y`` do not have the same number of rows.
    """
    if len(X) != len(Y):
        raise ValueError(
            f'X and Y must have the same number of rows, got {len(X)} and {len(Y)}'
        )
    # Positional mask of rows that are complete in both inputs.
    complete = (pd.DataFrame(X).notna().all(axis=1).to_numpy()
                & pd.DataFrame(Y).notna().all(axis=1).to_numpy())
    X, Y = X[complete], Y[complete]

    X_train, X_test = X.iloc[:training_size], X.iloc[training_size:]
    Y_train, Y_test = Y.iloc[:training_size], Y.iloc[training_size:]

    booster = CatBoostRegressor(random_state=42)
    fitted = booster.fit(X_train, Y_train)
    return fitted, fitted.predict(X_test), Y_test

In [4]:
# Method for plotting predictions against actual values
def plot_predictions(Y_test, predictions):
    """Scatter predicted against actual values with a ``y = x`` reference line.

    Parameters
    ----------
    Y_test : pandas.Series or numpy.ndarray
        Actual target values (y axis). Must provide ``.min()`` and ``.max()``.
    predictions : array-like
        Model predictions (x axis), in the same order as ``Y_test``.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(8, 4))

    # The points have no meaningful order, so draw markers rather than a line
    # joining consecutive observations.
    ax.scatter(predictions, Y_test, alpha=0.3)

    # Perfect-prediction diagonal across the range of the actual values.
    low, high = Y_test.min(), Y_test.max()
    ax.plot([low, high], [low, high], color='C1')

    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title('Predictions')
    plt.show()

In [10]:
def plot_timeseries_pred(Y_train, Y_test, predictions):
    """Plot the training targets, the test targets and the test predictions over time.

    ``Y_train`` and ``Y_test`` supply their own index as the x axis;
    ``predictions`` is drawn against the index of ``Y_test``. The figure is
    shown with ``plt.show()`` and nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(15, 6))

    # (x values, y values, legend label, line colour), drawn in this order.
    traces = (
        (Y_train.index, Y_train, 'Train', 'green'),
        (Y_test.index, Y_test, 'Test', 'red'),
        (Y_test.index, predictions, 'Predicted', 'blue'),
    )
    for x_values, y_values, label, color in traces:
        ax.plot(x_values, y_values, label=label, color=color)

    ax.set_xlabel('Date')
    ax.set_ylabel('Target')
    ax.set_title('Target Actual vs. Predicted')
    ax.legend()
    plt.show()

In [10]:
# Method for calculating metrics
# Returns RMSE and R2 in a one-row dataframe
def get_metrics(Y_test, prediction):
    """Compute RMSE and R2 for a set of predictions.

    Parameters
    ----------
    Y_test : array-like
        Actual target values.
    prediction : array-like
        Predicted values, in the same order as ``Y_test``.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``'RMSE'`` and ``'R2'``.
    """
    # `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4
    # and removed in 1.6. Taking the root of the per-output MSE and averaging
    # gives the same number on every release.
    per_output_mse = mean_squared_error(Y_test, prediction, multioutput='raw_values')
    rmse = float(np.mean(np.sqrt(per_output_mse)))
    r2 = r2_score(Y_test, prediction)
    return pd.DataFrame([[rmse, r2]], columns=['RMSE', 'R2'])

In [1]:
def pipe(features, target, train_size):
    """Fit four boosted regressors behind a ``RobustScaler`` and score each one.

    The first ``train_size`` rows are used for fitting and the remaining rows
    for scoring; rows are split by position, with no shuffling.

    Parameters
    ----------
    features : pandas.DataFrame or array-like supporting slicing
        Feature rows, in chronological order.
    target : pandas.Series or array-like supporting slicing
        Targets aligned with ``features``.
    train_size : int
        Number of leading rows used for training.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``'Model'``, ``'RMSE'`` and ``'R2'``,
        sorted by ascending RMSE.
    """
    X_train, X_test = features[:train_size], features[train_size:]
    Y_train, Y_test = target[:train_size], target[train_size:]

    # Display name -> estimator; keeping them together means a name can never
    # drift out of step with its model.
    candidates = {
        'XGBoost': XGBRegressor(random_state=42),
        'LGBM': LGBMRegressor(random_state=42),
        'AdaBoost': AdaBoostRegressor(random_state=42),
        'CatBoost': CatBoostRegressor(random_state=42),
    }

    scores = []
    for model_name, model in candidates.items():
        # Named `pipeline`, not `pipe`, so it does not shadow this function.
        pipeline = Pipeline([
            ('scaler', RobustScaler()),
            ('models', model)
        ])
        pipeline.fit(X_train, Y_train)
        predicted = pipeline.predict(X_test)

        # `squared=False` was removed from mean_squared_error in scikit-learn
        # 1.6; root-then-average of the per-output MSE gives the same value.
        per_output_mse = mean_squared_error(Y_test, predicted, multioutput='raw_values')
        scores.append({
            'Model': model_name,
            'RMSE': float(np.mean(np.sqrt(per_output_mse))),
            'R2': r2_score(Y_test, predicted),
        })

    return pd.DataFrame(scores, columns=['Model', 'RMSE', 'R2']).sort_values(by='RMSE')

In [1]:
def build_all_models(list_df_names, list_target_cols):
    """Clean, featurise and benchmark every (frame, target) combination.

    Parameters
    ----------
    list_df_names : list of str
        Names of DataFrames looked up in this module's / notebook's globals.
        Each frame is copied before processing.
    list_target_cols : list of str
        Target columns. All of them are removed from the feature matrix; one
        set of models is trained per target.

    Returns
    -------
    list of pandas.DataFrame
        One metrics frame per (frame, target) pair, as returned by ``pipe``,
        with added ``'DataFrame'`` and ``'Target'`` columns.

    Raises
    ------
    KeyError
        If a name in ``list_df_names`` is not defined in the global namespace.
    """
    model_metrics = []
    for df_name in list_df_names:
        # Cleaning and calendar features are shared by every target.
        base = create_date_features(clean_data(globals()[df_name].copy()))

        for label in list_target_cols:
            # Start each target from a fresh copy of the base frame. Re-using
            # one growing frame gave later targets the lag columns of earlier
            # ones, so the feature set depended on the order of the targets.
            df = create_lag_features(base.copy(), label)

            # Shift the target by one row so each rolling window ends at the
            # previous observation; a window that includes the current row
            # would hand the model the value it is asked to predict.
            df = create_rolling_features(df, df[label].shift(1))

            features = df.drop(list_target_cols, axis=1)
            target = df[label]
            # Chronological 80/20 split by row position.
            train_s = round(len(df) * 0.80)

            df_pipeline = pipe(features, target, train_s)

            df_pipeline['DataFrame'] = df_name
            df_pipeline['Target'] = label
            model_metrics.append(df_pipeline)
            print('model completed')
    return model_metrics