In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

In [5]:
from scipy.stats import norm
from scipy.stats import multivariate_normal as mn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.ensemble import AdaBoostRegressor
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel 
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression

In [6]:
def clean_data(df):
    """Index a raw frame by date, trim outliers and keep complete numeric rows.

    Steps, in order:
      1. Move the 'date' column into a sorted ``DatetimeIndex``.
      2. Keep rows whose 'total_sales' is strictly below its 95th percentile.
      3. Keep rows whose 'total_revenue' is strictly below the 95th percentile
         of the rows that survived step 2.
      4. Drop every object-dtype column.
      5. Drop rows that still contain a missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'date', 'total_sales' and 'total_revenue' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    # Setting the index to date
    indexed = df.set_index('date')
    indexed.index = pd.to_datetime(indexed.index)
    indexed = indexed.sort_index()

    # Removing outliers. The filters are sequential: the revenue threshold is
    # computed on what is left after the sales filter.
    trimmed = indexed[indexed['total_sales'] < indexed['total_sales'].quantile(0.95)]
    trimmed = trimmed[trimmed['total_revenue'] < trimmed['total_revenue'].quantile(0.95)]

    # Dropping object datatype columns. Non in-place calls are used because
    # `trimmed` is a filtered slice and in-place edits on it trigger
    # SettingWithCopyWarning.
    object_columns = trimmed.dtypes[trimmed.dtypes == 'object'].index
    numeric_only = trimmed.drop(columns=object_columns)

    return numeric_only.dropna()

In [4]:
# Method for deriving calendar features from the frame's index
def create_date_features(df):
    """Return a copy of `df` with calendar columns derived from its index.

    The index must expose datetime accessors (``dayofweek``, ``quarter``,
    ``isocalendar()`` and so on). Columns are added in this order:
    'dayofweek', 'quarter', 'month', 'year', 'dayofyear', 'dayofmonth',
    'weekofyear' (ISO week number cast to int). The input is not modified.
    """
    stamps = df.index
    iso_week = stamps.isocalendar().week.astype(int)
    return df.assign(
        dayofweek=stamps.dayofweek,
        quarter=stamps.quarter,
        month=stamps.month,
        year=stamps.year,
        dayofyear=stamps.dayofyear,
        dayofmonth=stamps.day,
        weekofyear=iso_week,
    )

In [16]:
# Method for creating lag features.
# Only a 7-row lag is built by default; pass `lags` for more (e.g. 7, 14, 28, 364).
# 364 rather than 365 keeps a yearly lag on the same day of week for daily data.
def create_lag_features(df, col, lags=(7,)):
    """Return a copy of `df` with lagged versions of `col` appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    col : str
        Name of the column to lag.
    lags : iterable of int, default (7,)
        Lag sizes. Each one adds a column named ``f'{col}_lag_{lag}'``.

    Returns
    -------
    pandas.DataFrame
        The copy with lag columns added and every NaN replaced by 0.

    Notes
    -----
    * ``shift`` is positional: a lag of 7 means 7 rows back, which equals
      7 days only when the index has one row per day with no gaps.
    * ``fillna(0)`` is applied to the whole frame, not only to the new columns.
    """
    lagged = df.copy()
    for lag in lags:
        lagged[f'{col}_lag_{lag}'] = lagged[col].shift(lag)

    return lagged.fillna(0)

In [15]:
# Method for creating rolling statistics features
def create_rolling_features(df, target, windows=(7,)):
    """Return a copy of `df` with rolling statistics of `target` appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    target : pandas.Series
        Series the statistics are computed on. It is aligned to `df` by
        index when the new columns are assigned.
    windows : iterable of int, default (7,)
        Window sizes in rows. Each one adds 'rolling_mean_{w}',
        'rolling_median_{w}', 'rolling_min_{w}', 'rolling_max_{w}' and
        'rolling_std_{w}'.

    Returns
    -------
    pandas.DataFrame
        The copy with the rolling columns added and every NaN replaced by 0.

    Notes
    -----
    * Column names do not include the target's name, so calling this twice
      with the same window overwrites the earlier columns.
    * ``fillna(0)`` is applied to the whole frame, not only to the new columns.
    * NOTE(review): each window includes the current row, so when `target` is
      the column being predicted these features contain the current target
      value. Confirm this is intended.
    """
    enriched = df.copy()
    for w in windows:
        # Build the rolling object once per window and reuse it for every statistic.
        roll = target.rolling(window=w)
        enriched[f'rolling_mean_{w}'] = roll.mean()
        enriched[f'rolling_median_{w}'] = roll.median()
        enriched[f'rolling_min_{w}'] = roll.min()
        enriched[f'rolling_max_{w}'] = roll.max()
        enriched[f'rolling_std_{w}'] = roll.std()

    return enriched.fillna(0)

In [11]:
def one_hot_encode(df, column):
    """Return `df` with one-hot indicator columns for `column` appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is not modified; in particular its index is kept.
    column : str
        Name of the column to encode. The original column is kept in the result.

    Returns
    -------
    pandas.DataFrame
        A new frame carrying `df`'s index, with one extra column per category
        named ``f"{column}_{category}"``.
    """
    try:
        ohe = OneHotEncoder(sparse_output=False)
    except TypeError:
        # scikit-learn < 1.2 only knows the older `sparse` keyword.
        ohe = OneHotEncoder(sparse=False)
    encoded_features = ohe.fit_transform(df[[column]])

    categories = ohe.categories_[0]
    column_names = [f"{column}_{category}" for category in categories]
    encoded_df = pd.DataFrame(encoded_features, columns=column_names)

    # Concatenate row-by-position on a re-indexed copy, then restore the
    # original index. Using a copy keeps the caller's frame (and its index)
    # intact instead of resetting it in place.
    combined = pd.concat([df.reset_index(drop=True), encoded_df], axis=1)
    combined.index = df.index
    return combined

In [4]:
# Method for label encoding
def label_encoder(column):
    """Encode the values of `column` with a freshly fitted ``LabelEncoder``.

    Parameters
    ----------
    column : array-like
        Values to encode.

    Returns
    -------
    The result of ``LabelEncoder().fit_transform(column)``. The fitted encoder
    is discarded, so the code-to-class mapping is not available to the caller.
    """
    return LabelEncoder().fit_transform(column)

In [None]:
def rename_duplicates(df):
    """Make the column labels of `df` distinct by suffixing repeated names.

    The first occurrence of a name is kept as is; the k-th repeat (k >= 1)
    becomes ``f"{name}.{k}"``, e.g. ``['a', 'b', 'a', 'a']`` turns into
    ``['a', 'b', 'a.1', 'a.2']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are renamed in place.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, for chaining.

    Notes
    -----
    A generated name is not checked against existing labels, so
    ``['a', 'a', 'a.1']`` still ends up with two 'a.1' columns.
    """
    # Count occurrences in a single pass. This avoids `Index.get_loc`, which
    # returns a slice (not a boolean mask) when the duplicated labels are
    # contiguous in a monotonic index and therefore has no `.sum()`.
    occurrences = {}
    renamed = []
    for name in df.columns:
        seen = occurrences.get(name, 0)
        renamed.append(name if seen == 0 else f"{name}.{seen}")
        occurrences[name] = seen + 1

    df.columns = renamed
    return df

In [4]:
def pipe_classifier_report(features, target):
    """Fit several classifiers behind a common pipeline and rank them by accuracy.

    The data is split 75/25 with ``train_test_split(random_state=42)``. Each
    model is wrapped in a pipeline of ``MinMaxScaler`` -> ``SelectFromModel``
    (random-forest based) -> model, fitted on the training part and scored on
    the held-out part.

    Parameters
    ----------
    features : array-like
        Feature matrix.
    target : array-like
        Class labels.

    Returns
    -------
    pandas.DataFrame
        Columns 'Model' (estimator class name) and 'Accuracy' (the pipeline's
        ``score`` on the test split), sorted by 'Accuracy' descending.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.25, random_state=42
    )

    models = [KNeighborsClassifier(3), SVC(kernel='rbf', gamma='auto', random_state=42), LGBMClassifier(random_state=42),
            RandomForestClassifier(random_state=42), XGBClassifier(random_state=42), CatBoostClassifier(random_state=42)]

    # The same selector instance is reused by every pipeline; it is refitted
    # on each `pipe.fit` call.
    select = SelectFromModel(estimator=RandomForestClassifier(random_state=42))

    results = []
    for model in models:
        pipe = Pipeline([
            ('scaler', MinMaxScaler()),
            ('feature selection', select),
            ('models', model)
        ])

        pipe.fit(X_train, y_train)
        # `score` predicts internally, so a separate predict() whose result is
        # thrown away would only double the inference cost.
        results.append({
            'Model': model.__class__.__name__,
            'Accuracy': pipe.score(X_test, y_test),
        })

    df_pipeline = pd.DataFrame(results, columns=['Model', 'Accuracy'])
    return df_pipeline.sort_values(by='Accuracy', ascending=False)

In [21]:
def pipe_regressor(X_train, y_train, model):
    """Fit `model` behind a ``MinMaxScaler`` and return the fitted pipeline.

    The pipeline has two steps, 'scaler' and 'model'. The value returned is
    whatever ``Pipeline.fit(X_train, y_train)`` returns.
    """
    steps = [
        ('scaler', MinMaxScaler()),
        ('model', model),
    ]
    return Pipeline(steps).fit(X_train, y_train)
    

In [6]:
def pipe_regressor_report(features, target, train_size):
    """Fit several regressors on a chronological split and rank them by RMSE.

    The first `train_size` rows are used for training and the remainder for
    testing (no shuffling). Each model is wrapped in a pipeline of
    ``MinMaxScaler`` -> model.

    Parameters
    ----------
    features : sliceable (e.g. pandas.DataFrame)
        Feature rows, sliced as ``features[:train_size]`` / ``features[train_size:]``.
    target : sliceable (e.g. pandas.Series)
        Target values, sliced the same way. Expected to be a single output;
        see the RMSE note in the body.
    train_size : int
        Number of leading rows used for training.

    Returns
    -------
    pandas.DataFrame
        Columns 'Model', 'RMSE' and 'R2', sorted by 'RMSE' ascending.
    """
    X_train, X_test = features[:train_size], features[train_size:]
    Y_train, Y_test = target[:train_size], target[train_size:]

    # Names and estimators are kept together so they cannot drift apart.
    models = {
        'XGBoost': XGBRegressor(random_state=42, n_jobs=-1),
        'LGBM': LGBMRegressor(random_state=42, n_jobs=-1),
        'RF': RandomForestRegressor(random_state=42),
        'CatBoost': CatBoostRegressor(random_state=42),
    }

    rows = []
    for name, model in models.items():
        pipe = Pipeline([
            ('scaler', MinMaxScaler()),
            ('models', model)
        ])

        pipe.fit(X_train, Y_train)
        pipe_pred = pipe.predict(X_test)
        # RMSE is taken as sqrt(MSE) rather than via `squared=False`, which
        # newer scikit-learn releases deprecate and then remove. For a single
        # target the two are identical.
        rows.append({
            'Model': name,
            'RMSE': float(np.sqrt(mean_squared_error(Y_test, pipe_pred))),
            'R2': r2_score(Y_test, pipe_pred),
        })

    df_pipeline = pd.DataFrame(rows, columns=['Model', 'RMSE', 'R2']).sort_values(by='RMSE')
    return df_pipeline

In [1]:
def build_all_models(list_df_names, list_target_cols):
    """Run the regression report for every (DataFrame, target column) pair.

    Each name in `list_df_names` is looked up in this module's global
    namespace, copied, cleaned with ``clean_data`` and given date features.
    For every target column, lag and rolling features are added and
    ``pipe_regressor_report`` is run on an 80/20 chronological split.

    Parameters
    ----------
    list_df_names : list of str
        Names of global variables holding the DataFrames to model.
    list_target_cols : list of str
        Target columns. All of them are removed from the feature matrix.

    Returns
    -------
    list of pandas.DataFrame
        One report per (DataFrame, target) pair, each with extra 'DataFrame'
        and 'Target' columns identifying the pair.

    Raises
    ------
    KeyError
        If a name in `list_df_names` is not a global variable.

    Notes
    -----
    Features accumulate across targets within one DataFrame: lag columns of
    earlier targets stay in the frame, and the rolling columns are overwritten
    by each new target.
    """
    model_metrics = []
    for df_name in list_df_names:
        if df_name not in globals():
            raise KeyError(f"no global DataFrame named {df_name!r}")
        df = globals()[df_name].copy()
        df = clean_data(df)
        df = create_date_features(df)
        for label in list_target_cols:
            df = create_lag_features(df, label)
            df = create_rolling_features(df, df[label])

            features = df.drop(list_target_cols, axis=1)
            target = df[label]
            train_s = round(len(df) * 0.80)

            # `pipe_regressor_report` is the helper with this exact
            # (features, target, train_size) signature; the name `pipe` used
            # here before is not defined as a function in this file.
            df_pipeline = pipe_regressor_report(features, target, train_s)

            df_pipeline['DataFrame'] = df_name
            df_pipeline['Target'] = label
            model_metrics.append(df_pipeline)
            print('model completed')
    return model_metrics

In [10]:
def plot_timeseries_pred(Y_train, Y_test, predictions):
    """Plot the train and test series together with the test-period predictions.

    `Y_train` and `Y_test` supply their own x positions through ``.index``;
    `predictions` is drawn against ``Y_test.index``. The figure is shown with
    ``plt.show()`` and nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(15, 6))

    # (x values, y values, legend label, colour) for each line, in draw order.
    series = [
        (Y_train.index, Y_train, 'Train', 'green'),
        (Y_test.index, Y_test, 'Test', 'red'),
        (Y_test.index, predictions, 'Predicted', 'blue'),
    ]
    for x_values, y_values, label, color in series:
        ax.plot(x_values, y_values, label=label, color=color)

    ax.set_xlabel('Date')
    ax.set_ylabel('Target')
    ax.set_title('Target Actual vs. Predicted')
    ax.legend()
    plt.show()

In [10]:
# Method for calculating metrics
# Returns metrics in a dataframe
def get_metrics(Y_test, prediction):
    """Compute RMSE and R2 for a set of predictions.

    Parameters
    ----------
    Y_test : array-like
        True target values.
    prediction : array-like
        Predicted values, same length as `Y_test`.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with columns 'RMSE' and 'R2'.
    """
    # sqrt(MSE) instead of `squared=False`: that keyword is deprecated and
    # later removed in newer scikit-learn releases. For a single target the
    # two are identical.
    rmse = float(np.sqrt(mean_squared_error(Y_test, prediction)))
    r2 = r2_score(Y_test, prediction)
    return pd.DataFrame([[rmse, r2]], columns=['RMSE', 'R2'])