In [1]:
# All packages to import
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import ensemble

from feature_engine import encoding as ce
from feature_engine import transformation as tran
from feature_engine import outliers as out
from feature_engine import selection as select

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing as prep
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import PoissonRegressor
from sklearn.ensemble import GradientBoostingRegressor


from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from regressors import stats
import math
import time
import warnings
warnings.filterwarnings("ignore")

In [4]:
#stratified - 2
df=df.sample(frac=0.2, weights='sold',random_state=1).reset_index(drop=True)

In [6]:
# Remove columns with "cy"
'''If you are using bottoms_up, make remove = []'''
def RemoveCY(df,keep=["sold"]): # keep is the variable that have cy, but you want to keep
    remove=[] # cat is also a cy variable
    for col in df.columns:
        if (col.find('_cy')>=0 or col.find('cy_')>=0) and col not in keep:
            remove.append(col)
    return df.drop(columns=remove)
# prep.FunctionTransformer(RemoveCY,kw_args={"keep":keep})

#Remove non-unique columns
def dropSingles(df):
    drops = []
    for col in df.columns:
        if len(df[col].unique())==1:
            drops.append(col)
    df = df.drop(columns=drops)
    return df
# prep.FunctionTransformer(dropSingles)

df = RemoveCY(dropSingles(df))

In [7]:
target = df['sold']
predictors = df.drop(columns = ['columns'])
X_train, X_test, y_train, y_test = train_test_split(
    predictors, # predictors
    target,  # target
    test_size=0.3,  # percentage of obs in test set
    random_state=0)  # seed to ensure reproducibility

In [8]:
categories = []
discrete = []
PosContinuous = []
NegContinuous = [#undisclosed] # These will change
continuous = []
for col in X_train.columns:
    if col in NegContinuous:
        continuous.append(col)
    elif X_train[col].dtype.name =="object":
        categories.append(col)
    elif X_train[col].dtype.name == "int64":
        discrete.append(col)
    else:
        PosContinuous.append(col)
        continuous.append(col)

In [9]:
# Remove high correlation
def highCorr(df,keep,cutOff):
    '''df is the dataframe
        keep is a list of variables to not include in removing correlation
            probably a single target variable, but still put in list
        cutOff is the correlation cut off to remove'''
    df = df.drop(columns=keep)
    corr = df.corr()
    variables = corr.columns
    correlated_features = set()
    for r in range(len(variables)):
        for c in range(r):
            if abs(corr.iloc[r,c])>cutOff:
                colname = variables[r]
                correlated_features.add(colname)
    return df.drop(columns = correlated_features)
# prep.FunctionTransformer(highCorr,kw_args={"keep":keep,"cutOff":0.95})

# Less than zero
def MakePos(df,negs = None): #negs is the list of numeric columns to retain negative values
    for col in df. columns:
        if df[col].dtype.name =="category" or df[col].dtype.name =="object":
            continue
        if col in negs:
            continue
        if np.min(df[col])<0:
            d = df[col].copy()
            d = np.where(d < 0,0,d)
            df[col]=d
    return df
# prep.FunctionTransformer(MakePos, kw_args={"negs": NegContinuous})


In [10]:
def log_transform(X,variables): #Works, only pass variables >=0
    result = X.copy()
    for col in result.columns:
        if col in variables:
            result[col] = np.log(result[col]+1)
    return result
# prep.FunctionTransformer(log_transform, kw_args={"variables":var})

def ratios(X, variables, tuples = False): # Works
    result = X.copy()
    if tuples:
        for n,d in variables:
            denom = np.where(result[d]==0,0.001,result[d])
            result[n+"/"+d] = result[n]/denom
    else:
        for d in variables:
            denom = np.where(X[d]==0,0.001,X[d])
            for n in variables:
                if n!=d:
                    result[n+"/"+d] = result[n]/denom
    return result
# prep.FunctionTransformer(ratios, kw_args={"variables" : var, "tuples" : False})

In [11]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
def MAPE(y_true,y_pred):
    y_true = np.where(y_true==0,0.001,y_true)
    return mean_absolute_percentage_error(y_true,y_pred)
def SMAPE(A, F):
    return 100/len(A) * np.sum(2 * np.abs(F - A) / (np.abs(A) + np.abs(F)))

In [12]:
#naive forecast

y_actual= df.qty_sold_cy # replace with y_train or y_test as needed
y_prediction = df.qty_sold_py # replace with y_pred_tr or y_pred_te as needed

print("RMSE:",math.sqrt(mean_squared_error(y_actual,y_prediction)))
#print("MAPE:",mean_absolute_percentage_error(y_actual,y_prediction))
print("MAE:",mean_absolute_error(y_actual, y_prediction))
print("R^2:",r2_score(y_actual, y_prediction))

RMSE: 215.77513124837398
MAE: 52.859421292826596
R^2: 0.32030268127507444


In [13]:
#Pipeline Test

#Some notes about pipes. They can take a variable that mapes to a function
# To skip a step, put None for the function.
# If an arguement that I made requires a list, but you don't want one, put []
# If you want to do all variables removing in the pipeline, it must happen
# after any of the general variable lists are called. (category,NegContiuous,etc.)
mod = Ridge()
pipe = Pipeline([
    ("makePos", prep.FunctionTransformer(MakePos, kw_args={"negs": NegContinuous})),
    ("rare", ce.RareLabelEncoder(tol=0.01,n_categories=7,variables=categories)),
    ("cat_encode",ce.CountFrequencyEncoder(encoding_method = "count")),
    ("num_encode",prep.FunctionTransformer(log_transform, kw_args={"variables":PosContinuous})),
    ("outlier", out.Winsorizer(capping_method = "iqr", tail = "left",fold=3,variables= continuous)),
    #("scaling",prep.StandardScaler()),
    #("nzv", select.DropConstantFeatures(tol=0.95)),
    ("filter_corr", select.DropCorrelatedFeatures(threshold=0.95)),
    #("drop_dup", select.DropDuplicateFeatures()),
    #("feature_select", select.SelectByShuffling(estimator = Ridge() , scoring = "r2", cv=5,random_state=0)),
    ("model",mod)
],verbose=True)

pipe.fit(X_train,y_train)

[Pipeline] ........... (step 1 of 7) Processing makePos, total=   0.0s
[Pipeline] .............. (step 2 of 7) Processing rare, total=   0.6s
[Pipeline] ........ (step 3 of 7) Processing cat_encode, total=   0.3s
[Pipeline] ........ (step 4 of 7) Processing num_encode, total=   0.1s
[Pipeline] ........... (step 5 of 7) Processing outlier, total=   1.0s
[Pipeline] ....... (step 6 of 7) Processing filter_corr, total=   1.4s
[Pipeline] ............. (step 7 of 7) Processing model, total=   0.2s


Pipeline(steps=[('makePos',
                 FunctionTransformer(func=<function MakePos at 0x0000024843228B80>,
                                     kw_args={'negs': ['unit_sales',
                                                       'other_unit_pls_lost_sales',
                                                       'adjusted_avg_cluster_sales',
                                                       'other_unit_pls_lost_sales_py',
                                                       'avg_cluster_unit_sales',
                                                       'ntrans_wt0_py',
                                                       'ntrans_wt0_ppy',
                                                       'weighted_lookup_cnt',
                                                       'avg_cluster_total_sales',
                                                       'adj_avg_cluster_total_sales']})),
                ('...
                                                   'adj_avg_clust

In [14]:
y_tr_pred = pipe.predict(X_train)
y_te_pred = pipe.predict(X_test)


print("RMSE TR:",math.sqrt(mean_squared_error(y_train,y_tr_pred)))
print("MAE TR:",mean_absolute_error(y_train,y_tr_pred))
print("R2 TR:",r2_score(y_train,y_tr_pred))
print("MAPE TR:",MAPE(y_train,y_tr_pred))
print("RMSE TE:",math.sqrt(mean_squared_error(y_test,y_te_pred)))
print("MAE TE:",mean_absolute_error(y_test,y_te_pred))
print("R2 TE:",r2_score(y_test,y_te_pred))
print("MAPE TE:",MAPE(y_test,y_te_pred))


#importance = model.feature_importance_
#IMPORTANCE.append(importance)
      
#pvalue_array = stats.coef_pval(model, X_test, y_test)
#pvalue_array = np.delete(pvalue_array, 0)




RMSE TR: 198.06102934984654
MAE TR: 66.27519854645193
R2 TR: 0.44650961839490144
MAPE TR: 275.63872849645986
RMSE TE: 178.3452895570488
MAE TE: 65.75975759495205
R2 TE: 0.4947936570886662
MAPE TE: 270.40613520043996


In [27]:
# Preprocessing before the loop
pipe=Pipeline([("makePos", prep.FunctionTransformer(MakePos, kw_args={"negs": NegContinuous})),
               ("rare", ce.RareLabelEncoder(tol=0.01,n_categories=7)),
               #("nzv", select.DropConstantFeatures(tol=0.95)),
               ("filter_corr", select.DropCorrelatedFeatures(threshold=0.84)),])
pipe.fit(X_train,y_train)

X_train = pipe.transform(X_train)
X_test = pipe.transform(X_test)

In [19]:
#Test cat for loop
cat = [ce.OneHotEncoder(top_categories=None, variables = categories,drop_last=True),
       ce.CountFrequencyEncoder(encoding_method = "count",variables = categories),
       ce.MeanEncoder(variables = categories)]


for i in range(len(cat)):

    c= cat[i]
       
    pipe = Pipeline([
        ("makePos", prep.FunctionTransformer(MakePos, kw_args={"negs": NegContinuous})),
        ("rare", ce.RareLabelEncoder(tol=0.01,n_categories=7,variables=categories)),
        ("cat_encode",c),
        ("num_encode",prep.FunctionTransformer(log_transform, kw_args={"variables":PosContinuous})),
        ("outlier", out.Winsorizer(capping_method = "quantiles", tail = "both",fold=0.01,variables=continuous)),
        ("filter_corr", prep.FunctionTransformer(highCorr,kw_args={"keep":[],"cutOff":0.9})),
        #("drop_dup", select.DropDuplicateFeatures()),
        ("feature_select", select.SelectByShuffling(estimator = m , scoring = "r2", cv=5,random_state=0)),
        ("model",Ridge())
    ],verbose=True)
    
    pipe.fit(X_train, y_train)

    y_tr_pred = pipe.predict(X_train)
    y_te_pred = pipe.predict(X_test)
    
    print(c)
    print("Train MSE", math.sqrt(mean_squared_error(y_train,y_tr_pred)))
    print("Train MAE", mean_absolute_error(y_train,y_tr_pred))
    print("Train SMAPE", SMAPE(y_train,y_tr_pred))
    print("Train R2", r2_score(y_train,y_tr_pred))
    print("Test MSE", math.sqrt(mean_squared_error(y_test,y_te_pred)))
    print("Test MAE", mean_absolute_error(y_test,y_te_pred))
    print("Train SMAPE", SMAPE(y_test,y_te_pred))
    print("Test R2", r2_score(y_test,y_te_pred))



[Pipeline] ........... (step 1 of 8) Processing makePos, total=   0.0s




[Pipeline] .............. (step 2 of 8) Processing rare, total=   0.9s
[Pipeline] ........ (step 3 of 8) Processing cat_encode, total=   0.7s
[Pipeline] ........ (step 4 of 8) Processing num_encode, total=   0.3s
[Pipeline] ........... (step 5 of 8) Processing outlier, total=   1.3s
[Pipeline] ............... (step 6 of 8) Processing nzv, total=   1.1s
[Pipeline] ....... (step 7 of 8) Processing filter_corr, total=   2.4s
[Pipeline] ............. (step 8 of 8) Processing model, total=   0.2s
OneHotEncoder(drop_last=True,
              variables=['bpg', 'store_number', 'sku_number', 'mpog_id'])
Train MSE 208.03852121267693
Train MAE 64.66301795819514
Train SMAPE 219.97196734032727
Train R2 0.3893399193829694
Test MSE 188.27132869205263
Test MAE 64.1413538339882
Train SMAPE 215.95050884863744
Test R2 0.43699288519530166
[Pipeline] ........... (step 1 of 8) Processing makePos, total=   0.1s




[Pipeline] .............. (step 2 of 8) Processing rare, total=   1.1s
[Pipeline] ........ (step 3 of 8) Processing cat_encode, total=   0.7s
[Pipeline] ........ (step 4 of 8) Processing num_encode, total=   0.4s
[Pipeline] ........... (step 5 of 8) Processing outlier, total=   1.6s
[Pipeline] ............... (step 6 of 8) Processing nzv, total=   1.6s
[Pipeline] ....... (step 7 of 8) Processing filter_corr, total=   2.0s
[Pipeline] ............. (step 8 of 8) Processing model, total=   0.2s
CountFrequencyEncoder(variables=['bpg', 'store_number', 'sku_number',
                                 'mpog_id'])
Train MSE 207.99975941796285
Train MAE 64.67346298616464
Train SMAPE 219.6179934929367
Train R2 0.38956745489341593
Test MSE 188.2199360270775
Test MAE 64.1356264361981
Train SMAPE 215.61162914555192
Test R2 0.4373002127857084
[Pipeline] ........... (step 1 of 8) Processing makePos, total=   0.1s




[Pipeline] .............. (step 2 of 8) Processing rare, total=   1.2s
[Pipeline] ........ (step 3 of 8) Processing cat_encode, total=   0.8s
[Pipeline] ........ (step 4 of 8) Processing num_encode, total=   0.4s
[Pipeline] ........... (step 5 of 8) Processing outlier, total=   1.9s
[Pipeline] ............... (step 6 of 8) Processing nzv, total=   1.0s
[Pipeline] ....... (step 7 of 8) Processing filter_corr, total=   2.0s
[Pipeline] ............. (step 8 of 8) Processing model, total=   0.2s
MeanEncoder(variables=['bpg', 'store_number', 'sku_number', 'mpog_id'])
Train MSE 207.996889888849
Train MAE 64.68316021754893
Train SMAPE 219.73460841159337
Train R2 0.3895842976232573
Test MSE 188.21715792208786
Test MAE 64.1433492863226
Train SMAPE 215.70895557140477
Test R2 0.4373168234336867


In [None]:
result1 = pd.DataFrame({"MODEL":MODEL,
                       "CATEGORICAL":CATEGORICAL,
                       "NUMERIC":NUMERIC,
                        "OUTLIER":OUTLIER,
                        #"SCALER":SCALER,
                        #"FEATURE_SELECTION":FEATURE_SELECTION,
                       "RMSE_TR":RMSE_TR,
                       "MAE_TR":MAE_TR,
                       "R2_TR":R2_TR,
                       "MAPE_TR":MAPE_TR,
                       "SMAPE_TR":SMAPE_TR,
                       "RMSE_TE":RMSE_TE,
                       "MAE_TE":MAE_TE,
                       "R2_TE":R2_TE,
                       "MAPE_TE":MAPE_TE,
                       "SMAPE_TE":SMAPE_TE})

In [None]:
results1.to_csv('results1.csv')

In [64]:
cat = [ce.OneHotEncoder(top_categories=None,drop_last=True),
      ce.CountFrequencyEncoder(encoding_method = "count"),
      ce.MeanEncoder(variables = categories)
      ]
num = [None,
       tran.YeoJohnsonTransformer(variables= continuous),
       #prep.FunctionTransformer(ratios, kw_args={"variables" : continuous, "tuples" : False}),
       #tran.LogTransformer(base="10", variables = continuous),
       prep.FunctionTransformer(log_transform, kw_args={"variables":PosContinuous})
      ]
mod = [LinearRegression()]
outlier = [None,
           out.Winsorizer(capping_method = "gaussian", tail = "both",fold=3,variables= continuous),
           out.Winsorizer(capping_method = "gaussian", tail = "right",fold=3,variables= continuous),
           out.Winsorizer(capping_method = "gaussian", tail = "left",fold=3,variables= continuous),
           out.Winsorizer(capping_method = "iqr", tail = "both",fold=3,variables= continuous),
           out.Winsorizer(capping_method = "iqr", tail = "right",fold=3,variables= continuous),
           out.Winsorizer(capping_method = "iqr", tail = "left",fold=3,variables= continuous),
           out.Winsorizer(capping_method = "quantiles", tail = "both",fold=.05,variables= continuous),
           out.Winsorizer(capping_method = "quantiles", tail = "right",fold=.05,variables= continuous),
           out.Winsorizer(capping_method = "quantiles", tail = "left",fold=.05,variables= continuous),
          ]

#scale = [prep.StandardScaler(),None]

MODEL = []
CATEGORICAL = []
NUMERIC = []
OUTLIER = []
SCALER = []
FEATURE_SELECTION = []

RMSE_TR = []
MAE_TR = []
R2_TR = []
MAPE_TR = []
SMAPE_TR= []

RMSE_TE = []
MAE_TE = []
R2_TE = []
MAPE_TE = []
SMAPE_TE= []

for m in mod: # loop through the models
    print(m)
    #feat_select = [select.SelectByShuffling(estimator = m , scoring = "r2", cv=5,random_state=0),
     #         select.SelectBySingleFeaturePerformance(estimator = m , scoring = "r2", cv=5),
     #         select.RecursiveFeatureElimination(estimator = m , scoring = "r2", cv=5,threshold=0.01),
     #        select.RecursiveFeatureAddition(estimator = m , scoring = "r2", cv=5,threshold=0.01),
     #         None]
    for n in num: # loop through the numeric transformations
        print(n)
        for c in cat: # loop through the categorical encoding
            print(c)
            for o in outlier:
                print(o)
                #for s in scale:
                    #print(s)
                #for f in feat_select:
                    #print(f)
                pipe = Pipeline([
                        ("makePos", prep.FunctionTransformer(MakePos, kw_args={"negs": NegContinuous})),
                        ("rare", ce.RareLabelEncoder(tol=0.01,n_categories=7)),
                        ("cat_encode",c),
                        ("num_encode",n),
                        ("outlier", o),
                        #("nzv", select.DropConstantFeatures(tol=0.95)),
                        ("filter_corr", select.DropCorrelatedFeatures(threshold=0.84)),
                        #("feature_select", f),
                        #("scaling",s),
                        ("model",m)
                    ],verbose=True)
                try:
                    pipe.fit(X_train,y_train)
                    print("Fit")
                    y_tr_pred = pipe.predict(X_train)
                    y_te_pred = pipe.predict(X_test)

                    MODEL.append(m)
                    CATEGORICAL.append(c)
                    NUMERIC.append(n)
                    OUTLIER.append(o)
                    #SCALER.append(s)
                    #FEATURE_SELECTION.append(f)
                    RMSE_TR.append(math.sqrt(mean_squared_error(y_train,y_tr_pred)))
                    MAE_TR.append(mean_absolute_error(y_train,y_tr_pred))
                    R2_TR.append(r2_score(y_train,y_tr_pred))
                    MAPE_TR.append(MAPE(y_train,y_tr_pred))
                    SMAPE_TR.append(SMAPE(y_train,y_tr_pred))
                    RMSE_TE.append(math.sqrt(mean_squared_error(y_test,y_te_pred)))
                    MAE_TE.append(mean_absolute_error(y_test,y_te_pred))
                    R2_TE.append(r2_score(y_test,y_te_pred))
                    MAPE_TE.append(MAPE(y_test,y_te_pred))
                    SMAPE_TE.append(SMAPE(y_test,y_te_pred))
                except:
                    MODEL.append(m)
                    CATEGORICAL.append(c)
                    NUMERIC.append(n)
                    OUTLIER.append(o)
                    #SCALER.append(s)
                    #FEATURE_SELECTION.append(f)
                    RMSE_TR.append(".")
                    MAE_TR.append(".")
                    R2_TR.append(".")
                    MAPE_TR.append(".")
                    SMAPE_TR.append(".")
                    RMSE_TE.append(".")
                    MAE_TE.append(".")
                    R2_TE.append(".")
                    MAPE_TE.append(".")
                    SMAPE_TE.append(".")

        

LinearRegression()
None
OneHotEncoder(drop_last=True)
None
[Pipeline] ........... (step 1 of 7) Processing makePos, total=   0.0s
[Pipeline] .............. (step 2 of 7) Processing rare, total=   0.7s
[Pipeline] ........ (step 3 of 7) Processing cat_encode, total=   0.5s
[Pipeline] ........ (step 4 of 7) Processing num_encode, total=   0.0s
[Pipeline] ........... (step 5 of 7) Processing outlier, total=   0.0s
[Pipeline] ....... (step 6 of 7) Processing filter_corr, total=   2.0s
[Pipeline] ............. (step 7 of 7) Processing model, total=   0.4s
Fit
Winsorizer(tail='both',
           variables=['pop_est', 'pop_density', 'total_vio',
                      'avg_cluster_unit_sales', 'adjusted_avg_cluster_sales',
                      'avg_cluster_total_sales', 'sales_signal',
                      'failure_sales', 'lifecycle', 'adjusted_lifecycle',
                      'adj_avg_cluster_total_sales', 'unit_sales',
                      'projected_growth_pct', 'other_unit_pls_lost_sale

In [65]:
results2 = pd.DataFrame({"MODEL":MODEL,
                       "CATEGORICAL":CATEGORICAL,
                       "NUMERIC":NUMERIC,
                        "OUTLIER":OUTLIER,
                        #"SCALER":SCALER,
                        #"FEATURE_SELECTION":FEATURE_SELECTION,
                       "RMSE_TR":RMSE_TR,
                       "MAE_TR":MAE_TR,
                       "R2_TR":R2_TR,
                       "MAPE_TR":MAPE_TR,
                       "SMAPE_TR":SMAPE_TR,
                       "RMSE_TE":RMSE_TE,
                       "MAE_TE":MAE_TE,
                       "R2_TE":R2_TE,
                       "MAPE_TE":MAPE_TE,
                       "SMAPE_TE":SMAPE_TE})

In [66]:
results2

Unnamed: 0,MODEL,CATEGORICAL,NUMERIC,OUTLIER,RMSE_TR,MAE_TR,R2_TR,MAPE_TR,SMAPE_TR,RMSE_TE,MAE_TE,R2_TE,MAPE_TE,SMAPE_TE
0,LinearRegression(),"OneHotEncoder(drop_last=True,\n v...",,,109.955282,30.374202,0.829414,49.010042,36.746117,135.515370,30.550626,0.708309,48.401044,36.546024
1,LinearRegression(),"OneHotEncoder(drop_last=True,\n v...",,"Winsorizer(tail='both',\n variables=...",183.717858,43.115816,0.523772,59.360881,59.691290,163.047841,42.388496,0.577744,58.745559,59.438631
2,LinearRegression(),"OneHotEncoder(drop_last=True,\n v...",,"Winsorizer(variables=['pop_est', 'pop_density'...",183.717808,43.115909,0.523772,59.361494,59.691988,163.048121,42.388744,0.577743,58.746248,59.439458
3,LinearRegression(),"OneHotEncoder(drop_last=True,\n v...",,"Winsorizer(tail='left',\n variables=...",109.955419,30.374272,0.829413,49.010061,36.746096,135.515070,30.550545,0.708311,48.401130,36.546025
4,LinearRegression(),"OneHotEncoder(drop_last=True,\n v...",,"Winsorizer(capping_method='iqr', tail='both',\...",218.539292,60.519828,0.326138,101.731953,80.374363,199.297261,60.280919,0.369118,100.870807,80.046572
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,LinearRegression(),"MeanEncoder(variables=['bpg', 'store_number', ...",FunctionTransformer(func=<function log_transfo...,"Winsorizer(capping_method='iqr',\n v...",214.485727,78.027135,0.350904,235.194473,96.785368,195.557157,77.762680,0.392575,234.212992,96.408545
86,LinearRegression(),"MeanEncoder(variables=['bpg', 'store_number', ...",FunctionTransformer(func=<function log_transfo...,"Winsorizer(capping_method='iqr', tail='left',\...",204.206880,66.490494,0.411627,180.653389,89.532037,184.911592,66.047890,0.456908,179.379251,89.000924
87,LinearRegression(),"MeanEncoder(variables=['bpg', 'store_number', ...",FunctionTransformer(func=<function log_transfo...,"Winsorizer(capping_method='quantiles', fold=0....",220.360900,62.721903,0.314857,125.487235,79.991052,201.310367,62.367232,0.356308,124.695113,79.774827
88,LinearRegression(),"MeanEncoder(variables=['bpg', 'store_number', ...",FunctionTransformer(func=<function log_transfo...,"Winsorizer(capping_method='quantiles', fold=0....",220.365562,62.906612,0.314828,127.147266,79.725769,201.425108,62.600949,0.355574,126.324473,79.583671


In [67]:
results2.to_csv('result4.csv')

In [14]:
cat = [#ce.OneHotEncoder(top_categories=None,drop_last=True),
       ce.CountFrequencyEncoder(encoding_method = "count"),
      ce.MeanEncoder(variables = categories)
      ]
num = [#None,
       tran.YeoJohnsonTransformer(variables= continuous),
       #prep.FunctionTransformer(log_transform, kw_args={"variables":PosContinuous}),
       #prep.FunctionTransformer(ratios, kw_args={"variables" : continuous, "tuples" : False}),
       #prep.LogTransformer(base='10',variables=continuous)
      ]
mod = [#Ridge(),
       LinearRegression()
      ]
outlier = [out.Winsorizer(capping_method = "gaussian", tail = "both",fold=3,variables=continuous),
           out.Winsorizer(capping_method = "gaussian", tail = "right",fold=3,variables=continuous),
           out.Winsorizer(capping_method = "gaussian", tail = "left",fold=3,variables=continuous),
           out.Winsorizer(capping_method = "iqr", tail = "both",fold=3,variables=continuous),
           out.Winsorizer(capping_method = "iqr", tail = "right",fold=3,variables=continuous),
           out.Winsorizer(capping_method = "iqr", tail = "left",fold=3,variables=continuous),
           out.Winsorizer(capping_method = "quantiles", tail = "both",fold=.05,variables=continuous),
           out.Winsorizer(capping_method = "quantiles", tail = "right",fold=.05,variables=continuous),
           out.Winsorizer(capping_method = "quantiles", tail = "left",fold=.05,variables=continuous),
           None
          ]
#scale = [prep.StandardScaler(),None]

MODEL = []
CATEGORICAL = []
NUMERIC = []
OUTLIER = []
SCALER = []
FEATURE_SELECTION = []

RMSE_TR = []
MAE_TR = []
R2_TR = []
MAPE_TR = []
SMAPE_TR = []

RMSE_TE = []
MAE_TE = []
R2_TE = []
MAPE_TE = []
SMAPE_TE = []

for m in mod: # loop through the models
    print(m)
    feat_select = [
              #select.SelectByShuffling(estimator = m , scoring = "r2", cv=5,random_state=0),
              #select.SelectBySingleFeaturePerformance(estimator = m , scoring = "r2", cv=5),
             select.RecursiveFeatureElimination(estimator = m , scoring = "r2", cv=5,threshold=0.01),
             select.RecursiveFeatureAddition(estimator = m , scoring = "r2", cv=5,threshold=0.01),
             #None
                  ]
    for n in num: # loop through the numeric transformations
        print(n)
        for c in cat: # loop through the categorical encoding
            print(c)
            for o in outlier:
                print(o)
                #for s in scale:
                    #print(s)
                for f in feat_select:
                    print(f)
                    pipe = Pipeline([
                            ("makePos", prep.FunctionTransformer(MakePos, kw_args={"negs": NegContinuous})),
                            ("rare", ce.RareLabelEncoder(tol=0.01, n_categories=7, max_n_categories = 6, variables= categories, replace_with='Rare')),
                            ("cat_encode",c),
                            ("num_encode",n),
                            ("outlier", o),
                            ("filter_corr", prep.FunctionTransformer(highCorr,kw_args={"keep":[],"cutOff":0.95})),
                            ("feature_select", f),
                            #("scaling",s),
                            ("model",m)
                        ],verbose=True)
                    try:
                        pipe.fit(X_train,y_train)
                        print("Fit")
                        y_tr_pred = pipe.predict(X_train)
                        y_te_pred = pipe.predict(X_test)

                        MODEL.append(m)
                        CATEGORICAL.append(c)
                        NUMERIC.append(n)
                        OUTLIER.append(o)
                        #SCALER.append(s)
                        FEATURE_SELECTION.append(f)
                        RMSE_TR.append(math.sqrt(mean_squared_error(y_train,y_tr_pred)))
                        MAE_TR.append(mean_absolute_error(y_train,y_tr_pred))
                        R2_TR.append(r2_score(y_train,y_tr_pred))
                        MAPE_TR.append(MAPE(y_train,y_tr_pred))
                        SMAPE_TR.append(SMAPE(y_train,y_tr_pred))
                        RMSE_TE.append(math.sqrt(mean_squared_error(y_test,y_te_pred)))
                        MAE_TE.append(mean_absolute_error(y_test,y_te_pred))
                        R2_TE.append(r2_score(y_test,y_te_pred))
                        MAPE_TE.append(MAPE(y_test,y_te_pred))
                        SMAPE_TE.append(SMAPE(y_test,y_te_pred))
                    except:
                        MODEL.append(m)
                        CATEGORICAL.append(c)
                        NUMERIC.append(n)
                        OUTLIER.append(o)
                        #SCALER.append(s)
                        FEATURE_SELECTION.append(f)
                        RMSE_TR.append(".")
                        MAE_TR.append(".")
                        R2_TR.append(".")
                        MAPE_TR.append(".")
                        SMAPE_TR.append(".")
                        RMSE_TE.append(".")
                        MAE_TE.append(".")
                        R2_TE.append(".")
                        MAPE_TE.append(".")
                        SMAPE_TE.append(".")



LinearRegression()
YeoJohnsonTransformer(variables=['pop_est', 'pop_density', 'total_vio',
                                 'avg_cluster_unit_sales',
                                 'adjusted_avg_cluster_sales',
                                 'avg_cluster_total_sales', 'sales_signal',
                                 'failure_sales', 'lifecycle',
                                 'adjusted_lifecycle',
                                 'adj_avg_cluster_total_sales', 'unit_sales',
                                 'projected_growth_pct',
                                 'other_unit_pls_lost_sales_py',
                                 'other_unit_pls_lost_sales',
                                 'weighted_lookup_cnt', 'qty_wt0_ppy',
                                 'ntrans_wt0_ppy', 'qty_wt0_py',
                                 'ntrans_wt0_py', 'ntrans_wt0',
                                 'unadjusted_total_vio',
                                 'vio_compared_to_cluster', 'qty_wt0',
   

In [15]:
results3 = pd.DataFrame({"MODEL":MODEL,
                       "CATEGORICAL":CATEGORICAL,
                       "NUMERIC":NUMERIC,
                        "OUTLIER":OUTLIER,
                        #"SCALER":SCALER,
                        "FEATURE_SELECTION":FEATURE_SELECTION,
                       "RMSE_TR":RMSE_TR,
                       "MAE_TR":MAE_TR,
                       "R2_TR":R2_TR,
                       "MAPE_TR":MAPE_TR,
                       "SMAPE_TR":SMAPE_TR,
                       "RMSE_TE":RMSE_TE,
                       "MAE_TE":MAE_TE,
                       "R2_TE":R2_TE,
                       "MAPE_TE":MAPE_TE,
                       "SMAPE_TE":SMAPE_TE})

In [16]:
results3

Unnamed: 0,MODEL,CATEGORICAL,NUMERIC,OUTLIER,FEATURE_SELECTION,RMSE_TR,MAE_TR,R2_TR,MAPE_TR,SMAPE_TR,RMSE_TE,MAE_TE,R2_TE,MAPE_TE,SMAPE_TE
0,LinearRegression(),"CountFrequencyEncoder(variables=['bpg', 'store...","YeoJohnsonTransformer(variables=['pop_est', 'p...","Winsorizer(tail='both',\n variables=...","RecursiveFeatureElimination(cv=5, estimator=Li...",213.333805,69.09756,0.357858,266.012896,84.084324,194.07718,68.487129,0.401734,260.469342,83.636146
1,LinearRegression(),"CountFrequencyEncoder(variables=['bpg', 'store...","YeoJohnsonTransformer(variables=['pop_est', 'p...","Winsorizer(tail='both',\n variables=...","RecursiveFeatureAddition(cv=5, estimator=Linea...",214.589194,66.131919,0.350278,254.369999,80.478846,195.44214,65.678253,0.393289,249.832034,80.029183
2,LinearRegression(),"CountFrequencyEncoder(variables=['bpg', 'store...","YeoJohnsonTransformer(variables=['pop_est', 'p...","Winsorizer(variables=['pop_est', 'pop_density'...","RecursiveFeatureElimination(cv=5, estimator=Li...",219.067884,74.57885,0.322874,297.98186,88.272291,200.365389,74.495753,0.362337,293.688644,88.052591
3,LinearRegression(),"CountFrequencyEncoder(variables=['bpg', 'store...","YeoJohnsonTransformer(variables=['pop_est', 'p...","Winsorizer(variables=['pop_est', 'pop_density'...","RecursiveFeatureAddition(cv=5, estimator=Linea...",218.046758,71.0423,0.329172,285.672029,84.286898,199.206524,70.785163,0.369692,280.282836,83.875966
4,LinearRegression(),"CountFrequencyEncoder(variables=['bpg', 'store...","YeoJohnsonTransformer(variables=['pop_est', 'p...","Winsorizer(tail='left',\n variables=...","RecursiveFeatureElimination(cv=5, estimator=Li...",205.661587,71.765056,0.403214,272.092041,87.935817,185.660075,70.886692,0.452502,265.120055,87.23572
5,LinearRegression(),"CountFrequencyEncoder(variables=['bpg', 'store...","YeoJohnsonTransformer(variables=['pop_est', 'p...","Winsorizer(tail='left',\n variables=...","RecursiveFeatureAddition(cv=5, estimator=Linea...",208.042839,72.078063,0.389315,276.327756,88.241395,188.758327,71.333312,0.434076,270.067993,87.716027
6,LinearRegression(),"CountFrequencyEncoder(variables=['bpg', 'store...","YeoJohnsonTransformer(variables=['pop_est', 'p...","Winsorizer(capping_method='iqr', tail='both',\...","RecursiveFeatureElimination(cv=5, estimator=Li...",212.391621,76.656155,0.363517,304.456783,91.611917,193.496035,76.274386,0.405311,296.313589,91.011127
7,LinearRegression(),"CountFrequencyEncoder(variables=['bpg', 'store...","YeoJohnsonTransformer(variables=['pop_est', 'p...","Winsorizer(capping_method='iqr', tail='both',\...","RecursiveFeatureAddition(cv=5, estimator=Linea...",214.3656,76.469467,0.351631,317.083079,89.834068,195.516399,76.120018,0.392828,309.972563,89.421973
8,LinearRegression(),"CountFrequencyEncoder(variables=['bpg', 'store...","YeoJohnsonTransformer(variables=['pop_est', 'p...","Winsorizer(capping_method='iqr',\n v...","RecursiveFeatureElimination(cv=5, estimator=Li...",214.636951,77.265728,0.349989,323.954089,89.927583,195.780377,76.919811,0.391187,316.755502,89.512926
9,LinearRegression(),"CountFrequencyEncoder(variables=['bpg', 'store...","YeoJohnsonTransformer(variables=['pop_est', 'p...","Winsorizer(capping_method='iqr',\n v...","RecursiveFeatureAddition(cv=5, estimator=Linea...",217.806657,75.834448,0.330648,319.313053,88.442508,199.233025,75.656892,0.369525,314.180914,88.028608


In [17]:
results3.to_csv('result8.csv')

In [77]:
cat = [#ce.OneHotEncoder(top_categories=None,drop_last=True),
       #ce.CountFrequencyEncoder(encoding_method = "count"),
      ce.MeanEncoder(variables = categories)
      ]
num = [#None,
       tran.YeoJohnsonTransformer(variables= continuous),
       #prep.FunctionTransformer(log_transform, kw_args={"variables":PosContinuous}),
       #prep.FunctionTransformer(ratios, kw_args={"variables" : continuous, "tuples" : False}),
       #prep.LogTransformer(base='10',variables=continuous)
      ]
mod = [Ridge()]
outlier = [out.Winsorizer(capping_method = "gaussian", tail = "both",fold=3,variables=continuous),
           out.Winsorizer(capping_method = "gaussian", tail = "right",fold=3,variables=continuous),
           #out.Winsorizer(capping_method = "gaussian", tail = "left",fold=3,variables=continuous),
           #out.Winsorizer(capping_method = "iqr", tail = "both",fold=3,variables=continuous),
           #out.Winsorizer(capping_method = "iqr", tail = "right",fold=3,variables=continuous),
           #out.Winsorizer(capping_method = "iqr", tail = "left",fold=3,variables=continuous),
           #out.Winsorizer(capping_method = "quantiles", tail = "both",fold=.05,variables=continuous),
           #out.Winsorizer(capping_method = "quantiles", tail = "right",fold=.05,variables=continuous),
           #out.Winsorizer(capping_method = "quantiles", tail = "left",fold=.05,variables=continuous),
           #None
          ]

scale = [True,False]

MODEL = []
CATEGORICAL = []
NUMERIC = []
OUTLIER = []
SCALER = []
FEATURE_SELECTION = []

RMSE_TR = []
MAE_TR = []
R2_TR = []
MAPE_TR = []
SMAPE_TR = []

RMSE_TE = []
MAE_TE = []
R2_TE = []
MAPE_TE = []
SMAPE_TE = []

PVAL = []
IMPORTANCE = []
FEATURES = []


for m in mod: # loop through the models
    for n in num: # loop through the numeric transformations
        for c in cat: # loop through the categorical encoding
            for o in outlier:
                X_tr = X_train.copy()
                X_te = X_test.copy()
                print(m)
                print(n)
                print(c)
                print(o)
                print(s)
                pipe = Pipeline([
                        ("makePos", prep.FunctionTransformer(MakePos, kw_args={"negs": NegContinuous})),
                        ("rare", ce.RareLabelEncoder(tol=0.01, n_categories=7, max_n_categories = 6, variables= categories, replace_with='Rare')),
                        ("cat_encode",c),
                        ("num_encode",n),
                        ("outlier", o),
                        ("filter_corr", prep.FunctionTransformer(highCorr,kw_args={"keep":[],"cutOff":0.84}))
                    ],verbose=True)
                try:
                    pipe.fit(X_tr,y_train)
                    X_tr = pipe.transform(X_tr)
                    X_te = pipe.transform(X_te)
                    names = X_te.columns
                    if s:
                            scale = prep.StandardScaler()
                            scale.fit(X_tr,y_train)
                            X_tr = scale.transform(X_tr)
                            X_te = scale.transform(X_te)
                        model = m.copy()
                        model.fit(X_tr,y_train)
                        print("Fit")
                        y_tr_pred = model.predict(X_tr)
                        y_te_pred = model.predict(X_te)
                        try:
                            importance = model.feature_importance_
                            IMPORTANCE.append(importance)
                            PVAL.append(".")
                        except:
                            pvalue_array = stats.coef_pval(model, X_test, y_test)
                            pvalue_array = np.delete(pvalue_array, 0)
                            PVAL.append(pvalue_array)
                            IMPORTANCE.append(".")

                        MODEL.append(m)
                        CATEGORICAL.append(c)
                        NUMERIC.append(n)
                        OUTLIER.append(o)
                        SCALER.append(s)
                        RMSE_TR.append(math.sqrt(mean_squared_error(y_train,y_tr_pred)))
                        MAE_TR.append(mean_absolute_error(y_train,y_tr_pred))
                        R2_TR.append(r2_score(y_train,y_tr_pred))
                        MAPE_TR.append(MAPE(y_train,y_tr_pred))
                        SMAPE_TR.append(SMAPE(y_train,y_tr_pred))
                        RMSE_TE.append(math.sqrt(mean_squared_error(y_test,y_te_pred)))
                        MAE_TE.append(mean_absolute_error(y_test,y_te_pred))
                        R2_TE.append(r2_score(y_test,y_te_pred))
                        MAPE_TE.append(MAPE(y_test,y_te_pred))
                        SMAPE_TE.append(SMAPE(y_test,y_te_pred))
                        FEATURES.append(names)

                        
                    except:
                        MODEL.append(m)
                        CATEGORICAL.append(c)
                        NUMERIC.append(n)
                        OUTLIER.append(o)
                        SCALER.append(s)
                        RMSE_TR.append(".")
                        MAE_TR.append(".")
                        R2_TR.append(".")
                        MAPE_TR.append(".")
                        SMAPE_TR.append(".")
                        RMSE_TE.append(".")
                        MAE_TE.append(".")
                        R2_TE.append(".")
                        MAPE_TE.append(".")
                        SMAPE_TE.append(".")
                        FEATURES.append(".")
                        PVAL.append(".")
                        IMPORTANCE.append(".")
                        
      

Ridge()
YeoJohnsonTransformer(variables=['pop_est', 'pop_density', 'total_vio',
                                 'avg_cluster_unit_sales',
                                 'adjusted_avg_cluster_sales',
                                 'avg_cluster_total_sales', 'sales_signal',
                                 'failure_sales', 'lifecycle',
                                 'adjusted_lifecycle',
                                 'adj_avg_cluster_total_sales', 'unit_sales',
                                 'projected_growth_pct',
                                 'other_unit_pls_lost_sales_py',
                                 'other_unit_pls_lost_sales',
                                 'weighted_lookup_cnt', 'qty_wt0_ppy',
                                 'ntrans_wt0_ppy', 'qty_wt0_py',
                                 'ntrans_wt0_py', 'ntrans_wt0',
                                 'unadjusted_total_vio',
                                 'vio_compared_to_cluster', 'qty_wt0',
              

TypeError: 'StandardScaler' object is not iterable

In [None]:

results = pd.DataFrame({"MODEL":MODEL,
                       "NUMERIC":NUMERIC,
                        "CATEGORICAL":CATEGORICAL,
                        "OUTLIER":OUTLIER,
                        "SCALER":SCALER,
                       "RMSE_TR":RMSE_TR,
                       "MAE_TR":MAE_TR,
                       "R2_TR":R2_TR,
                       "MAPE_TR":MAPE_TR,
                       "SMAPE_TR":SMAPE_TR,
                       "RMSE_TE":RMSE_TE,
                       "MAE_TE":MAE_TE,
                       "R2_TE":R2_TE,
                       "MAPE_TE":MAPE_TE,
                       "SMAPE_TE":SMAPE_TE,
                       "FEATURES":FEATURES,
                       "PVAL":PVAL,
                       "IMPORTANCE":IMPORTANCE})      

In [78]:
results = pd.DataFrame({"MODEL":MODEL,
                       "NUMERIC":NUMERIC,
                        "CATEGORICAL":CATEGORICAL,
                        "OUTLIER":OUTLIER,
                        "SCALER":SCALER,
                       "RMSE_TR":RMSE_TR,
                       "MAE_TR":MAE_TR,
                       "R2_TR":R2_TR,
                       "MAPE_TR":MAPE_TR,
                       "SMAPE_TR":SMAPE_TR,
                       "RMSE_TE":RMSE_TE,
                       "MAE_TE":MAE_TE,
                       "R2_TE":R2_TE,
                       "MAPE_TE":MAPE_TE,
                       "SMAPE_TE":SMAPE_TE,
                       "FEATURES":FEATURES,
                       "PVAL":PVAL,
                       "IMPORTANCE":IMPORTANCE})  

In [79]:
results

Unnamed: 0,MODEL,NUMERIC,CATEGORICAL,OUTLIER,SCALER,RMSE_TR,MAE_TR,R2_TR,MAPE_TR,SMAPE_TR,RMSE_TE,MAE_TE,R2_TE,MAPE_TE,SMAPE_TE,FEATURES,PVAL,IMPORTANCE
0,Ridge(),"YeoJohnsonTransformer(variables=['pop_est', 'p...","MeanEncoder(variables=['bpg', 'store_number', ...","Winsorizer(tail='both',\n variables=...",True,.,.,.,.,.,.,.,.,.,.,.,.,.
1,Ridge(),"YeoJohnsonTransformer(variables=['pop_est', 'p...","MeanEncoder(variables=['bpg', 'store_number', ...","Winsorizer(tail='both',\n variables=...",False,.,.,.,.,.,.,.,.,.,.,.,.,.


In [21]:
cat = [#ce.OneHotEncoder(top_categories=None,drop_last=True),
       #ce.CountFrequencyEncoder(encoding_method = "count"),
      ce.MeanEncoder(variables = categories)
      ]
num = [#None,
       tran.YeoJohnsonTransformer(variables= continuous),
       #prep.FunctionTransformer(log_transform, kw_args={"variables":PosContinuous}),
       #prep.FunctionTransformer(ratios, kw_args={"variables" : continuous, "tuples" : False}),
       #prep.LogTransformer(base='10',variables=continuous)
      ]
mod = [Ridge()]
outlier = [out.Winsorizer(capping_method = "gaussian", tail = "both",fold=3,variables=continuous),
           out.Winsorizer(capping_method = "gaussian", tail = "right",fold=3,variables=continuous),
           #out.Winsorizer(capping_method = "gaussian", tail = "left",fold=3,variables=continuous),
           #out.Winsorizer(capping_method = "iqr", tail = "both",fold=3,variables=continuous),
           #out.Winsorizer(capping_method = "iqr", tail = "right",fold=3,variables=continuous),
           #out.Winsorizer(capping_method = "iqr", tail = "left",fold=3,variables=continuous),
           #out.Winsorizer(capping_method = "quantiles", tail = "both",fold=.05,variables=continuous),
           #out.Winsorizer(capping_method = "quantiles", tail = "right",fold=.05,variables=continuous),
           #out.Winsorizer(capping_method = "quantiles", tail = "left",fold=.05,variables=continuous),
           #None
          ]

scale = [True,False]

MODEL = []
CATEGORICAL = []
NUMERIC = []
OUTLIER = []
SCALER = []
FEATURE_SELECTION = []

RMSE_TR = []
MAE_TR = []
R2_TR = []
MAPE_TR = []
SMAPE_TR = []

RMSE_TE = []
MAE_TE = []
R2_TE = []
MAPE_TE = []
SMAPE_TE = []

PVAL = []
IMPORTANCE = []
FEATURES = []


for m in mod: # loop through the models
    for n in num: # loop through the numeric transformations
        for c in cat: # loop through the categorical encoding
            for o in outlier:
                for s in scale:
                    X_tr = X_train.copy()
                    X_te = X_test.copy()
                    print(m)
                    print(n)
                    print(c)
                    print(o)
                    print(s)
                    pipe = Pipeline([
                        ("makePos", prep.FunctionTransformer(MakePos, kw_args={"negs": NegContinuous})),
                        ("rare", ce.RareLabelEncoder(tol=0.01, n_categories=7, max_n_categories = 6, variables= categories, replace_with='Rare')),
                        ("cat_encode",c),
                        ("num_encode",n),
                        ("outlier", o),
                        ("filter_corr", prep.FunctionTransformer(highCorr,kw_args={"keep":[],"cutOff":0.84}))
                    ],verbose=True)
                    try:
                        pipe.fit(X_tr,y_train)
                        X_tr = pipe.transform(X_tr)
                        X_te = pipe.transform(X_te)
                        names = X_te.columns
                        if s:
                            scale = prep.StandardScaler()
                            scale.fit(X_tr,y_train)
                            X_tr = scale.transform(X_tr)
                            X_te = scale.transform(X_te)
                        model = m.copy()
                        model.fit(X_tr,y_train)
                        print("Fit")
                        y_tr_pred = model.predict(X_tr)
                        y_te_pred = model.predict(X_te)
                        try:
                            importance = model.feature_importance_
                            IMPORTANCE.append(importance)
                            PVAL.append(".")
                        except:
                            pvalue_array = stats.coef_pval(model, X_test, y_test)
                            pvalue_array = np.delete(pvalue_array, 0)
                            PVAL.append(pvalue_array)
                            IMPORTANCE.append(".")

                        MODEL.append(m)
                        CATEGORICAL.append(c)
                        NUMERIC.append(n)
                        OUTLIER.append(o)
                        SCALER.append(s)
                        RMSE_TR.append(math.sqrt(mean_squared_error(y_train,y_tr_pred)))
                        MAE_TR.append(mean_absolute_error(y_train,y_tr_pred))
                        R2_TR.append(r2_score(y_train,y_tr_pred))
                        MAPE_TR.append(MAPE(y_train,y_tr_pred))
                        SMAPE_TR.append(SMAPE(y_train,y_tr_pred))
                        RMSE_TE.append(math.sqrt(mean_squared_error(y_test,y_te_pred)))
                        MAE_TE.append(mean_absolute_error(y_test,y_te_pred))
                        R2_TE.append(r2_score(y_test,y_te_pred))
                        MAPE_TE.append(MAPE(y_test,y_te_pred))
                        SMAPE_TE.append(SMAPE(y_test,y_te_pred))
                        FEATURES.append(names)

                        
                    except:
                        MODEL.append(m)
                        CATEGORICAL.append(c)
                        NUMERIC.append(n)
                        OUTLIER.append(o)
                        SCALER.append(s)
                        RMSE_TR.append(".")
                        MAE_TR.append(".")
                        R2_TR.append(".")
                        MAPE_TR.append(".")
                        SMAPE_TR.append(".")
                        RMSE_TE.append(".")
                        MAE_TE.append(".")
                        R2_TE.append(".")
                        MAPE_TE.append(".")
                        SMAPE_TE.append(".")
                        FEATURES.append(".")
                        PVAL.append(".")
                        IMPORTANCE.append(".")
                        

results = pd.DataFrame({"MODEL":MODEL,
                       "NUMERIC":NUMERIC,
                        "CATEGORICAL":CATEGORICAL,
                        "OUTLIER":OUTLIER,
                        "SCALER":SCALER,
                       "RMSE_TR":RMSE_TR,
                       "MAE_TR":MAE_TR,
                       "R2_TR":R2_TR,
                       "MAPE_TR":MAPE_TR,
                       "SMAPE_TR":SMAPE_TR,
                       "RMSE_TE":RMSE_TE,
                       "MAE_TE":MAE_TE,
                       "R2_TE":R2_TE,
                       "MAPE_TE":MAPE_TE,
                       "SMAPE_TE":SMAPE_TE,
                       "FEATURES":FEATURES,
                       "PVAL":PVAL,
                       "IMPORTANCE":IMPORTANCE})            

Ridge()
YeoJohnsonTransformer(variables=['pop_est', 'pop_density', 'total_vio',
                                 'avg_cluster_unit_sales',
                                 'adjusted_avg_cluster_sales',
                                 'avg_cluster_total_sales', 'sales_signal',
                                 'failure_sales', 'lifecycle',
                                 'adjusted_lifecycle',
                                 'adj_avg_cluster_total_sales', 'unit_sales',
                                 'projected_growth_pct',
                                 'other_unit_pls_lost_sales_py',
                                 'other_unit_pls_lost_sales',
                                 'weighted_lookup_cnt', 'qty_wt0_ppy',
                                 'ntrans_wt0_ppy', 'qty_wt0_py',
                                 'ntrans_wt0_py', 'ntrans_wt0',
                                 'unadjusted_total_vio',
                                 'vio_compared_to_cluster', 'qty_wt0',
              

TypeError: 'StandardScaler' object is not iterable

In [14]:
from sklearn.linear_model import LassoLars
from sklearn.linear_model import BayesianRidge

In [30]:
cat = [#ce.OneHotEncoder(top_categories=None,drop_last=True),
       ce.CountFrequencyEncoder(encoding_method = "count"),
      ce.MeanEncoder(variables = categories)
      ]
num = [#None,
       #tran.YeoJohnsonTransformer(variables= continuous),
       #prep.FunctionTransformer(log_transform, kw_args={"variables":PosContinuous}),
       prep.FunctionTransformer(ratios, kw_args={"variables" : continuous, "tuples" : False}),
       #prep.LogTransformer(base='10',variables=continuous)
      ]
mod = [#Ridge(), 
       #LinearRegression(),
       #LassoLars(alpha=0.1),
       #BayesianRidge(),
       GradientBoostingRegressor(random_state=0),
      ]
outlier = [out.Winsorizer(capping_method = "gaussian", tail = "both",fold=3,variables=continuous),
           out.Winsorizer(capping_method = "gaussian", tail = "right",fold=3,variables=continuous),
           out.Winsorizer(capping_method = "gaussian", tail = "left",fold=3,variables=continuous),
           out.Winsorizer(capping_method = "iqr", tail = "both",fold=3,variables=continuous),
           out.Winsorizer(capping_method = "iqr", tail = "right",fold=3,variables=continuous),
           out.Winsorizer(capping_method = "iqr", tail = "left",fold=3,variables=continuous),
           out.Winsorizer(capping_method = "quantiles", tail = "both",fold=.05,variables=continuous),
           out.Winsorizer(capping_method = "quantiles", tail = "right",fold=.05,variables=continuous),
           out.Winsorizer(capping_method = "quantiles", tail = "left",fold=.05,variables=continuous),
           None
          ]

scale = [True]

d = pd.DataFrame()
TYPE = []
MODEL = []
CATEGORICAL = []
NUMERIC = []
OUTLIER = []
SCALER = []
FEATURE_SELECTION = []

RMSE_TR = []
MAE_TR = []
R2_TR = []
MAPE_TR = []
SMAPE_TR = []

RMSE_TE = []
MAE_TE = []
R2_TE = []
MAPE_TE = []
SMAPE_TE = []

PVAL = []
IMPORTANCE = []
FEATURES = []
TIME = []
ID = []
i=1

for m in mod: # loop through the models
    for n in num: # loop through the numeric transformations
        for c in cat: # loop through the categorical encoding
            for o in outlier:
                for s in [True]:
                    id = "G"+str(i)
                    time_start = time.perf_counter()
                    X_tr = X_train.copy()
                    X_te = X_test.copy()
                    print(m)
                    print(n)
                    print(c)
                    print(o)
                    #print(s)
                    pipe = Pipeline([
                            ("makePos", prep.FunctionTransformer(MakePos, kw_args={"negs": NegContinuous})),
                            ("rare", ce.RareLabelEncoder(tol=0.01, n_categories=7, max_n_categories = 6, variables= categories, replace_with='Rare')),
                            ("cat_encode",c),
                            ("num_encode",n),
                            ("outlier", o),
                            ("filter_corr", select.DropCorrelatedFeatures(threshold=0.84))
                        ],verbose=True)
                        #try:
                    pipe.fit(X_tr,y_train)
                    X_tr = pipe.transform(X_tr)
                    X_te = pipe.transform(X_te)
                    names = X_te.columns
                    if s:
                        scale = prep.StandardScaler()
                        scale.fit(X_tr,y_train)
                        X_tr = scale.transform(X_tr)
                        X_te = scale.transform(X_te)
                    model = m
                    model.fit(X_tr,y_train)
                    print("Fit")
                    y_tr_pred = model.predict(X_tr)
                    y_te_pred = model.predict(X_te)
                    try:
                        importance = model.feature_importances_
                        TYPE.append("Tree")
                        importance = np.append(importance,id)
                        names = np.append(names,"ID")
                        d2 = pd.DataFrame([importance],columns = names)
                        d=pd.concat([d,d2]) 
                    except:
                        try:
                            importance = stats.coef_pval(model, X_tr, y_train)
                            importance = np.delete(importance, 0)
                            importance = np.append(importance,id)
                            names = np.append(names,"ID")
                            TYPE.append("Linear")
                            d2 = pd.DataFrame([importance],columns = names)
                            d=pd.concat([d,d2]) 
                        except:
                            TYPE.append("Not Sure")

                    time_stop = time.perf_counter()
                    MODEL.append(m)
                    CATEGORICAL.append(c)
                    NUMERIC.append(n)
                    OUTLIER.append(o)
                    #SCALER.append(s)
                    RMSE_TR.append(math.sqrt(mean_squared_error(y_train,y_tr_pred)))
                    MAE_TR.append(mean_absolute_error(y_train,y_tr_pred))
                    R2_TR.append(r2_score(y_train,y_tr_pred))
                    MAPE_TR.append(MAPE(y_train,y_tr_pred))
                    SMAPE_TR.append(SMAPE(y_train,y_tr_pred))
                    RMSE_TE.append(math.sqrt(mean_squared_error(y_test,y_te_pred)))
                    MAE_TE.append(mean_absolute_error(y_test,y_te_pred))
                    R2_TE.append(r2_score(y_test,y_te_pred))
                    MAPE_TE.append(MAPE(y_test,y_te_pred))
                    SMAPE_TE.append(SMAPE(y_test,y_te_pred))
                    FEATURES.append(names)
                    TIME.append((time_stop-time_start)/60)
                    ID.append(id)
                    i+=1
                    print((time_stop-time_start)/60)
                        
#                     except:
#                         time_stop = time.perf_counter()
#                         MODEL.append(m)
#                         CATEGORICAL.append(c)
#                         NUMERIC.append(n)
#                         OUTLIER.append(o)
#                         SCALER.append(s)
#                         RMSE_TR.append(".")
#                         MAE_TR.append(".")
#                         R2_TR.append(".")
#                         MAPE_TR.append(".")
#                         RMSE_TE.append(".")
#                         MAE_TE.append(".")
#                         R2_TE.append(".")
#                         MAPE_TE.append(".")
#                         TIME.append((time_stop-time_start)/60)
#                         ID.append(id)
#                         TYPE.append(".")
#                         i+=1

GradientBoostingRegressor(random_state=0)
FunctionTransformer(func=<function ratios at 0x00000293189A8D30>,
                    kw_args={'tuples': False,
                             'variables': ['pop_est', 'pop_density',
                                           'total_vio',
                                           'avg_cluster_unit_sales',
                                           'adjusted_avg_cluster_sales',
                                           'avg_cluster_total_sales',
                                           'sales_signal', 'failure_sales',
                                           'lifecycle', 'adjusted_lifecycle',
                                           'adj_avg_cluster_total_sales',
                                           'unit_sales', 'projected_growth_pct',
                                           'other_unit_pls_lost_sales_py',
                                           'other_unit_pls_lost_sales',
                                           'weighted_

In [31]:
results = pd.DataFrame({"ID":ID,
                        "TYPE":TYPE,
                       "TIME":TIME,
                        "MODEL":MODEL,
                       "NUMERIC":NUMERIC,
                        "CATEGORICAL":CATEGORICAL,
                        "OUTLIER":OUTLIER,
                        #"SCALER":SCALER,
                       "RMSE_TR":RMSE_TR,
                       "MAE_TR":MAE_TR,
                       "R2_TR":R2_TR,
                       "MAPE_TR":MAPE_TR,
                       "SMAPE_TR":SMAPE_TR,
                       "RMSE_TE":RMSE_TE,
                       "MAE_TE":MAE_TE,
                       "R2_TE":R2_TE,
                       "MAPE_TE":MAPE_TE,
                       "SMAPE_TE":SMAPE_TE,
                       #"P_value":PVAL,
                       #"IMPORTANCE":IMPORTANCE,
                       #"FEATURES":FEATURES
                       })

In [18]:
len(SCALER)

0

In [32]:
results

Unnamed: 0,ID,TYPE,TIME,MODEL,NUMERIC,CATEGORICAL,OUTLIER,RMSE_TR,MAE_TR,R2_TR,MAPE_TR,SMAPE_TR,RMSE_TE,MAE_TE,R2_TE,MAPE_TE,SMAPE_TE
0,G1,Tree,44.932657,([DecisionTreeRegressor(criterion='friedman_ms...,FunctionTransformer(func=<function ratios at 0...,"CountFrequencyEncoder(variables=['bpg', 'store...","Winsorizer(tail='both',\n variables=...",16.572329,6.101647,0.996125,13.933462,11.722541,23.122964,6.4574,0.991508,13.84878,11.660515
1,G2,Tree,41.738982,([DecisionTreeRegressor(criterion='friedman_ms...,FunctionTransformer(func=<function ratios at 0...,"CountFrequencyEncoder(variables=['bpg', 'store...","Winsorizer(variables=['pop_est', 'pop_density'...",16.572329,6.101647,0.996125,13.933462,11.722541,23.025225,6.452838,0.991579,13.848675,11.6604
2,G3,Tree,41.514752,([DecisionTreeRegressor(criterion='friedman_ms...,FunctionTransformer(func=<function ratios at 0...,"CountFrequencyEncoder(variables=['bpg', 'store...","Winsorizer(tail='left',\n variables=...",18.534062,7.010501,0.995153,17.525006,14.913661,27.621946,7.351739,0.987881,17.416507,14.872277
3,G4,Tree,42.174438,([DecisionTreeRegressor(criterion='friedman_ms...,FunctionTransformer(func=<function ratios at 0...,"CountFrequencyEncoder(variables=['bpg', 'store...","Winsorizer(capping_method='iqr', tail='both',\...",16.914867,6.272864,0.995963,14.259008,12.079943,24.860569,6.691039,0.990183,14.208052,12.020935
4,G5,Tree,42.299867,([DecisionTreeRegressor(criterion='friedman_ms...,FunctionTransformer(func=<function ratios at 0...,"CountFrequencyEncoder(variables=['bpg', 'store...","Winsorizer(capping_method='iqr',\n v...",16.914867,6.272864,0.995963,14.259008,12.079943,25.493483,6.701166,0.989677,14.207684,12.021054
5,G6,Tree,41.637657,([DecisionTreeRegressor(criterion='friedman_ms...,FunctionTransformer(func=<function ratios at 0...,"CountFrequencyEncoder(variables=['bpg', 'store...","Winsorizer(capping_method='iqr', tail='left',\...",18.534062,7.010501,0.995153,17.525006,14.913661,27.546615,7.350187,0.987947,17.416532,14.872286
6,G7,Tree,42.717463,([DecisionTreeRegressor(criterion='friedman_ms...,FunctionTransformer(func=<function ratios at 0...,"CountFrequencyEncoder(variables=['bpg', 'store...","Winsorizer(capping_method='quantiles', fold=0....",16.282727,6.045598,0.996259,14.011106,11.747227,22.997536,6.428045,0.991599,13.960318,11.69685
7,G8,Tree,43.191722,([DecisionTreeRegressor(criterion='friedman_ms...,FunctionTransformer(func=<function ratios at 0...,"CountFrequencyEncoder(variables=['bpg', 'store...","Winsorizer(capping_method='quantiles', fold=0....",16.246071,6.061748,0.996276,14.214878,11.767608,22.660778,6.434329,0.991844,14.168637,11.719842
8,G9,Tree,42.352251,([DecisionTreeRegressor(criterion='friedman_ms...,FunctionTransformer(func=<function ratios at 0...,"CountFrequencyEncoder(variables=['bpg', 'store...","Winsorizer(capping_method='quantiles', fold=0....",18.257186,7.032869,0.995297,16.967901,14.571964,27.10372,7.374269,0.988332,16.869672,14.527226
9,G10,Tree,41.42785,([DecisionTreeRegressor(criterion='friedman_ms...,FunctionTransformer(func=<function ratios at 0...,"CountFrequencyEncoder(variables=['bpg', 'store...",,18.534062,7.010501,0.995153,17.525006,14.913661,27.567561,7.354435,0.987929,17.41706,14.872483


In [33]:
final=pd.merge(results,d,how = "left",on="ID")

In [34]:
final

Unnamed: 0,ID,TYPE,TIME,MODEL,NUMERIC,CATEGORICAL,OUTLIER,RMSE_TR,MAE_TR,R2_TR,...,pct_white/total_vio,adjusted_avg_cluster_sales/projected_growth_pct,other_unit_pls_lost_sales_py/projected_growth_pct,pct_blue_collar/ntrans_wt0_ppy,road_quality_index/ntrans_wt0_ppy,road_quality_index/ntrans_wt0,establishments/vio_compared_to_cluster,road_quality_index/vio_compared_to_cluster,age/pct_college,other_unit_pls_lost_sales/projected_growth_pct
0,G1,Tree,44.932657,([DecisionTreeRegressor(criterion='friedman_ms...,FunctionTransformer(func=<function ratios at 0...,"CountFrequencyEncoder(variables=['bpg', 'store...","Winsorizer(tail='both',\n variables=...",16.572329,6.101647,0.996125,...,,,,,,,,,,
1,G2,Tree,41.738982,([DecisionTreeRegressor(criterion='friedman_ms...,FunctionTransformer(func=<function ratios at 0...,"CountFrequencyEncoder(variables=['bpg', 'store...","Winsorizer(variables=['pop_est', 'pop_density'...",16.572329,6.101647,0.996125,...,,,,,,,,,,
2,G3,Tree,41.514752,([DecisionTreeRegressor(criterion='friedman_ms...,FunctionTransformer(func=<function ratios at 0...,"CountFrequencyEncoder(variables=['bpg', 'store...","Winsorizer(tail='left',\n variables=...",18.534062,7.010501,0.995153,...,,,,,,,,,,
3,G4,Tree,42.174438,([DecisionTreeRegressor(criterion='friedman_ms...,FunctionTransformer(func=<function ratios at 0...,"CountFrequencyEncoder(variables=['bpg', 'store...","Winsorizer(capping_method='iqr', tail='both',\...",16.914867,6.272864,0.995963,...,0.0,0.0009461892646678,0.0,6.680596695723603e-06,,,,,,
4,G5,Tree,42.299867,([DecisionTreeRegressor(criterion='friedman_ms...,FunctionTransformer(func=<function ratios at 0...,"CountFrequencyEncoder(variables=['bpg', 'store...","Winsorizer(capping_method='iqr',\n v...",16.914867,6.272864,0.995963,...,0.0,0.0009456776253194,0.0,6.680596695723825e-06,,,,,,
5,G6,Tree,41.637657,([DecisionTreeRegressor(criterion='friedman_ms...,FunctionTransformer(func=<function ratios at 0...,"CountFrequencyEncoder(variables=['bpg', 'store...","Winsorizer(capping_method='iqr', tail='left',\...",18.534062,7.010501,0.995153,...,,,,,,,,,,
6,G7,Tree,42.717463,([DecisionTreeRegressor(criterion='friedman_ms...,FunctionTransformer(func=<function ratios at 0...,"CountFrequencyEncoder(variables=['bpg', 'store...","Winsorizer(capping_method='quantiles', fold=0....",16.282727,6.045598,0.996259,...,,0.0007147279172536,,,0.0,0.0,0.0,0.0,0.0,
7,G8,Tree,43.191722,([DecisionTreeRegressor(criterion='friedman_ms...,FunctionTransformer(func=<function ratios at 0...,"CountFrequencyEncoder(variables=['bpg', 'store...","Winsorizer(capping_method='quantiles', fold=0....",16.246071,6.061748,0.996276,...,,0.0007147158478485,,,0.0,0.0,0.0,0.0,,
8,G9,Tree,42.352251,([DecisionTreeRegressor(criterion='friedman_ms...,FunctionTransformer(func=<function ratios at 0...,"CountFrequencyEncoder(variables=['bpg', 'store...","Winsorizer(capping_method='quantiles', fold=0....",18.257186,7.032869,0.995297,...,,,,,,,,,6.953912885906334e-06,
9,G10,Tree,41.42785,([DecisionTreeRegressor(criterion='friedman_ms...,FunctionTransformer(func=<function ratios at 0...,"CountFrequencyEncoder(variables=['bpg', 'store...",,18.534062,7.010501,0.995153,...,,,,,,,,,,


In [35]:
final.to_csv('final_8.csv')