In [2]:
# All packages to import
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import ensemble

from feature_engine import encoding as ce
from feature_engine import transformation as tran
from feature_engine import outliers as out
from feature_engine import selection as select

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing as prep
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import PoissonRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.decomposition import PCA


from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from regressors import stats
import math
import time

In [8]:
pip install regressors

Collecting regressors
  Downloading regressors-0.0.3.tar.gz (24 kB)
Building wheels for collected packages: regressors
  Building wheel for regressors (setup.py): started
  Building wheel for regressors (setup.py): finished with status 'done'
  Created wheel for regressors: filename=regressors-0.0.3-py2.py3-none-any.whl size=12377 sha256=d2484e52ca50fabcc6fee5dbab6c1fbd82cdbde19d44427ec225c06afa60aa67
  Stored in directory: c:\users\asd25\appdata\local\pip\cache\wheels\46\c3\e9\d1797db3a01a1f101ad3cbcdf01b1be7273a25c78c77dd4bb9
Successfully built regressors
Installing collected packages: regressors
Successfully installed regressors-0.0.3
Note: you may need to restart the kernel to use updated packages.


In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
df = pd.DataFrame()
startPath = 'C://Users//asd25/'
bpg = ['AIR FILTERS','BATTERIES','EXHAUST','GASKETS','MOTOR OIL','WIPERS']
table = 'bottoms_up_gt'
for b in bpg:
    dff = pd.read_csv(startPath+b+' '+table+'.csv')
    #s= dff.sample(frac=0.3) 
    df = pd.concat([df,dff])
    print(b, df.shape)

AIR FILTERS (151746, 55)
BATTERIES (389928, 55)
EXHAUST (398057, 55)
GASKETS (486837, 55)
MOTOR OIL (1162184, 55)
WIPERS (1567803, 55)


In [5]:
#stratified - 2
df=df.sample(frac=0.2, weights='qty_sold_cy',random_state=1).reset_index(drop=True)

In [6]:
changeBU = {'sku_store_pdq':'object',
             'store_number':'object','sku_number':'object',
             'part_type':'object','mpog_id':'object',}
df = df.astype(changeBU)

In [7]:
# Remove columns with "cy"
'''If you are using bottoms_up, make remove = []'''
def RemoveCY(df,keep=["qty_sold_cy"]): # keep is the variable that have cy, but you want to keep
    remove=[] # cat is also a cy variable
    for col in df.columns:
        if (col.find('_cy')>=0 or col.find('cy_')>=0) and col not in keep:
            remove.append(col)
    return df.drop(columns=remove)
# prep.FunctionTransformer(RemoveCY,kw_args={"keep":keep})

#Remove non-unique columns
def dropSingles(df):
    drops = []
    for col in df.columns:
        if len(df[col].unique())==1:
            drops.append(col)
    df = df.drop(columns=drops)
    return df
# prep.FunctionTransformer(dropSingles)

df = RemoveCY(dropSingles(df))

In [8]:
target = df['qty_sold_cy']
predictors = df.drop(columns = ['qty_sold_cy','qty_sold_py','sku_store_pdq', 'qty_sold', 'filter_reason', 'platform_cluster_name', 'part_type'])
X_train, X_test, y_train, y_test = train_test_split(
    predictors, # predictors
    target,  # target
    test_size=0.3,  # percentage of obs in test set
    random_state=0)  # seed to ensure reproducibility

In [9]:
categories = []
discrete = []
PosContinuous = []
NegContinuous = ["unit_sales", "other_unit_pls_lost_sales", 'adjusted_avg_cluster_sales',
                "other_unit_pls_lost_sales_py", "avg_cluster_unit_sales", 
                "ntrans_wt0_py", "ntrans_wt0_ppy", "weighted_lookup_cnt",
                "avg_cluster_total_sales", "adj_avg_cluster_total_sales"] # These will change
continuous = []
for col in X_train.columns:
    if col in NegContinuous:
        continuous.append(col)
    elif X_train[col].dtype.name =="object":
        categories.append(col)
    elif X_train[col].dtype.name == "int64":
        discrete.append(col)
    else:
        PosContinuous.append(col)
        continuous.append(col)

In [10]:
# Remove high correlation
def highCorr(df,keep,cutOff):
    '''df is the dataframe
        keep is a list of variables to not include in removing correlation
            probably a single target variable, but still put in list
        cutOff is the correlation cut off to remove'''
    df = df.drop(columns=keep)
    corr = df.corr()
    variables = corr.columns
    correlated_features = set()
    for r in range(len(variables)):
        for c in range(r):
            if abs(corr.iloc[r,c])>cutOff:
                colname = variables[r]
                correlated_features.add(colname)
    return df.drop(columns = correlated_features)
# prep.FunctionTransformer(highCorr,kw_args={"keep":keep,"cutOff":0.95})

# Less than zero
def MakePos(df,negs = None): #negs is the list of numeric columns to retain negative values
    for col in df. columns:
        if df[col].dtype.name =="category" or df[col].dtype.name =="object":
            continue
        if col in negs:
            continue
        if np.min(df[col])<0:
            d = df[col].copy()
            d = np.where(d < 0,0,d)
            df[col]=d
    return df
# prep.FunctionTransformer(MakePos, kw_args={"negs": NegContinuous})


In [11]:
def log_transform(X,variables): #Works, only pass variables >=0
    result = X.copy()
    for col in result.columns:
        if col in variables:
            result[col] = np.log(result[col]+1)
    return result
# prep.FunctionTransformer(log_transform, kw_args={"variables":var})

def ratios(X, variables, tuples = False): # Works
    '''the variables can either be a list of variables or a list of
    pairs of variables (numerator,denominator): set tuples = True.
    If tuples=False, then every variable in the list will be a numerator
    or denominator with every other variable. tuples=True is for when
    you have specific pairings you want to use.'''
    result = X.copy()
    if tuples:
        for n,d in variables:
            denom = np.where(result[d]==0,0.001,result[d])
            result[n+"/"+d] = result[n]/denom
    else:
        for d in variables:
            denom = np.where(X[d]==0,0.001,X[d])
            for n in variables:
                if n!=d:
                    result[n+"/"+d] = result[n]/denom
    return result
# prep.FunctionTransformer(ratios, kw_args={"variables" : var, "tuples" : False})

In [12]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
def MAPE(y_true,y_pred):
    y_true = np.where(y_true==0,0.001,y_true)
    return mean_absolute_percentage_error(y_true,y_pred)
def SMAPE(A, F):
    return 100/len(A) * np.sum(2 * np.abs(F - A) / (np.abs(A) + np.abs(F)))

In [13]:
#naive forecast

y_actual= df.qty_sold_cy # replace with y_train or y_test as needed
y_prediction = df.qty_sold_py # replace with y_pred_tr or y_pred_te as needed

print("RMSE:",math.sqrt(mean_squared_error(y_actual,y_prediction)))
#print("MAPE:",mean_absolute_percentage_error(y_actual,y_prediction))
print("MAE:",mean_absolute_error(y_actual, y_prediction))
print("R^2:",r2_score(y_actual, y_prediction))

RMSE: 215.77513124837398
MAE: 52.859421292826596
R^2: 0.32030268127507444


In [21]:
#Pipeline Test

#Some notes about pipes. They can take a variable that mapes to a function
# To skip a step, put None for the function.
# If an arguement that I made requires a list, but you don't want one, put []
# If you want to do all variables removing in the pipeline, it must happen
# after any of the general variable lists are called. (category,NegContiuous,etc.)
mod = Ridge()
pipe = Pipeline([
    ("makePos", prep.FunctionTransformer(MakePos, kw_args={"negs": NegContinuous})),
    ("rare", ce.RareLabelEncoder(tol=0.01,n_categories=7,variables=categories)),
    ("cat_encode",ce.CountFrequencyEncoder(encoding_method = "count")),
    ("num_encode",prep.FunctionTransformer(log_transform, kw_args={"variables":PosContinuous})),
    ("outlier", out.Winsorizer(capping_method = "iqr", tail = "left",fold=3,variables= continuous)),
    #("scaling",prep.StandardScaler()),
    #("nzv", select.DropConstantFeatures(tol=0.95)),
    ("filter_corr", select.DropCorrelatedFeatures(threshold=0.95)),
    #("drop_dup", select.DropDuplicateFeatures()),
    #("feature_select", select.SelectByShuffling(estimator = Ridge() , scoring = "r2", cv=5,random_state=0)),
    ("model",mod)
],verbose=True)

pipe.fit(X_train,y_train)

[Pipeline] ........... (step 1 of 7) Processing makePos, total=   0.1s
[Pipeline] .............. (step 2 of 7) Processing rare, total=   0.6s
[Pipeline] ........ (step 3 of 7) Processing cat_encode, total=   0.4s
[Pipeline] ........ (step 4 of 7) Processing num_encode, total=   0.2s
[Pipeline] ........... (step 5 of 7) Processing outlier, total=   0.9s
[Pipeline] ....... (step 6 of 7) Processing filter_corr, total=   1.3s
[Pipeline] ............. (step 7 of 7) Processing model, total=   0.2s


Pipeline(steps=[('makePos',
                 FunctionTransformer(func=<function MakePos at 0x00000236B862B5E0>,
                                     kw_args={'negs': ['unit_sales',
                                                       'other_unit_pls_lost_sales',
                                                       'adjusted_avg_cluster_sales',
                                                       'other_unit_pls_lost_sales_py',
                                                       'avg_cluster_unit_sales',
                                                       'ntrans_wt0_py',
                                                       'ntrans_wt0_ppy',
                                                       'weighted_lookup_cnt',
                                                       'avg_cluster_total_sales',
                                                       'adj_avg_cluster_total_sales']})),
                ('...
                                                   'adj_avg_clust

In [22]:
y_tr_pred = pipe.predict(X_train)
y_te_pred = pipe.predict(X_test)


print("RMSE TR:",math.sqrt(mean_squared_error(y_train,y_tr_pred)))
print("MAE TR:",mean_absolute_error(y_train,y_tr_pred))
print("R2 TR:",r2_score(y_train,y_tr_pred))
print("MAPE TR:",MAPE(y_train,y_tr_pred))
print("RMSE TE:",math.sqrt(mean_squared_error(y_test,y_te_pred)))
print("MAE TE:",mean_absolute_error(y_test,y_te_pred))
print("R2 TE:",r2_score(y_test,y_te_pred))
print("MAPE TE:",MAPE(y_test,y_te_pred))


#importance = model.feature_importance_
#IMPORTANCE.append(importance)
      
#pvalue_array = stats.coef_pval(model, X_test, y_test)
#pvalue_array = np.delete(pvalue_array, 0)




RMSE TR: 198.06102934984654
MAE TR: 66.27519854645352
R2 TR: 0.44650961839490144
MAPE TR: 275.63872849647663
RMSE TE: 178.34528955704891
MAE TE: 65.75975759495364
R2 TE: 0.4947936570886655
MAPE TE: 270.4061352004563


In [15]:
from sklearn.linear_model import LassoLars
from sklearn.linear_model import BayesianRidge

In [56]:
cat = [ce.OneHotEncoder(top_categories=None,drop_last=True),
       ce.CountFrequencyEncoder(encoding_method = "count"),
      ce.MeanEncoder(variables = categories)
      ]
num = [None,
       tran.YeoJohnsonTransformer(variables= continuous),
       prep.FunctionTransformer(log_transform, kw_args={"variables":PosContinuous}),
       prep.FunctionTransformer(ratios, kw_args={"variables" : continuous, "tuples" : False}),
      ]
mod = [Ridge(), 
       LinearRegression(),
       LassoLars(alpha=0.1),
       BayesianRidge(),
       #GradientBoostingRegressor(random_state=0),
      ]
outlier = [out.Winsorizer(capping_method = "gaussian", tail = "both",fold=3,variables=continuous),
           out.Winsorizer(capping_method = "gaussian", tail = "right",fold=3,variables=continuous),
           out.Winsorizer(capping_method = "gaussian", tail = "left",fold=3,variables=continuous),
           out.Winsorizer(capping_method = "iqr", tail = "both",fold=3,variables=continuous),
           out.Winsorizer(capping_method = "iqr", tail = "right",fold=3,variables=continuous),
           out.Winsorizer(capping_method = "iqr", tail = "left",fold=3,variables=continuous),
           out.Winsorizer(capping_method = "quantiles", tail = "both",fold=.05,variables=continuous),
           out.Winsorizer(capping_method = "quantiles", tail = "right",fold=.05,variables=continuous),
           out.Winsorizer(capping_method = "quantiles", tail = "left",fold=.05,variables=continuous),
           None
          ]

scale = [True,False]

d = pd.DataFrame()
TYPE = []
MODEL = []
CATEGORICAL = []
NUMERIC = []
OUTLIER = []
SCALER = []
FEATURE_SELECTION = []

RMSE_TR = []
MAE_TR = []
R2_TR = []
MAPE_TR = []
SMAPE_TR = []

RMSE_TE = []
MAE_TE = []
R2_TE = []
MAPE_TE = []
SMAPE_TE = []

PVAL = []
IMPORTANCE = []
FEATURES = []
TIME = []
ID = []
i=1

for m in mod: # loop through the models
    for n in num: # loop through the numeric transformations
        for c in cat: # loop through the categorical encoding
            for o in outlier:
                for s in [True,False]:
                    id = "G"+str(i)
                    time_start = time.perf_counter()
                    X_tr = X_train.copy()
                    X_te = X_test.copy()
                    print(m)
                    print(n)
                    print(c)
                    print(o)
                    #print(s)
                    pipe = Pipeline([
                            ("makePos", prep.FunctionTransformer(MakePos, kw_args={"negs": NegContinuous})),
                            ("rare", ce.RareLabelEncoder(tol=0.01, n_categories=7, max_n_categories = 6, variables= categories, replace_with='Rare')),
                            ("cat_encode",c),
                            ("num_encode",n),
                            ("outlier", o),
                            ("filter_corr", select.DropCorrelatedFeatures(threshold=0.84))
                        ],verbose=True)
                        #try:
                    pipe.fit(X_tr,y_train)
                    X_tr = pipe.transform(X_tr)
                    X_te = pipe.transform(X_te)
                    names = X_te.columns
                    if s:
                        scale = prep.StandardScaler()
                        scale.fit(X_tr,y_train)
                        X_tr = scale.transform(X_tr)
                        X_te = scale.transform(X_te)
                    model = m
                    model.fit(X_tr,y_train)
                    print("Fit")
                    y_tr_pred = model.predict(X_tr)
                    y_te_pred = model.predict(X_te)
                    try:
                        importance = model.feature_importances_
                        TYPE.append("Tree")
                        importance = np.append(importance,id)
                        names = np.append(names,"ID")
                        d2 = pd.DataFrame([importance],columns = names)
                        d=pd.concat([d,d2]) 
                    except:
                        try:
                            importance = stats.coef_pval(model, X_tr, y_train)
                            importance = np.delete(importance, 0)
                            importance = np.append(importance,id)
                            names = np.append(names,"ID")
                            TYPE.append("Linear")
                            d2 = pd.DataFrame([importance],columns = names)
                            d=pd.concat([d,d2]) 
                        except:
                            TYPE.append("Not Sure")

                    time_stop = time.perf_counter()
                    MODEL.append(m)
                    CATEGORICAL.append(c)
                    NUMERIC.append(n)
                    OUTLIER.append(o)
                    #SCALER.append(s)
                    RMSE_TR.append(math.sqrt(mean_squared_error(y_train,y_tr_pred)))
                    MAE_TR.append(mean_absolute_error(y_train,y_tr_pred))
                    R2_TR.append(r2_score(y_train,y_tr_pred))
                    MAPE_TR.append(MAPE(y_train,y_tr_pred))
                    SMAPE_TR.append(SMAPE(y_train,y_tr_pred))
                    RMSE_TE.append(math.sqrt(mean_squared_error(y_test,y_te_pred)))
                    MAE_TE.append(mean_absolute_error(y_test,y_te_pred))
                    R2_TE.append(r2_score(y_test,y_te_pred))
                    MAPE_TE.append(MAPE(y_test,y_te_pred))
                    SMAPE_TE.append(SMAPE(y_test,y_te_pred))
                    FEATURES.append(names)
                    TIME.append((time_stop-time_start)/60)
                    ID.append(id)
                    i+=1
                    print((time_stop-time_start)/60)
                        
#                     except:
#                         time_stop = time.perf_counter()
#                         MODEL.append(m)
#                         CATEGORICAL.append(c)
#                         NUMERIC.append(n)
#                         OUTLIER.append(o)
#                         SCALER.append(s)
#                         RMSE_TR.append(".")
#                         MAE_TR.append(".")
#                         R2_TR.append(".")
#                         MAPE_TR.append(".")
#                         RMSE_TE.append(".")
#                         MAE_TE.append(".")
#                         R2_TE.append(".")
#                         MAPE_TE.append(".")
#                         TIME.append((time_stop-time_start)/60)
#                         ID.append(id)
#                         TYPE.append(".")
#                         i+=1

BayesianRidge()
None
MeanEncoder(variables=['bpg', 'store_number', 'sku_number', 'mpog_id'])
Winsorizer(tail='both',
           variables=['pop_est', 'pop_density', 'total_vio',
                      'avg_cluster_unit_sales', 'adjusted_avg_cluster_sales',
                      'avg_cluster_total_sales', 'sales_signal',
                      'failure_sales', 'lifecycle', 'adjusted_lifecycle',
                      'adj_avg_cluster_total_sales', 'unit_sales',
                      'projected_growth_pct', 'other_unit_pls_lost_sales_py',
                      'other_unit_pls_lost_sales', 'weighted_lookup_cnt',
                      'qty_wt0_ppy', 'ntrans_wt0_ppy', 'qty_wt0_py',
                      'ntrans_wt0_py', 'ntrans_wt0', 'unadjusted_total_vio',
                      'vio_compared_to_cluster', 'qty_wt0', 'pct_white', 'age',
                      'pct_college', 'pct_blue_collar',
                      'median_household_income', 'establishments', ...])
[Pipeline] ........... (step 1 

In [57]:
results1 = pd.DataFrame({"ID":ID,
                        "TYPE":TYPE,
                       "TIME":TIME,
                        "MODEL":MODEL,
                       "NUMERIC":NUMERIC,
                        "CATEGORICAL":CATEGORICAL,
                        "OUTLIER":OUTLIER,
                        #"SCALER":SCALER,
                       "RMSE_TR":RMSE_TR,
                       "MAE_TR":MAE_TR,
                       "R2_TR":R2_TR,
                       "MAPE_TR":MAPE_TR,
                       "SMAPE_TR":SMAPE_TR,
                       "RMSE_TE":RMSE_TE,
                       "MAE_TE":MAE_TE,
                       "R2_TE":R2_TE,
                       "MAPE_TE":MAPE_TE,
                       "SMAPE_TE":SMAPE_TE,
                       #"P_value":PVAL,
                       #"IMPORTANCE":IMPORTANCE,
                       #"FEATURES":FEATURES
                       })

In [58]:
results1

Unnamed: 0,ID,TYPE,TIME,MODEL,NUMERIC,CATEGORICAL,OUTLIER,RMSE_TR,MAE_TR,R2_TR,MAPE_TR,SMAPE_TR,RMSE_TE,MAE_TE,R2_TE,MAPE_TE,SMAPE_TE
0,G1,Not Sure,0.080585,BayesianRidge(),,"MeanEncoder(variables=['bpg', 'store_number', ...","Winsorizer(tail='both',\n variables=...",183.85958,43.26732,0.523037,59.332829,59.50048,163.300342,42.544093,0.576435,58.578134,59.198082
1,G2,Linear,0.087437,BayesianRidge(),,"MeanEncoder(variables=['bpg', 'store_number', ...","Winsorizer(tail='both',\n variables=...",183.862363,43.205695,0.523023,58.875856,59.055654,163.294889,42.480847,0.576464,58.112036,58.736645
2,G3,Not Sure,0.079856,BayesianRidge(),,"MeanEncoder(variables=['bpg', 'store_number', ...","Winsorizer(variables=['pop_est', 'pop_density'...",183.859553,43.267367,0.523037,59.333302,59.501074,163.300611,42.544296,0.576434,58.578684,59.19878
3,G4,Linear,0.081466,BayesianRidge(),,"MeanEncoder(variables=['bpg', 'store_number', ...","Winsorizer(variables=['pop_est', 'pop_density'...",183.862337,43.205734,0.523023,58.876259,59.056214,163.295157,42.481049,0.576462,58.112529,58.737293
4,G5,Not Sure,0.345497,BayesianRidge(),"YeoJohnsonTransformer(variables=['pop_est', 'p...","MeanEncoder(variables=['bpg', 'store_number', ...","Winsorizer(tail='both',\n variables=...",214.00196,68.473209,0.353829,162.722532,87.063412,194.516655,67.892167,0.399021,159.818076,86.421997
5,G6,Linear,0.371429,BayesianRidge(),"YeoJohnsonTransformer(variables=['pop_est', 'p...","MeanEncoder(variables=['bpg', 'store_number', ...","Winsorizer(tail='both',\n variables=...",214.010442,68.438731,0.353778,162.422176,86.968047,194.504536,67.861066,0.399096,159.515924,86.342899
6,G7,Not Sure,0.387754,BayesianRidge(),"YeoJohnsonTransformer(variables=['pop_est', 'p...","MeanEncoder(variables=['bpg', 'store_number', ...","Winsorizer(variables=['pop_est', 'pop_density'...",216.46605,70.798376,0.338863,174.742502,89.842498,197.155522,70.467871,0.382605,172.545329,89.400883
7,G8,Linear,0.36661,BayesianRidge(),"YeoJohnsonTransformer(variables=['pop_est', 'p...","MeanEncoder(variables=['bpg', 'store_number', ...","Winsorizer(variables=['pop_est', 'pop_density'...",216.472824,70.767806,0.338821,174.452855,89.791579,197.151021,70.447326,0.382633,172.28624,89.350276


In [59]:
final1=pd.merge(results1,d,how = "left",on="ID")

In [60]:
final1

Unnamed: 0,ID,TYPE,TIME,MODEL,NUMERIC,CATEGORICAL,OUTLIER,RMSE_TR,MAE_TR,R2_TR,...,pct_college,pct_blue_collar,median_household_income,establishments,road_quality_index,lifecycle_pre_peak_post,trend,unit_sales,other_unit_pls_lost_sales_py,qty_wt0_ppy
0,G1,Not Sure,0.080585,BayesianRidge(),,"MeanEncoder(variables=['bpg', 'store_number', ...","Winsorizer(tail='both',\n variables=...",183.85958,43.26732,0.523037,...,,,,,,,,,,
1,G2,Linear,0.087437,BayesianRidge(),,"MeanEncoder(variables=['bpg', 'store_number', ...","Winsorizer(tail='both',\n variables=...",183.862363,43.205695,0.523023,...,0.9535734578140322,0.0213319260226925,0.0,0.0001990411041354,1.3902119455622142e-08,0.163374596041002,0.0,,,
2,G3,Not Sure,0.079856,BayesianRidge(),,"MeanEncoder(variables=['bpg', 'store_number', ...","Winsorizer(variables=['pop_est', 'pop_density'...",183.859553,43.267367,0.523037,...,,,,,,,,,,
3,G4,Linear,0.081466,BayesianRidge(),,"MeanEncoder(variables=['bpg', 'store_number', ...","Winsorizer(variables=['pop_est', 'pop_density'...",183.862337,43.205734,0.523023,...,0.9535191131067192,0.0211535754613982,0.0,0.000199785629821,1.3964879919114992e-08,0.1633601193403477,0.0,,,
4,G5,Not Sure,0.345497,BayesianRidge(),"YeoJohnsonTransformer(variables=['pop_est', 'p...","MeanEncoder(variables=['bpg', 'store_number', ...","Winsorizer(tail='both',\n variables=...",214.00196,68.473209,0.353829,...,,,,,,,,,,
5,G6,Linear,0.371429,BayesianRidge(),"YeoJohnsonTransformer(variables=['pop_est', 'p...","MeanEncoder(variables=['bpg', 'store_number', ...","Winsorizer(tail='both',\n variables=...",214.010442,68.438731,0.353778,...,0.2973518663104322,1.0833541819188497e-07,0.0,,0.0,0.2611160075233776,0.0,8.215650382226158e-15,0.0,0.0
6,G7,Not Sure,0.387754,BayesianRidge(),"YeoJohnsonTransformer(variables=['pop_est', 'p...","MeanEncoder(variables=['bpg', 'store_number', ...","Winsorizer(variables=['pop_est', 'pop_density'...",216.46605,70.798376,0.338863,...,,,,,,,,,,
7,G8,Linear,0.36661,BayesianRidge(),"YeoJohnsonTransformer(variables=['pop_est', 'p...","MeanEncoder(variables=['bpg', 'store_number', ...","Winsorizer(variables=['pop_est', 'pop_density'...",216.472824,70.767806,0.338821,...,0.3125908392767491,4.440892098500626e-16,0.0,,0.0,0.5900822513676567,0.0,2.6867397195928788e-14,0.0,0.0


In [54]:
final.to_csv('final_1.csv')

In [13]:
cat = [#ce.OneHotEncoder(top_categories=None,drop_last=True),
       ce.CountFrequencyEncoder(encoding_method = "count"),
      ce.MeanEncoder(variables = categories)
      ]
num = [#None,
       #tran.YeoJohnsonTransformer(variables= continuous),
       #prep.FunctionTransformer(log_transform, kw_args={"variables":PosContinuous}),
       prep.FunctionTransformer(ratios, kw_args={"variables" : continuous, "tuples" : False}),
      ]
mod = [#Ridge(), 
       #LinearRegression(),
       #LassoLars(alpha=0.1),
       #BayesianRidge(),
       GradientBoostingRegressor(random_state=0),
      ]
outlier = [out.Winsorizer(capping_method = "gaussian", tail = "both",fold=3,variables=continuous),
           out.Winsorizer(capping_method = "gaussian", tail = "right",fold=3,variables=continuous),
           out.Winsorizer(capping_method = "gaussian", tail = "left",fold=3,variables=continuous),
           out.Winsorizer(capping_method = "iqr", tail = "both",fold=3,variables=continuous),
           out.Winsorizer(capping_method = "iqr", tail = "right",fold=3,variables=continuous),
           out.Winsorizer(capping_method = "iqr", tail = "left",fold=3,variables=continuous),
           out.Winsorizer(capping_method = "quantiles", tail = "both",fold=.05,variables=continuous),
           out.Winsorizer(capping_method = "quantiles", tail = "right",fold=.05,variables=continuous),
           out.Winsorizer(capping_method = "quantiles", tail = "left",fold=.05,variables=continuous),
           None
          ]

scale = [True,False]

d = pd.DataFrame()
TYPE = []
MODEL = []
CATEGORICAL = []
NUMERIC = []
OUTLIER = []
SCALER = []
FEATURE_SELECTION = []

RMSE_TR = []
MAE_TR = []
R2_TR = []
MAPE_TR = []
SMAPE_TR = []

RMSE_TE = []
MAE_TE = []
R2_TE = []
MAPE_TE = []
SMAPE_TE = []

PVAL = []
IMPORTANCE = []
FEATURES = []
TIME = []
ID = []
i=1

for m in mod: # loop through the models
    for n in num: # loop through the numeric transformations
        for c in cat: # loop through the categorical encoding
            for o in outlier:
                for s in [True,False]:
                    id = "G"+str(i)
                    time_start = time.perf_counter()
                    X_tr = X_train.copy()
                    X_te = X_test.copy()
                    print(m)
                    print(n)
                    print(c)
                    print(o)
                    #print(s)
                    pipe = Pipeline([
                            ("makePos", prep.FunctionTransformer(MakePos, kw_args={"negs": NegContinuous})),
                            ("rare", ce.RareLabelEncoder(tol=0.01, n_categories=7, max_n_categories = 6, variables= categories, replace_with='Rare')),
                            ("cat_encode",c),
                            ("num_encode",n),
                            ("outlier", o),
                            ("filter_corr", select.DropCorrelatedFeatures(threshold=0.84))
                        ],verbose=True)
                        #try:
                    pipe.fit(X_tr,y_train)
                    X_tr = pipe.transform(X_tr)
                    X_te = pipe.transform(X_te)
                    names = X_te.columns
                    if s:
                        scale = prep.StandardScaler()
                        scale.fit(X_tr,y_train)
                        X_tr = scale.transform(X_tr)
                        X_te = scale.transform(X_te)
                    model = m
                    model.fit(X_tr,y_train)
                    print("Fit")
                    y_tr_pred = model.predict(X_tr)
                    y_te_pred = model.predict(X_te)
                    try:
                        importance = model.feature_importances_
                        TYPE.append("Tree")
                        importance = np.append(importance,id)
                        names = np.append(names,"ID")
                        d2 = pd.DataFrame([importance],columns = names)
                        d=pd.concat([d,d2]) 
                    except:
                        try:
                            importance = stats.coef_pval(model, X_tr, y_train)
                            importance = np.delete(importance, 0)
                            importance = np.append(importance,id)
                            names = np.append(names,"ID")
                            TYPE.append("Linear")
                            d2 = pd.DataFrame([importance],columns = names)
                            d=pd.concat([d,d2]) 
                        except:
                            TYPE.append("Not Sure")

                    time_stop = time.perf_counter()
                    MODEL.append(m)
                    CATEGORICAL.append(c)
                    NUMERIC.append(n)
                    OUTLIER.append(o)
                    #SCALER.append(s)
                    RMSE_TR.append(math.sqrt(mean_squared_error(y_train,y_tr_pred)))
                    MAE_TR.append(mean_absolute_error(y_train,y_tr_pred))
                    R2_TR.append(r2_score(y_train,y_tr_pred))
                    MAPE_TR.append(MAPE(y_train,y_tr_pred))
                    SMAPE_TR.append(SMAPE(y_train,y_tr_pred))
                    RMSE_TE.append(math.sqrt(mean_squared_error(y_test,y_te_pred)))
                    MAE_TE.append(mean_absolute_error(y_test,y_te_pred))
                    R2_TE.append(r2_score(y_test,y_te_pred))
                    MAPE_TE.append(MAPE(y_test,y_te_pred))
                    SMAPE_TE.append(SMAPE(y_test,y_te_pred))
                    FEATURES.append(names)
                    TIME.append((time_stop-time_start)/60)
                    ID.append(id)
                    i+=1
                    print((time_stop-time_start)/60)
                        
#                     except:
#                         time_stop = time.perf_counter()
#                         MODEL.append(m)
#                         CATEGORICAL.append(c)
#                         NUMERIC.append(n)
#                         OUTLIER.append(o)
#                         SCALER.append(s)
#                         RMSE_TR.append(".")
#                         MAE_TR.append(".")
#                         R2_TR.append(".")
#                         MAPE_TR.append(".")
#                         RMSE_TE.append(".")
#                         MAE_TE.append(".")
#                         R2_TE.append(".")
#                         MAPE_TE.append(".")
#                         TIME.append((time_stop-time_start)/60)
#                         ID.append(id)
#                         TYPE.append(".")
#                         i+=1

GradientBoostingRegressor(random_state=0)
FunctionTransformer(func=<function ratios at 0x000001AA9E096790>,
                    kw_args={'tuples': False,
                             'variables': ['pop_est', 'pop_density',
                                           'total_vio',
                                           'avg_cluster_unit_sales',
                                           'adjusted_avg_cluster_sales',
                                           'avg_cluster_total_sales',
                                           'sales_signal', 'failure_sales',
                                           'lifecycle', 'adjusted_lifecycle',
                                           'adj_avg_cluster_total_sales',
                                           'unit_sales', 'projected_growth_pct',
                                           'other_unit_pls_lost_sales_py',
                                           'other_unit_pls_lost_sales',
                                           'weighted_

MemoryError: Unable to allocate 1.78 GiB for an array with shape (1089, 219492) and data type float64

In [None]:
results2 = pd.DataFrame({"ID":ID,
                        "TYPE":TYPE,
                        "TIME":TIME,
                        "MODEL":MODEL,
                       "NUMERIC":NUMERIC,
                        "CATEGORICAL":CATEGORICAL,
                        "OUTLIER":OUTLIER,
                        "SCALER":SCALER,
                       "RMSE_TR":RMSE_TR,
                       "MAE_TR":MAE_TR,
                       "R2_TR":R2_TR,
                       "MAPE_TR":MAPE_TR,
                       "SMAPE_TR":SMAPE_TR,
                       "RMSE_TE":RMSE_TE,
                       "MAE_TE":MAE_TE,
                       "R2_TE":R2_TE,
                       "MAPE_TE":MAPE_TE,
                       "SMAPE_TE":SMAPE_TE,
                       #"P_value":PVAL,
                       #"IMPORTANCE":IMPORTANCE,
                       #"FEATURES":FEATURES
                       })

In [None]:
results2

In [None]:
final2=pd.merge(results2,d,how = "left",on="ID")

In [None]:
final2

In [None]:
final.to_csv('final_2.csv')

In [1]:
# Loop-d-Loop
# Right now, I am doing all categories the same, might change variables later
cat = [#ce.OneHotEncoder(top_categories=None,drop_last=True),
       ce.CountFrequencyEncoder(encoding_method = "count"),
      ce.MeanEncoder()
      ]
num = [tran.YeoJohnsonTransformer(variables= continuous),       
      prep.FunctionTransformer(log_transform, kw_args={"variables":PosContinuous}),
      prep.FunctionTransformer(ratios, kw_args={"variables" : continuous, "tuples" : False}),
      None
      ]
mod = [Ridge(), 
       LinearRegression(),
       #LassoLars(alpha=0.1),
       BayesianRidge(),
       GradientBoostingRegressor(random_state=0)
      ]
outlier = [out.Winsorizer(capping_method = "gaussian", tail = "both",fold=3,variables=continuous),
      out.Winsorizer(capping_method = "iqr", tail = "both",fold=3,variables=continuous),
      out.Winsorizer(capping_method = "quantiles", tail = "both",fold=.05,variables=continuous),
      None]
           
scale = [True,False]
           
d = pd.DataFrame()
TYPE = []
MODEL = []
CATEGORICAL = []
NUMERIC = []
OUTLIER = []
SCALER = []
FEATURE_SELECTION = []
           
RMSE_TR = []
MAE_TR = []
R2_TR = []
MAPE_TR = []
SMAPE_TR = []
           
RMSE_TE = []
MAE_TE = []
R2_TE = []
MAPE_TE = []
SMAPE_TE = []
           
PVAL = []
IMPORTANCE = []
FEATURES = []
TIME = []
ID = []
N_COMPONENTS = []
           
i=1
for m in mod: # loop through the models
    for n in num: # loop through the numeric transformations
        for c in cat: # loop through the categorical encoding
            for o in outlier:
                for s in [True,False]:
#                     if o:
#                         print(i)
#                         i=i+1
#                         continue
                    if not s:
                        i+=1
                        continue
                    id = "G"+str(i)
                    time_start = time.perf_counter()
                    X_tr = X_train.copy()
                    X_te = X_test.copy()
                    print(m)
                    print(n)
                    print(c)
                    print(o)
                    print(s)
                    pipe = Pipeline([
                        ("makePos", prep.FunctionTransformer(MakePos, kw_args={"negs": NegContinuous})),
                        ("rare", ce.RareLabelEncoder(tol=0.01, n_categories=7, max_n_categories = 6, variables= categories, replace_with='Rare')),
                        ("cat_encode",c),
                        ("num_encode",n),
                        ("outlier", o),
                        ("filter_corr", select.DropCorrelatedFeatures(threshold=0.85))
                    ],verbose=True)
                    #try:
                    pipe.fit(X_tr,y_train)
                    X_tr = pipe.transform(X_tr)
                    X_te = pipe.transform(X_te)
                    names = X_te.columns
#                     if s:
#                         scale = prep.StandardScaler()
#                         scale.fit(X_tr,y_train)
#                         X_tr = scale.transform(X_tr)
#                         X_te = scale.transform(X_te)
                    
                    scale = prep.StandardScaler()
                    scale.fit(X_tr,y_train)
                    X_tr = scale.transform(X_tr)
                    X_te = scale.transform(X_te)
                    pca = PCA(n_components=0.95,svd_solver="full")
                    pca.fit(X_tr,y_train)
                    X_tr = pca.transform(X_tr)
                    X_te = pca.transform(X_te)
                    model = m
                    model.fit(X_tr,y_train)
                    print("Fit")
                    y_tr_pred = model.predict(X_tr)
                    y_te_pred = model.predict(X_te)
#                    try:
#                        importance = model.feature_importances_
#                        TYPE.append("Tree")
#                        importance = np.append(importance,id)
#                        names = np.append(names,"ID")
#                        d2 = pd.DataFrame([importance],columns = names)
#                        d=pd.concat([d,d2]) 
#                    except:
#                        try:
#                            importance = stats.coef_pval(model, X_tr, y_train)
#                            importance = np.delete(importance, 0)
#                            importance = np.append(importance,id)
#                            names = np.append(names,"ID")
#                            TYPE.append("Linear")
#                            d2 = pd.DataFrame([importance],columns = names)
#                            d=pd.concat([d,d2]) 
#                        except:
#                            TYPE.append("Not Sure")                        
                    
                    time_stop = time.perf_counter()
                    MODEL.append(m)
                    CATEGORICAL.append(c)
                    NUMERIC.append(n)
                    OUTLIER.append(o)
                    SCALER.append(s)
                    RMSE_TR.append(math.sqrt(mean_squared_error(y_train,y_tr_pred)))
                    MAE_TR.append(mean_absolute_error(y_train,y_tr_pred))
                    R2_TR.append(r2_score(y_train,y_tr_pred))
                    MAPE_TR.append(MAPE(y_train,y_tr_pred))
                    SMAPE_TR.append(SMAPE(y_train,y_tr_pred))
                    RMSE_TE.append(math.sqrt(mean_squared_error(y_test,y_te_pred)))
                    MAE_TE.append(mean_absolute_error(y_test,y_te_pred))
                    R2_TE.append(r2_score(y_test,y_te_pred))
                    MAPE_TE.append(MAPE(y_test,y_te_pred))
                    SMAPE_TE.append(SMAPE(y_test,y_te_pred))
                    FEATURES.append(names)
                    TIME.append((time_stop-time_start)/60)
                    ID.append(id)
                    N_COMPONENTS.append(pca.n_components_)
                    i+=1
                    print((time_stop-time_start)/60)
                        
#                     except:
#                         time_stop = time.perf_counter()
#                         MODEL.append(m)
#                         CATEGORICAL.append(c)
#                         NUMERIC.append(n)
#                         OUTLIER.append(o)
#                         SCALER.append(s)
#                         RMSE_TR.append(".")
#                         MAE_TR.append(".")
#                         R2_TR.append(".")
#                         MAPE_TR.append(".")
#                         RMSE_TE.append(".")
#                         MAE_TE.append(".")
#                         R2_TE.append(".")
#                         MAPE_TE.append(".")
#                         TIME.append((time_stop-time_start)/60)
#                         ID.append(id)
#                         TYPE.append(".")
#                         i+=1

NameError: name 'ce' is not defined

In [44]:
results5 = pd.DataFrame({"ID":ID,
                        #"TYPE":TYPE,
                        "TIME":TIME,
                        "MODEL":MODEL,
                        "NUMERIC":NUMERIC,
                        "CATEGORICAL":CATEGORICAL,
                        "OUTLIER":OUTLIER,
                        "N_COMPONENTS":N_COMPONENTS,
                        "SCALER":SCALER,
                        "RMSE_TR":RMSE_TR,
                        "MAE_TR":MAE_TR,
                        "R2_TR":R2_TR,
                        "MAPE_TR":MAPE_TR,
                        "SMAPE_TR":SMAPE_TR,
                        "RMSE_TE":RMSE_TE,
                        "MAE_TE":MAE_TE,
                        "R2_TE":R2_TE,
                        "MAPE_TE":MAPE_TE,
                        "SMAPE_TE":SMAPE_TE,
                        #"P_value":PVAL,
                        #"IMPORTANCE":IMPORTANCE,
                        #"FEATURES":FEATURES
                        })

In [45]:
results5

Unnamed: 0,ID,TIME,MODEL,NUMERIC,CATEGORICAL,OUTLIER,N_COMPONENTS,SCALER,RMSE_TR,MAE_TR,R2_TR,MAPE_TR,SMAPE_TR,RMSE_TE,MAE_TE,R2_TE,MAPE_TE,SMAPE_TE
0,G1,0.327285,Ridge(),"YeoJohnsonTransformer(variables=['pop_est', 'p...","OneHotEncoder(drop_last=True,\n v...","Winsorizer(tail='both',\n variables=...",27,True,217.00025,68.189885,0.335596,171.664396,86.651196,197.569529,67.766449,0.380009,170.528586,86.401902
1,G3,0.344185,Ridge(),"YeoJohnsonTransformer(variables=['pop_est', 'p...","OneHotEncoder(drop_last=True,\n v...","Winsorizer(capping_method='iqr', tail='both',\...",25,True,221.512088,76.339528,0.30768,211.11558,93.0769,202.521906,76.092985,0.348537,210.213357,92.836883
2,G5,0.33985,Ridge(),"YeoJohnsonTransformer(variables=['pop_est', 'p...","OneHotEncoder(drop_last=True,\n v...","Winsorizer(capping_method='quantiles', fold=0....",27,True,223.331325,66.927721,0.296262,150.425962,82.300001,204.258793,66.504834,0.337315,149.295903,81.981517
3,G7,0.320527,Ridge(),"YeoJohnsonTransformer(variables=['pop_est', 'p...","OneHotEncoder(drop_last=True,\n v...",,28,True,216.033421,69.396946,0.341503,183.975731,88.088654,196.828791,69.061307,0.384649,182.801994,87.787503
4,G9,0.310853,Ridge(),"YeoJohnsonTransformer(variables=['pop_est', 'p...","CountFrequencyEncoder(variables=['bpg', 'store...","Winsorizer(tail='both',\n variables=...",22,True,217.923653,68.658906,0.329929,175.441462,86.691819,199.007394,68.300848,0.370952,174.12922,86.387607
5,G11,0.317896,Ridge(),"YeoJohnsonTransformer(variables=['pop_est', 'p...","CountFrequencyEncoder(variables=['bpg', 'store...","Winsorizer(capping_method='iqr', tail='both',\...",20,True,222.974704,77.376613,0.298507,216.752894,93.612652,204.488835,77.25564,0.335822,215.973299,93.379421
6,G13,0.326693,Ridge(),"YeoJohnsonTransformer(variables=['pop_est', 'p...","CountFrequencyEncoder(variables=['bpg', 'store...","Winsorizer(capping_method='quantiles', fold=0....",22,True,224.828233,67.502973,0.286796,152.680722,82.092042,206.334843,67.188128,0.323776,151.74873,81.826655
7,G15,0.337568,Ridge(),"YeoJohnsonTransformer(variables=['pop_est', 'p...","CountFrequencyEncoder(variables=['bpg', 'store...",,23,True,216.777222,69.65451,0.336961,183.659333,87.958941,198.090985,69.402138,0.376732,182.390876,87.705634
8,G17,0.371161,Ridge(),"YeoJohnsonTransformer(variables=['pop_est', 'p...","MeanEncoder(variables=['bpg', 'store_number', ...","Winsorizer(tail='both',\n variables=...",22,True,216.851023,67.86527,0.336509,172.860035,85.988307,197.560764,67.492071,0.380064,171.533772,85.671354
9,G19,0.382224,Ridge(),"YeoJohnsonTransformer(variables=['pop_est', 'p...","MeanEncoder(variables=['bpg', 'store_number', ...","Winsorizer(capping_method='iqr', tail='both',\...",20,True,221.859222,76.531577,0.305508,213.114892,93.10602,202.974978,76.352445,0.345619,212.336567,92.864212


In [None]:
results5.to_csv('final_6.csv')