In [59]:
import os
from pathlib import Path, PureWindowsPath
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from scipy.stats import ttest_ind
import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.api as sm

#Packages related to clustering
from tslearn.clustering import silhouette_score
from sklearn.decomposition import PCA
from tslearn.clustering import TimeSeriesKMeans
from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

In [60]:
#! pip install tslearn

In [61]:
os.path.dirname(os.getcwd())

'C:\\Users\\Cornelius\\Cory Dropbox\\Cory LeRoy\\PC\\Documents\\GitHub\\Store-Sales'

In [62]:
path_cwd = Path(PureWindowsPath(os.path.dirname(os.getcwd())))
path_cwd

WindowsPath('C:/Users/Cornelius/Cory Dropbox/Cory LeRoy/PC/Documents/GitHub/Store-Sales')

In [63]:
path = path_cwd / 'data'
path

WindowsPath('C:/Users/Cornelius/Cory Dropbox/Cory LeRoy/PC/Documents/GitHub/Store-Sales/data')

In [64]:
df_train = pd.read_csv(path / 'train.csv')
df_transaction = pd.read_csv(path / 'transactions.csv')
df_holidays = pd.read_csv(path / 'holidays_events.csv')
df_oil = pd.read_csv(path / 'oil.csv')
df_stores = pd.read_csv(path / 'stores.csv')
df_test = pd.read_csv(path / 'test.csv')

## Clean Data ##

In [65]:
#rename oil to be used for missing value interpolation
new_oil = df_oil.copy()
new_oil.rename(columns={'dcoilwtico': 'oil_price'}, inplace=True)

In [66]:
# Assuming df is your DataFrame and 'column_name' is the name of the column
null_count = new_oil['oil_price'].isnull().sum()

# This will give you the count of null values in the 'column_name' column
print("Count of null values in 'oil_price':", null_count)

Count of null values in 'oil_price': 43


In [67]:
new_oil['oil_price'] = new_oil['oil_price'].interpolate(method='index')

In [68]:
# Assuming df is your DataFrame and 'column_name' is the name of the column
null_count = new_oil['oil_price'].isnull().sum()

# This will give you the count of null values in the 'column_name' column
print("Count of null values in 'oil_price':", null_count)

Count of null values in 'oil_price': 1


In [69]:
new_oil.at[0, 'oil_price'] = 93.14

In [70]:
# Assuming df is your DataFrame and 'column_name' is the name of the column
null_count = new_oil['oil_price'].isnull().sum()

# This will give you the count of null values in the 'column_name' column
print("Count of null values in 'oil_price':", null_count)

Count of null values in 'oil_price': 0


#### shorten date ####

In [71]:
def train_to_store_merge(train, store):
    df_train_store_merged = pd.merge(train, store, how='left', on='store_nbr')
    return df_train_store_merged

In [72]:
# train clean

df_train['date'] = pd.to_datetime(df_train['date'])
first_sale_date_per_store = df_train[df_train['sales'] > 0].groupby('store_nbr')['date'].min().reset_index()

# remove rows before stores were open. only do this to train
df_train_min_date = pd.merge(df_train, first_sale_date_per_store, on='store_nbr')
df_train_shortened = df_train_min_date[df_train_min_date['date_x'] >= df_train_min_date['date_y']] 
df_train_shortened = df_train_shortened.drop(['date_y'], axis=1)
df_train_shortened.rename(columns={'date_x':'date'}, inplace=True)
df_train_shortened = train_to_store_merge(df_train_shortened, df_stores)

# remove dates when stores were temporarily closed
df_train_short = df_train_shortened[~((df_train_shortened['date']>='2016-08-22') &(df_train_shortened['date']<='2016-10-26') &(df_train_shortened['store_nbr']==25))]
df_train_short =df_train_short[~((df_train_short['date']>='2014-04-14') &(df_train_short['date']<='2014-07-23') &(df_train_short['store_nbr']==24))]
df_train_short =df_train_short[~((df_train_short['date']>='2013-07-08') &(df_train_short['date']<='2013-07-30') &(df_train_short['store_nbr']==30))]
df_train_short =df_train_short[~((df_train_short['date']>='2014-08-04') &(df_train_short['date']<='2014-09-10') &(df_train_short['store_nbr']==14))]
df_train_short =df_train_short[~((df_train_short['date']>='2015-03-30') &(df_train_short['date']<='2015-05-28') &(df_train_short['store_nbr']==12))]
df_train_shortened =df_train_short[~((df_train_short['date']>='2016-08-15') &(df_train_short['date']<='2016-12-02') &(df_train_short['store_nbr']==18))]

In [73]:
def create_date_features(df):
    df['day_of_week'] = df['date'].dt.dayofweek
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    return df

#### holidays feature eng ####

In [74]:
# holidays 

df_holidays['date'] = pd.to_datetime(df_holidays['date'])
df_holidays_real = df_holidays[df_holidays['transferred']==False] 

#unique_holis2 = df_holidays_real['description'].drop_duplicates()
unique_holis2 = df_holidays_real[['description','locale']].drop_duplicates()
unique_holis_national = unique_holis2[unique_holis2['locale']=='National'].drop(['locale'],axis=1)
unique_holis_city = unique_holis2[unique_holis2['locale']=='Local'].drop(['locale'],axis=1)
unique_holis_state = unique_holis2[unique_holis2['locale']=='Regional'].drop(['locale'],axis=1)

national_holidays = df_holidays_real[df_holidays_real['locale']=='National'].loc[:,('date','description')]
local_holidays = df_holidays_real[df_holidays_real['locale']=='Local'].loc[:,('date','description','locale_name')]
state_holidays = df_holidays_real[df_holidays_real['locale']=='Regional'].loc[:,('date','description','locale_name')]

df_train_summed_daily = df_train_shortened.groupby(['date','city','state']).agg({'onpromotion':'sum', 'sales':'sum'}).reset_index()


In [75]:
df_holi = pd.merge(df_train_summed_daily, national_holidays, how='left', on='date')
df_holi = pd.merge(df_holi, state_holidays, how='left', left_on=['date', 'state'], right_on=['date','locale_name'])
df_holi = pd.merge(df_holi, local_holidays, how='left', left_on=['date', 'city'], right_on=['date','locale_name'])

df_holi = df_holi.drop(['locale_name_x','locale_name_y'],axis=1)
df_holi = df_holi.rename(columns = {'description_x':'national_holiday','description_y':'state_holiday','description':'city_holiday'})

In [76]:
# set boolean column for each unique holiday. still have dups

for holiday in unique_holis_national['description'].tolist():
    df_holi[holiday] = df_holi['national_holiday'] == holiday
for holiday in unique_holis_state['description'].tolist():
    df_holi[holiday] = df_holi['state_holiday'] == holiday
for holiday in unique_holis_city['description'].tolist():
    df_holi[holiday] = df_holi['city_holiday'] == holiday

df_holi = df_holi.drop(['national_holiday','state_holiday','city_holiday'] ,axis=1)

  df_holi[holiday] = df_holi['city_holiday'] == holiday
  df_holi[holiday] = df_holi['city_holiday'] == holiday
  df_holi[holiday] = df_holi['city_holiday'] == holiday
  df_holi[holiday] = df_holi['city_holiday'] == holiday
  df_holi[holiday] = df_holi['city_holiday'] == holiday
  df_holi[holiday] = df_holi['city_holiday'] == holiday
  df_holi[holiday] = df_holi['city_holiday'] == holiday


In [77]:
# combines duplicates rows into 1 rows where there is a true for each holiday that falls on the given date
unique_holis_list = list(unique_holis2['description'])
agg_func = {col: 'any' for col in unique_holis_list}

aggregated_df = df_holi.groupby(['date', 'sales', 'city', 'state', 'onpromotion']).agg(agg_func).reset_index()


aggregated_df = pd.get_dummies(aggregated_df, columns=['city', 'state'], prefix=['city', 'state'])

In [78]:
#OLS to determine impact of each holiday across all stores

In [79]:
# seasonality has big impact on predicted sales so need to add some variables to capture 
aggregated_df_ols = aggregated_df.copy()
aggregated_df_ols['dow'] = aggregated_df_ols['date'].dt.dayofweek
aggregated_df_ols['month'] = aggregated_df_ols['date'].dt.month
aggregated_df_ols['year'] = aggregated_df_ols['date'].dt.year
aggregated_df_ols = aggregated_df_ols.drop('date',axis=1)

In [80]:
#model seasonality with dummy vars... do i need this?
aggregated_df_ols_dummies =  pd.get_dummies(aggregated_df_ols, columns=['dow','month', 'year'], drop_first=True)

#### add oil interpolated, lag_1, rolling mean ####

In [81]:
def add_lag_rolling(aggregated_df_ols_dummies):
        
# Lag sales by 1 to get previous day of sales value
    aggregated_df_ols_dummies['lag_1'] = aggregated_df_ols_dummies['sales'].shift(1)

# Add rolling mean for 7-day window
    aggregated_df_ols_dummies['rolling_mean'] = aggregated_df_ols_dummies['sales'].rolling(window=7).mean()


    aggregated_df_ols_dummies['lag_1'].fillna(aggregated_df_ols_dummies['sales'], inplace=True)
    aggregated_df_ols_dummies['rolling_mean'].fillna(aggregated_df_ols_dummies['sales'],inplace=True)
    return aggregated_df_ols_dummies

In [82]:
aggregated_df_ols_dummies = add_lag_rolling(aggregated_df_ols_dummies)

#### Further Modeling ####

In [83]:
X = aggregated_df_ols_dummies.drop('sales',axis=1)
# add constant for linear regression
X = sm.add_constant(X)
X = X.astype(int)
y=aggregated_df_ols_dummies['sales']

In [84]:
model_OLS = sm.OLS(y,X).fit()
model_summary = model_OLS.summary()

In [85]:
X

Unnamed: 0,const,onpromotion,Fundacion de Manta,Provincializacion de Cotopaxi,Fundacion de Cuenca,Cantonizacion de Libertad,Cantonizacion de Riobamba,Cantonizacion del Puyo,Cantonizacion de Guaranda,Provincializacion de Imbabura,...,month_9,month_10,month_11,month_12,year_2014,year_2015,year_2016,year_2017,lag_1,rolling_mean
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2511,2511
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2511,2978
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2978,4973
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4973,5316
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,5316,5615
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35272,1,394,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,26808,21080
35273,1,621,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,28950,23269
35274,1,605,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,30309,26187
35275,1,1682,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,35793,38414


In [86]:
# remove features that have p value >.10 
# ... no xmas day in trainset
pvalues = model_OLS.pvalues
alpha = .05
drop_columns = pvalues.to_frame(name='feature').reset_index()
drop_columns = drop_columns[(~drop_columns['index'].str.contains('city')) \
                            & (~drop_columns['index'].str.contains('month')) \
                            & (~drop_columns['index'].str.contains('year')) \
                            & (~drop_columns['index'].str.contains('state')) \
                            & (drop_columns['feature']>alpha) \
                           ]
#drop_columns = pvalues[(pvalues>alpha)]
drop_cols = list(drop_columns['index'])
drop_cols

['Fundacion de Manta',
 'Provincializacion de Cotopaxi',
 'Fundacion de Cuenca',
 'Cantonizacion de Libertad',
 'Cantonizacion de Riobamba',
 'Cantonizacion del Puyo',
 'Cantonizacion de Guaranda',
 'Provincializacion de Imbabura',
 'Cantonizacion de Latacunga',
 'Fundacion de Machala',
 'Fundacion de Santo Domingo',
 'Cantonizacion de El Carmen',
 'Cantonizacion de Cayambe',
 'Fundacion de Esmeraldas',
 'Primer Grito de Independencia',
 'Fundacion de Riobamba',
 'Fundacion de Ambato',
 'Fundacion de Ibarra',
 'Cantonizacion de Quevedo',
 'Traslado Independencia de Guayaquil',
 'Dia de Difuntos',
 'Provincializacion de Santo Domingo',
 'Provincializacion Santa Elena',
 'Independencia de Guaranda',
 'Independencia de Latacunga',
 'Independencia de Ambato',
 'Fundacion de Quito-1',
 'Fundacion de Loja',
 'Puente Navidad',
 'Navidad',
 'Navidad+1',
 'Puente Primer dia del ano',
 'Primer dia del ano-1',
 'Recupero puente Navidad',
 'Recupero puente primer dia del ano',
 'Viernes Santo',
 '

In [87]:
#values2 = pd.DataFrame(pvalues).reset_index()
#pvalues2.rename(columns={'index':'holiday',0:'p'} , inplace=True)
#pvalues2

In [88]:
print(len(drop_cols), 'holidays dropped out of', len(unique_holis2),'.', len(unique_holis2)-len(drop_cols) ,'unique holidays remain' )

85 holidays dropped out of 103 . 18 unique holidays remain


In [89]:
df_holi_shortened = df_holidays_real[~df_holidays_real['description'].isin(drop_cols)]
df_holi_shortened = df_holi_shortened.drop(['type','locale','locale_name','transferred'],axis=1)

In [90]:
# unique holidays from the reduced holidays list
unique_holidays = df_holi_shortened['description'].unique()
filtered_holidays = df_holidays[df_holidays['description'].isin(unique_holidays)]

In [91]:
#separate these out because they have differenct merge conditions
national_holidays_filtered = filtered_holidays[filtered_holidays['locale']=='National'].loc[:,('date','description')]
local_holidays_filtered = filtered_holidays[filtered_holidays['locale']=='Local'].loc[:,('date','description','locale_name')]
state_holidays_filtered = filtered_holidays[filtered_holidays['locale']=='Regional'].loc[:,('date','description','locale_name')]

In [92]:
# unique holidays from the reduced holidays list
unique_holidays = df_holi_shortened['description'].unique()

#### merge train to holidays ####

In [93]:
# unique holidays from the reduced holidays list
filtered_holidays = df_holidays[df_holidays['description'].isin(unique_holidays)]
#separate locales out because they have differenct merge conditions
national_holidays_filtered = filtered_holidays[filtered_holidays['locale']=='National'].loc[:,('date','description')]
local_holidays_filtered = filtered_holidays[filtered_holidays['locale']=='Local'].loc[:,('date','description','locale_name')]
state_holidays_filtered = filtered_holidays[filtered_holidays['locale']=='Regional'].loc[:,('date','description','locale_name')]

In [94]:
def train_to_holiday_merge(train, national_holidays_filtered, state_holidays_filtered, local_holidays_filtered):
    train['date'] = pd.to_datetime(train['date'])
    df_train_filtered = pd.merge(train, national_holidays_filtered, how='left', on='date')
    df_train_filtered = pd.merge(df_train_filtered, state_holidays_filtered, how='left', left_on=['date', 'state'], right_on=['date','locale_name'])
    df_train_filtered = pd.merge(df_train_filtered, local_holidays_filtered, how='left', left_on=['date', 'city'], right_on=['date','locale_name'])
    df_train_filtered['holiday'] = df_train_filtered['description_x'].combine_first(df_train_filtered['description_y']).combine_first(df_train_filtered['description'])

    df_train_filtered = df_train_filtered.drop(['locale_name_x','locale_name_y','description','description_x','description_y'],axis=1)
    return df_train_filtered

In [95]:
def train_to_oil_merge(train, oil):
    df_oil['date'] = pd.to_datetime(df_oil['date'])
    df = pd.merge(train,oil, how='left', on='date')
    return df

### K-Means Clustering

In [96]:
df_daily_sales_by_family = df_train_shortened.groupby(['date', 'family'])['sales'].sum().reset_index()

In [97]:
# Convert date column to datetime if it's not already in datetime format
df_daily_sales_by_family['date'] = pd.to_datetime(df_daily_sales_by_family['date'])

# Pivot the data
df_pivot = df_daily_sales_by_family.pivot(index='date', columns='family', values='sales').fillna(0)

df_pivot = df_pivot.T

XTrain = df_pivot.to_numpy()
XCategories = df_pivot.index

seed = 0
np.random.seed(seed)
x_train = TimeSeriesScalerMeanVariance().fit_transform(XTrain)
sz=x_train.shape[1]

n_clusters = 6
sz = x_train.shape[1]

km = TimeSeriesKMeans(n_clusters=n_clusters, verbose=False, random_state=seed)
y_pred_km = km.fit_predict(x_train)

myDict = {}

for i in range(len(XCategories)):
    key = XCategories[i]
    value = y_pred_km[i]
    myDict[key] = value


def familyCluster(fam):
    return myDict.get(fam)

df_train_shortened['familycluster'] = df_train_shortened['family'].apply(lambda x: familyCluster(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_shortened['familycluster'] = df_train_shortened['family'].apply(lambda x: familyCluster(x))


In [98]:
#### combine train ####
df_train_filtered = train_to_holiday_merge(df_train_shortened, national_holidays_filtered, state_holidays_filtered,local_holidays_filtered)
df_train_filtered = create_date_features(df_train_filtered)
df_train_merged = train_to_oil_merge(df_train_filtered, df_oil)

df_train_merged = df_train_merged.drop(['id','city','state', 'type'], axis=1)

df_train_merged

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['date'] = pd.to_datetime(train['date'])


Unnamed: 0,date,store_nbr,family,sales,onpromotion,cluster,familycluster,holiday,day_of_week,month,year,dcoilwtico
0,2013-01-02,1,AUTOMOTIVE,2.000,0,13,5,,2,1,2013,93.14
1,2013-01-02,1,BABY CARE,0.000,0,13,0,,2,1,2013,93.14
2,2013-01-02,1,BEAUTY,2.000,0,13,5,,2,1,2013,93.14
3,2013-01-02,1,BEVERAGES,1091.000,0,13,0,,2,1,2013,93.14
4,2013-01-02,1,BOOKS,0.000,0,13,5,,2,1,2013,93.14
...,...,...,...,...,...,...,...,...,...,...,...,...
2765692,2017-08-15,9,POULTRY,438.133,0,6,3,,1,8,2017,47.57
2765693,2017-08-15,9,PREPARED FOODS,154.553,1,6,3,,1,8,2017,47.57
2765694,2017-08-15,9,PRODUCE,2419.729,148,6,0,,1,8,2017,47.57
2765695,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8,6,5,,1,8,2017,47.57


In [108]:
df_train_dummies = pd.get_dummies(df_train_merged, columns=['cluster','holiday','familycluster']) ## add family pca here maybe?
# if we arent getting expected results, create dummies for date features
df_train_dummies.drop(columns=['family])
df_train_dummies

Unnamed: 0,date,store_nbr,family,sales,onpromotion,day_of_week,month,year,dcoilwtico,cluster_1,...,holiday_Terremoto Manabi+3,holiday_Terremoto Manabi+4,holiday_Terremoto Manabi+5,holiday_Traslado Primer dia del ano,familycluster_0,familycluster_1,familycluster_2,familycluster_3,familycluster_4,familycluster_5
0,2013-01-02,1,AUTOMOTIVE,2.000,0,2,1,2013,93.14,False,...,False,False,False,False,False,False,False,False,False,True
1,2013-01-02,1,BABY CARE,0.000,0,2,1,2013,93.14,False,...,False,False,False,False,True,False,False,False,False,False
2,2013-01-02,1,BEAUTY,2.000,0,2,1,2013,93.14,False,...,False,False,False,False,False,False,False,False,False,True
3,2013-01-02,1,BEVERAGES,1091.000,0,2,1,2013,93.14,False,...,False,False,False,False,True,False,False,False,False,False
4,2013-01-02,1,BOOKS,0.000,0,2,1,2013,93.14,False,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2765692,2017-08-15,9,POULTRY,438.133,0,1,8,2017,47.57,False,...,False,False,False,False,False,False,False,True,False,False
2765693,2017-08-15,9,PREPARED FOODS,154.553,1,1,8,2017,47.57,False,...,False,False,False,False,False,False,False,True,False,False
2765694,2017-08-15,9,PRODUCE,2419.729,148,1,8,2017,47.57,False,...,False,False,False,False,True,False,False,False,False,False
2765695,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8,1,8,2017,47.57,False,...,False,False,False,False,False,False,False,False,False,True


In [109]:
df_train_dummies.to_pickle('df_train.pkl')

## Transformations on test data ##

In [110]:
df_test_transformed = train_to_store_merge(df_test, df_stores)

df_test_transformed = train_to_holiday_merge(df_test_transformed, national_holidays_filtered, state_holidays_filtered, local_holidays_filtered)
df_test_transformed = create_date_features(df_test_transformed)
df_test_transformed = train_to_oil_merge(df_test_transformed, df_oil)

df_test_transformed = df_test_transformed.drop(['id','city','state', 'type'], axis=1)
df_test_transformed = create_date_features(df_test_transformed)

df_test_transformed['familycluster'] = df_test_transformed['family'].apply(lambda x: familyCluster(x))

df_test_transformed

Unnamed: 0,date,store_nbr,family,onpromotion,cluster,holiday,day_of_week,month,year,dcoilwtico
0,2017-08-16,1,AUTOMOTIVE,0,13,,2,8,2017,46.80
1,2017-08-16,1,BABY CARE,0,13,,2,8,2017,46.80
2,2017-08-16,1,BEAUTY,2,13,,2,8,2017,46.80
3,2017-08-16,1,BEVERAGES,20,13,,2,8,2017,46.80
4,2017-08-16,1,BOOKS,0,13,,2,8,2017,46.80
...,...,...,...,...,...,...,...,...,...,...
28507,2017-08-31,9,POULTRY,1,6,,3,8,2017,47.26
28508,2017-08-31,9,PREPARED FOODS,0,6,,3,8,2017,47.26
28509,2017-08-31,9,PRODUCE,1,6,,3,8,2017,47.26
28510,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,9,6,,3,8,2017,47.26


Unnamed: 0,date,store_nbr,family,onpromotion,cluster,holiday,day_of_week,month,year,dcoilwtico,familycluster
0,2017-08-16,1,AUTOMOTIVE,0,13,,2,8,2017,46.80,5
1,2017-08-16,1,BABY CARE,0,13,,2,8,2017,46.80,0
2,2017-08-16,1,BEAUTY,2,13,,2,8,2017,46.80,5
3,2017-08-16,1,BEVERAGES,20,13,,2,8,2017,46.80,0
4,2017-08-16,1,BOOKS,0,13,,2,8,2017,46.80,5
...,...,...,...,...,...,...,...,...,...,...,...
28507,2017-08-31,9,POULTRY,1,6,,3,8,2017,47.26,3
28508,2017-08-31,9,PREPARED FOODS,0,6,,3,8,2017,47.26,3
28509,2017-08-31,9,PRODUCE,1,6,,3,8,2017,47.26,0
28510,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,9,6,,3,8,2017,47.26,5


In [111]:
df_test_dummies = pd.get_dummies(df_test_transformed, columns=['cluster','holiday','familycluster', 'month', 'year','day_of_week']) ## add family pca here maybe?

KeyError: "['familycluster'] not in index"

In [112]:
df_test_dummies.to_pickle('df_test.pkl')

In [104]:
#df_test = pd.read_pickle('df_test.pkl')
#df_train = pd.read_pickle('df_train.pkl')