# Data Preparation

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Directory holding the Rossmann CSV files. Kept in one place so the notebook can be
# pointed at another machine's copy by editing a single line.
DATA_DIR = r'C:\Users\Crist\RossmanStores\Rossmann-Store-Sales-Project\data'

# NOTE(review): the warning residue printed under this cell looks like pandas'
# mixed-type DtypeWarning. In train.csv that is presumably the StateHoliday column
# (number 0 mixed with the strings '0', 'a', 'b', 'c') -- confirm against the file.
# Reading it as text keeps 0 and '0' from becoming two different categories later.
sales_df = pd.read_csv(DATA_DIR + r'\train.csv', dtype={'StateHoliday': str})
store_df = pd.read_csv(DATA_DIR + r'\store.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Convert the Date column to pandas datetimes so the `.dt` accessor works on it.
sales_df['Date'] = pd.to_datetime(sales_df['Date'])


In [4]:
# Revenue per visitor for every row.
# NOTE(review): rows where Customers is 0 give NaN (0/0) or inf here -- confirm that
# this is acceptable for the per-store aggregates computed below.
sales_df['SalesPerCustomer'] = sales_df['Sales'] / sales_df['Customers']

# Per-store mean of each metric, with the columns renamed to AvgSales, AvgCustomers
# and AvgSalesPerCustomer.
avg_store = (
    sales_df.groupby('Store')[['Sales', 'Customers', 'SalesPerCustomer']]
    .mean()
    .rename(columns=lambda x: 'Avg' + x)
)

# Drop Avg* columns left by a previous execution of this cell; without this a re-run
# merges them a second time and yields AvgSales_x / AvgSales_y style duplicates.
sales_df = sales_df.drop(columns=list(avg_store.columns), errors='ignore')

# The averages go on the left of the merge so they follow 'Store' in the column order.
sales_df = pd.merge(avg_store.reset_index(), sales_df, on='Store')

In [5]:
def _iso_week(dates):
    """Return the ISO week number of every value in the datetime Series ``dates``."""
    if hasattr(dates.dt, 'isocalendar'):
        week = dates.dt.isocalendar().week
        # isocalendar() returns the nullable UInt32 dtype; convert to a NumPy dtype so
        # arithmetic against float columns behaves as it did with ``.dt.week``.
        return week.astype('float64' if week.isna().any() else 'int64')
    # pandas releases that predate isocalendar() still provide ``.dt.week``.
    return dates.dt.week


def build_features(train, store):
    """Join per-day sales rows with store metadata and derive model features.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain 'Store', 'StateHoliday' and a datetime 'Date' column.
    store : pandas.DataFrame
        Must contain 'Store', 'StoreType', 'Assortment',
        'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear',
        'Promo2SinceWeek' and 'Promo2SinceYear'.

    Returns
    -------
    pandas.DataFrame
        Left merge of ``train`` with ``store`` on 'Store', with:
        * 'StoreType', 'Assortment' and 'StateHoliday' replaced by integer
          category codes,
        * every missing value replaced by 0,
        * added 'Year', 'Month', 'Day', 'Week' calendar columns,
        * added 'MonthsCompetitionOpen' and 'WeeksPromoOpen' (both 0 where the
          corresponding "since year" is unknown).

    Neither input frame is modified.
    """
    # Encode on new frames: assigning into the arguments would silently rewrite the
    # caller's DataFrames.
    store = store.assign(
        StoreType=store['StoreType'].astype('category').cat.codes,
        Assortment=store['Assortment'].astype('category').cat.codes,
    )
    train = train.assign(
        StateHoliday=train['StateHoliday'].astype('category').cat.codes,
    )

    merged = pd.merge(train, store, on='Store', how='left')

    # Replace every NaN with a sentinel; the same value is used below to recognise
    # rows whose "since year" was originally missing.
    NaN_replace = 0
    merged = merged.fillna(NaN_replace)

    merged['Year'] = merged['Date'].dt.year
    merged['Month'] = merged['Date'].dt.month
    merged['Day'] = merged['Date'].dt.day
    # ``.dt.week`` / ``.dt.weekofyear`` were removed in pandas 2.0.
    merged['Week'] = _iso_week(merged['Date'])

    # Number of months that competition has existed for
    merged['MonthsCompetitionOpen'] = (
        12 * (merged['Year'] - merged['CompetitionOpenSinceYear'])
        + (merged['Month'] - merged['CompetitionOpenSinceMonth'])
    )
    merged.loc[merged['CompetitionOpenSinceYear'] == NaN_replace,
               'MonthsCompetitionOpen'] = NaN_replace

    # Number of weeks that promotion has existed for. A year difference is worth 52
    # weeks; the previous factor of 12 mixed months into a week count.
    merged['WeeksPromoOpen'] = (
        52 * (merged['Year'] - merged['Promo2SinceYear'])
        + (merged['Week'] - merged['Promo2SinceWeek'])
    )
    merged.loc[merged['Promo2SinceYear'] == NaN_replace,
               'WeeksPromoOpen'] = NaN_replace

    # These columns became float because of the NaNs they held before filling.
    toInt = [
        'CompetitionOpenSinceMonth',
        'CompetitionOpenSinceYear',
        'Promo2SinceWeek',
        'Promo2SinceYear',
        'MonthsCompetitionOpen',
        'WeeksPromoOpen',
    ]
    merged[toInt] = merged[toInt].astype(int)

    return merged

# Per-store median of each metric, with the columns renamed to MedSales, MedCustomers
# and MedSalesPerCustomer.
med_store = (
    sales_df.groupby('Store')[['Sales', 'Customers', 'SalesPerCustomer']]
    .median()
    .rename(columns=lambda x: 'Med' + x)
)

# Drop Med* columns left by a previous execution of this cell; without this a re-run
# merges them into store_df again and yields MedSales_x / MedSales_y duplicates.
store_df = store_df.drop(columns=list(med_store.columns), errors='ignore')
store_df = pd.merge(med_store.reset_index(), store_df, on='Store')

features = build_features(sales_df, store_df)
features.head()

Unnamed: 0,Store,AvgSales,AvgCustomers,AvgSalesPerCustomer,DayOfWeek,Date,Sales,Customers,Open,Promo,...,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Year,Month,Day,Week,MonthsCompetitionOpen,WeeksPromoOpen
0,1,3945.704883,467.646497,8.393038,5,2015-07-31,5263,555,1,1,...,0,0,0,0,2015,7,31,31,82,0
1,1,3945.704883,467.646497,8.393038,4,2015-07-30,5020,546,1,1,...,0,0,0,0,2015,7,30,31,82,0
2,1,3945.704883,467.646497,8.393038,3,2015-07-29,4782,523,1,1,...,0,0,0,0,2015,7,29,31,82,0
3,1,3945.704883,467.646497,8.393038,2,2015-07-28,5011,560,1,1,...,0,0,0,0,2015,7,28,31,82,0
4,1,3945.704883,467.646497,8.393038,1,2015-07-27,6102,612,1,1,...,0,0,0,0,2015,7,27,31,82,0


In [6]:
# Show the column names of the feature table returned by build_features.
features.columns

Index(['Store', 'AvgSales', 'AvgCustomers', 'AvgSalesPerCustomer', 'DayOfWeek',
       'Date', 'Sales', 'Customers', 'Open', 'Promo', 'StateHoliday',
       'SchoolHoliday', 'SalesPerCustomer', 'MedSales', 'MedCustomers',
       'MedSalesPerCustomer', 'StoreType', 'Assortment', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval', 'Year', 'Month',
       'Day', 'Week', 'MonthsCompetitionOpen', 'WeeksPromoOpen'],
      dtype='object')

In [10]:
from sklearn.model_selection import train_test_split
# Names of the feature columns handed to the model, in the order the model sees them.
# `X` is also read as a global by plot_importance to label its bars.
X = [
    # store identity, traffic and nearest competitor
    'Store', 'Customers', 'CompetitionDistance',
    # promotions
    'Promo', 'Promo2',
    # holidays and store characteristics (disabled: 'SchoolHoliday')
    'StateHoliday', 'StoreType', 'Assortment',
    # per-store means
    'AvgSales', 'AvgCustomers', 'AvgSalesPerCustomer',
    # per-store medians
    'MedSales', 'MedCustomers', 'MedSalesPerCustomer',
    # calendar
    'DayOfWeek', 'Week', 'Day', 'Month', 'Year',
    # raw "since" fields (disabled: 'MonthsCompetitionOpen', 'WeeksPromoOpen')
    'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear',
    'Promo2SinceWeek', 'Promo2SinceYear',
]

# Hold out 15% of the rows, chosen at random with a fixed seed, for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    features[X],
    features['Sales'],
    test_size=0.15,
    random_state=10,
)

In [14]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
# Error calculating function
def rmspe(y, y_hat):
    """Root mean squared percentage error of predictions ``y_hat`` against ``y``.

    Entries whose true value is exactly 0 are left out of the average: the
    percentage error is undefined there, and a single such row would otherwise
    turn the whole result into inf or NaN.

    Parameters
    ----------
    y : array-like
        True values.
    y_hat : array-like
        Predicted values, matched to ``y`` by position.

    Returns
    -------
    float
        sqrt(mean(((y - y_hat) / y) ** 2)) over the entries with ``y != 0``,
        or NaN when there are no such entries.
    """
    # Work on plain arrays so a pandas Series is compared by position, not by index.
    y = np.asarray(y, dtype=float)
    y_hat = np.asarray(y_hat, dtype=float)

    scored = y != 0
    if not scored.any():
        return float('nan')

    pct_error = (y - y_hat)[scored] / y[scored]
    return np.sqrt(np.mean(pct_error ** 2))

# Wrap rmspe as a scikit-learn scorer. rmspe is a loss (lower is better), hence
# greater_is_better=False.
rmpse_scorer = make_scorer(
    rmspe,
    greater_is_better=False,
)

def score(model, X_train, y_train, y_test, y_hat, cv=5, n_jobs=None):
    """Print cross-validated and hold-out RMSPE figures for ``model``.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``cross_val_score``.
    X_train, y_train :
        Data the cross-validation is run on.
    y_test, y_hat :
        Hold-out targets and the predictions made for them; compared with rmspe.
    cv : int, default 5
        Passed to ``cross_val_score`` as ``cv``.
    n_jobs : int or None, default None
        Passed to ``cross_val_score`` as ``n_jobs``; set it (e.g. -1) to evaluate
        folds in parallel when a single-process run takes too long.

    Prints three lines ('Mean', 'Variance', 'RMSPE') and returns None.
    'Mean' and 'Variance' are taken over the values returned by rmpse_scorer,
    which is built with greater_is_better=False.
    """
    # Named cv_scores so the local no longer shadows this function's own name.
    cv_scores = cross_val_score(
        model, X_train, y_train, scoring=rmpse_scorer, cv=cv, n_jobs=n_jobs)
    print('Mean', cv_scores.mean())
    print('Variance', cv_scores.var())
    print('RMSPE', rmspe(y_test, y_hat))

def plot_importance(model):
    """Draw a horizontal bar chart of ``model.feature_importances_``.

    Bars are labelled with the names in the global list ``X`` (paired with the
    importances by position) and ordered by ascending importance.

    Parameters
    ----------
    model :
        Fitted estimator exposing ``feature_importances_``.
    """
    # Imported here because the notebook never imports matplotlib at the top; calling
    # this function used to fail with "NameError: name 'plt' is not defined".
    import matplotlib.pyplot as plt

    ranked = sorted(zip(X, model.feature_importances_), key=lambda pair: pair[1])
    labels, vals = zip(*ranked)

    # Size the axis from the pairs actually plotted so positions, values and labels
    # always have the same length.
    positions = np.arange(len(labels))
    plt.barh(positions, vals, align='center')
    plt.yticks(positions, labels)

In [12]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
# AdaBoost ensemble whose base learner is a default-parameter decision tree.
# random_state fixes the estimator's internal randomness so that re-running the
# notebook fits the same model (the value matches the seed used for the split).
adaboost_tree = AdaBoostRegressor(DecisionTreeRegressor(), random_state=10)
adaboost_tree.fit(X_train, y_train)
#plot_importance(adaboost_tree)

NameError: name 'plt' is not defined

In [15]:
# Predict the held-out rows, then print the cross-validation and hold-out errors.
y_hat = adaboost_tree.predict(X_test)
score(
    model=adaboost_tree,
    X_train=X_train,
    y_train=y_train,
    y_test=y_test,
    y_hat=y_hat,
)

KeyboardInterrupt: 

In [16]:
from joblib import dump, load
# Serialise the fitted model to a file in the current working directory.
dump(value=adaboost_tree, filename='adaboost_tree.joblib')

['adaboost_tree.joblib']