In [None]:
!pip install statsmodels==0.12.0

In [None]:


import itertools as itr
import math
import warnings
from multiprocessing import Pool, cpu_count

import numpy as np
import pandas as pd
import statsmodels.api as sm
from fbprophet import Prophet
from statsmodels.tsa.api import ExponentialSmoothing
from tqdm import tqdm, tnrange, trange


In [None]:
from itertools import product
def my_product(inp):
    """Expand a mapping of option lists into every combination of options.

    Parameters
    ----------
    inp : dict
        Maps each option name to an iterable of candidate values.

    Returns
    -------
    list of dict
        One dict per element of the Cartesian product of the value lists,
        keyed by the names of ``inp``.
    """
    names = inp.keys()
    combinations = []
    for values in product(*inp.values()):
        combinations.append(dict(zip(names, values)))
    return combinations

# Search space passed as keyword arguments to ExponentialSmoothing in fit_es:
# trend and seasonal component each either absent (None) or additive.
pattern = dict(trend=[None, 'add'], seasonal=[None, 'add'])
params = my_product(pattern)

In [None]:


# Load the calendar table; the 'date' column is parsed to datetimes while reading.
calendar = pd.read_csv('../input/kaggle-afc/calendar_afcs2020.csv', parse_dates=['date'])


In [None]:


def read_sales(filename):
    """Load a sales CSV indexed by ``id`` and relabel its columns with calendar dates.

    Parameters
    ----------
    filename : str
        Path of the CSV file; it must contain an ``id`` column.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``id`` whose columns are replaced, position by
        position, with the leading entries of the module-level ``calendar.date``.
    """
    frame = pd.read_csv(filename).set_index('id')
    n_columns = len(frame.columns)
    # Column k receives the k-th calendar date.
    frame.columns = calendar.date[:n_columns]
    return frame



In [None]:
%%time
# Validation-period sales, indexed by id with date-labelled columns (see read_sales).
sales = read_sales('../input/kaggle-afc/sales_train_validation_afcs2020.csv')

In [None]:


# Information criterion fit_es minimises when choosing between candidate models.
# fit_es handles exactly 'aic', 'aicc' and 'bic'; the commented lines are the alternatives.
#metric = 'aic'
#metric = 'aicc'
metric = 'bic'



In [None]:
def fit_es(data):
    """Fit exponential-smoothing candidates to one series and forecast 28 steps.

    Every keyword combination in the module-level ``params`` is fitted with a
    seasonal period of 7 on a daily frequency. The fit with the lowest value of
    the information criterion named by the module-level ``metric`` produces the
    forecast.

    Parameters
    ----------
    data : tuple
        ``(data_id, series)`` pair, as yielded by ``DataFrame.iterrows()``.

    Returns
    -------
    pandas.DataFrame
        A single row: an ``id`` column followed by ``F1`` .. ``F28``. A series
        without any positive observation gets an all-zero forecast.

    Raises
    ------
    NotImplementedError
        If ``metric`` is not ``'aic'``, ``'aicc'`` or ``'bic'``.
    RuntimeError
        If no candidate yields a score below infinity (e.g. ``params`` is
        empty or every score is NaN).
    """
    horizon = 28
    forecast_columns = [f'F{i+1}' for i in range(horizon)]

    data_id, data = data
    positive = data[data > 0]
    if positive.empty:
        # There is no first positive observation to trim to (indexing it used
        # to raise IndexError), so forecast zeros for this series.
        f = pd.DataFrame([[0.0] * horizon], columns=forecast_columns)
        f.insert(0, 'id', data_id)
        return f

    if metric not in ('aic', 'aicc', 'bic'):
        raise NotImplementedError(f'unsupported selection metric: {metric!r}')

    # Drop the leading observations before the first positive value.
    data = data.loc[positive.index[0]:]

    best_score = np.inf
    best_model = None
    for param in params:
        fit = ExponentialSmoothing(data, seasonal_periods=7, initialization_method='estimated', freq='D', **param).fit()
        # The fitted result exposes the criterion under the same name as `metric`.
        score = getattr(fit, metric)
        if best_score > score:
            best_score = score
            best_model = fit

    if best_model is None:
        raise RuntimeError(f'no exponential-smoothing candidate could be selected for {data_id!r}')

    f = best_model.forecast(horizon)
    f = pd.DataFrame([f])
    f.columns = forecast_columns
    f.insert(0, 'id', data_id)
    print(best_model.params)
    return f

In [None]:


def forecast(sales, processes=4):
    """Run fit_es on every row of ``sales`` in parallel and stack the results.

    Parameters
    ----------
    sales : pandas.DataFrame
        One series per row; each ``(index, row)`` pair is handed to ``fit_es``.
        Pass a subset such as ``sales.head(100)`` for a quick trial run.
    processes : int, optional
        Number of worker processes (default 4, the previously hard-coded value).

    Returns
    -------
    pandas.DataFrame
        Concatenation of the one-row frames returned by ``fit_es``, in row order.
    """
    sales_list = list(sales.iterrows())
    # map() blocks until all results are in, so the pool can be torn down on
    # leaving the block; previously it was never closed and its workers
    # stayed alive after each call.
    with Pool(processes) as pool:
        result = pool.map(fit_es, sales_list)
    return pd.concat(result)



In [None]:


%%time
# Forecast every series of the validation-period sales frame.
sub_valid = forecast(sales)



In [None]:
# Display the validation-period forecasts (one row per id, columns F1..F28).
sub_valid

In [None]:


%%time
# Rebinds `sales` to the evaluation-period file; the validation frame loaded earlier is replaced.
sales = read_sales('../input/kaggle-afc/sales_train_evaluation_afcs2020.csv')



In [None]:
# Forecast the evaluation-period series.
# NOTE(review): sub_eval is only referenced by the commented-out concat that follows — confirm this run is still needed.
sub_eval = forecast(sales)

In [None]:
#submission = pd.concat([sub_valid, sub_eval]).reset_index(drop=True)

In [None]:
# Check the dimensions (rows, columns) of the validation forecasts.
sub_valid.shape

In [None]:
# Write the validation forecasts without the index; floats are rendered with 5 significant digits.
# NOTE(review): the file name says 'aic' while `metric` is set to 'bic' — confirm the intended name.
sub_valid.to_csv('subaic.csv', index=False, float_format='%.5g')


In [None]:




# Reload the raw CSV files for the per-article regression experiment.
# `sales` is rebound here to the plain validation frame (read_sales is not used, so `id` stays a column).
sales = pd.read_csv('../input/kaggle-afc/sales_train_validation_afcs2020.csv')
prices = pd.read_csv('../input/kaggle-afc/sell_prices_afcs2020.csv')
cal = pd.read_csv('../input/kaggle-afc/calendar_afcs2020.csv')
samp_subm = pd.read_csv('../input/kaggle-afc/sample_submission_afcs2020.csv')



In [None]:


# Replace the categorical columns of `sales` with the encoder's output, in place.
# NOTE(review): LabelEncoder is not imported anywhere in the visible source — presumably
# sklearn.preprocessing.LabelEncoder; confirm and add it to the imports cell.
features_cat = ['cat_id', 'state_id']
# NOTE(review): this fills missing values in `cal`, although the loop below only touches `sales`.
cal.fillna('empty', inplace=True)
le = LabelEncoder()
for col in features_cat:
    le.fit(sales[col])
    sales[col] = le.transform(sales[col])



In [None]:
# Replace the event columns of `cal` with the encoder's output, in place.
# NOTE(review): LabelEncoder is not imported in the visible source — presumably sklearn's; confirm.
features_cat = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
# Missing values in every column of `cal` become the string 'empty' before encoding.
cal.fillna('empty', inplace=True)
le = LabelEncoder()
for col in features_cat:
    le.fit(cal[col])
    cal[col] = le.transform(cal[col])

In [None]:


def predict_article(articles_list):
    """Fit the module-level ``model`` once per article and predict 28 days.

    For each article, the module-level design frame ``X_train_org`` is joined
    with that article's rows of ``prices``, reduced to a fixed feature list,
    scaled with the module-level ``scaler`` and used to fit ``model`` on
    ``log1p`` of the article's sales.

    Parameters
    ----------
    articles_list : iterable of int
        Rows of ``sales`` to process. Each value is used both as a ``.loc``
        label and as an ``.iloc`` position, so ``sales`` is assumed to carry a
        default integer index — TODO confirm.

    Returns
    -------
    list of numpy.ndarray
        Two arrays per article, in order: the predictions for design-matrix
        rows 1914..1941 (back-transformed with ``expm1``), then a 28-element
        zero placeholder for the evaluation period.
    """
    skip_days = 0
    merge_on = ['wm_yr_wk', 'store_id']
    features = ['wday', 'month', 'year', 'sell_price', 'snap_CA', 'snap_TX', 'snap_WI']

    results = []
    for article in articles_list:
        store = sales.loc[article, 'store_id']
        item = sales.loc[article, 'item_id']

        # assign() returns a copy, so the shared X_train_org is no longer
        # overwritten with the current article's store on every iteration.
        X_train = pd.merge(
            X_train_org.assign(store_id=store),
            prices[prices['item_id'] == item],
            on=merge_on,
            how='left',
        )

        X_train = X_train[features]
        # Missing feature values (presumably weeks without a price row) are
        # replaced by the column mean; done out-of-place instead of an
        # in-place fillna on a column subset.
        X_train = X_train.fillna(X_train.mean())

        # Columns from position 6 onward are taken as the daily sales of the article.
        y_train = sales[sales.columns[6+skip_days:]].iloc[article].values
        y_train = np.log1p(y_train)

        # scale data
        scaler.fit(X_train)
        X_train_scale = scaler.transform(X_train)

        # train model on rows 1..1913, predict rows 1914..1941
        model.fit(X_train_scale[1+skip_days:1913+1], y_train)
        preds_val = np.expm1(model.predict(X_train_scale[1913+1:1941+1]))

        # Evaluation-period forecasting is disabled; the zero vector keeps the
        # two-arrays-per-article layout of the result.
        preds_eval = np.zeros(28, dtype=int)

        results.append(preds_val)
        results.append(preds_eval)

    return results
    



In [None]:
# NOTE(review): this builds a Prophet model and discards it; run_prophet constructs its own
# instance with the same arguments, so this cell looks like a leftover — confirm before removing.
Prophet(uncertainty_samples=False,weekly_seasonality = True, yearly_seasonality = True,)

In [None]:
def run_prophet(timeserie):
    """Fit a Prophet model to one series and return its 28-period forecast.

    Parameters
    ----------
    timeserie : pandas.DataFrame
        Training frame handed directly to ``Prophet.fit`` (the caller in this
        notebook supplies the columns ``ds`` and ``y``).

    Returns
    -------
    pandas.DataFrame
        Result of ``Prophet.predict`` for the 28 periods after the history;
        the history itself is excluded.
    """
    prophet_options = dict(
        uncertainty_samples=False,
        weekly_seasonality=True,
        yearly_seasonality=True,
    )
    m = Prophet(**prophet_options)
    # optional: add usa holidays
    m.add_country_holidays(country_name='US')
    m.fit(timeserie)

    horizon_frame = m.make_future_dataframe(periods=28, include_history=False)
    return m.predict(horizon_frame)

In [None]:
# start_from_ob = 800 # orig
start_from_ob = 700


def _prophet_inputs():
    """Yield one ``ds``/``y`` frame per row of ``sales_train``, in row order."""
    for i in range(sales_train.shape[0]):
        # Skip the first 6 columns plus the first `start_from_ob` observations.
        temp_series = sales_train.iloc[i, start_from_ob + 6:]
        temp_series.index = calendar_df['date'][start_from_ob:start_from_ob + len(temp_series)]
        temp_series = pd.DataFrame(temp_series).reset_index()
        temp_series.columns = ['ds', 'y']
        yield temp_series


# One pool for the whole run. Previously a new Pool was created for every
# series and mapped over a one-element list, so the fits ran one at a time
# and each paid the pool start-up cost. imap() returns results in input order.
with Pool(cpu_count()) as p:
    prophet_forecasts = p.imap(run_prophet, _prophet_inputs())
    for i, prophet_forecast in enumerate(tqdm(prophet_forecasts, total=sales_train.shape[0])):
        submission.iloc[i, 1:] = prophet_forecast['yhat'].values

# Non-positive (and missing) forecasts are replaced by 0.
submission.iloc[:,1:]=submission.iloc[:,1:].where(submission.iloc[:,1:] > 0).fillna(0)



In [None]:
# Preview the first rows of the Prophet submission.
submission.head()

In [None]:


# Write the Prophet submission without the index column.
# NOTE(review): 'submissio.csv' looks like a typo for 'submission.csv' (the name written at the
# end of the notebook) — confirm whether the different name is intentional.
submission.to_csv('submissio.csv', index=False)



In [None]:
# Reload the raw tables for the aggregate (total sales) model; `cal` is rebound to a fresh, unmodified copy.
stv = pd.read_csv('../input/kaggle-afc/sales_train_validation_afcs2020.csv')
cal = pd.read_csv('../input/kaggle-afc/calendar_afcs2020.csv')
sell_prices = pd.read_csv('../input/kaggle-afc/sell_prices_afcs2020.csv')
sample_output = pd.read_csv('../input/kaggle-afc/sample_submission_afcs2020.csv')


In [None]:
# Total per day column: sum every column whose name contains 'd_' over all rows,
# then expose the column label and the total as the columns 'd' and 'sales'.
d_cols = [name for name in stv.columns if 'd_' in name]
all_data = stv[d_cols].sum(axis=0).reset_index()
all_data.columns = ['d', 'sales']

In [None]:
# Attach the calendar row of each day. Without `on=` the join uses the columns the two frames
# share; validate='1:1' makes pandas raise if a key repeats on either side.
all_data_merged = all_data.merge(cal, how='left', validate='1:1')
all_data_merged.head()

In [None]:
# Float copy of the daily totals: the averages written below are not integers,
# and the merged frame must not be modified through this series.
y = all_data_merged.set_index('date')['sales'].astype(float)

#Detect days that have either event_1 or event_2
has_event = (all_data_merged['event_name_1'].notna()
             | all_data_merged['event_name_2'].notna())
places = all_data_merged.loc[has_event, 'd']
change = has_event.tolist()

# Replace each event day by the mean of its neighbouring days. The pass runs
# left to right, so the left neighbour may itself already be smoothed.
last_pos = len(y) - 1
for i, is_event in enumerate(change):
    if not is_event:
        continue
    neighbours = []
    if i > 0:
        # Without this guard i-1 == -1 silently wraps around to the last day.
        neighbours.append(y.iloc[i - 1])
    if i < last_pos:
        # Without this guard i+1 runs past the end and raises IndexError.
        neighbours.append(y.iloc[i + 1])
    if neighbours:
        y.iloc[i] = sum(neighbours) / len(neighbours)


In [None]:


# Mean of the daily series per 'W' (weekly) bin — note the bins are weeks despite the `_month` name.
y_month = (
    y.reset_index()
     .assign(date=lambda frame: pd.to_datetime(frame['date']))
     .set_index('date')
     .resample('W')
     .mean()
)


In [None]:


# Mean of the daily series per 'M' (month-end) bin — note the bins are months despite the `_year` name.
y_year = (
    y.reset_index()
     .assign(date=lambda frame: pd.to_datetime(frame['date']))
     .set_index('date')
     .resample('M')
     .mean()
)

In [None]:
y_sales = y.reset_index().drop(['date'],axis=1)

#Time scale: one step per observation, derived from the data instead of a hard-coded 1913
predic1 = range(len(y_sales))


def _fourier_pair(period):
    """Return the (cos, sin) lists of one harmonic with the given period over predic1."""
    cos_terms = [math.cos(2*math.pi*x/period) for x in predic1]
    sin_terms = [math.sin(2*math.pi*x/period) for x in predic1]
    return cos_terms, sin_terms


#Applying the Fourier series to the time scale (periods of 365, 30 and 7 steps)
predic_annual_cos, predic_annual_sin = _fourier_pair(365)
predic_month_cos, predic_month_sin = _fourier_pair(30)
predic_week_cos, predic_week_sin = _fourier_pair(7)

#assembling the regressors
reg = pd.DataFrame({
    'predic1': list(predic1),
    'predic_annual_cos': predic_annual_cos,
    'predic_annual_sin': predic_annual_sin,
    'predic_month_cos': predic_month_cos,
    'predic_month_sin': predic_month_sin,
    'predic_week_cos': predic_week_cos,
    'predic_week_sin': predic_week_sin,
})

In [None]:
# Fit the regression of y_sales on the time index and Fourier terms in `reg`.
# NOTE(review): LinearRegression is not imported in the visible source — presumably
# sklearn.linear_model.LinearRegression; confirm and add the import.
# NOTE(review): this rebinds the global name `model`, which predict_article also reads.
model = LinearRegression().fit(reg, y_sales)

#The estimated parameters
# The score is computed on the same data the model was fitted on.
r2 = model.score(reg, y_sales)
print('coefficient of determination:', r2)

In [None]:
# Rebuild each fitted component from its coefficients. coef_[0][k] follows the column order of
# `reg`: 0 = time index, 1-2 = annual cos/sin, 3-4 = monthly cos/sin, 5-6 = weekly cos/sin.
trend = model.intercept_ + model.coef_[0][0]*np.array(predic1)
seas_annual = model.coef_[0][1]*np.array(predic_annual_cos) + model.coef_[0][2]*np.array(predic_annual_sin)
seas_month = model.coef_[0][3]*np.array(predic_month_cos) + model.coef_[0][4]*np.array(predic_month_sin)
seas_week = model.coef_[0][5]*np.array(predic_week_cos) + model.coef_[0][6]*np.array(predic_week_sin)

# Sum of the trend and the three seasonal components.
trend_seas = trend + seas_annual + seas_month + seas_week

ax = pd.DataFrame(trend_seas, columns=['trend+seasonalities']).plot(figsize=(20,8))

In [None]:


# Remainder after removing the fitted trend and seasonal components from the sales series.
y_adjusted = pd.DataFrame({'noise': y_sales['sales'].to_numpy() - trend_seas})

In [None]:


# Hold out the final 28 observations for testing; everything before them is training data.
y_train = y_adjusted.head(-28)
y_test = y_adjusted.tail(28)



In [None]:
# Candidate orders: p and q each take the values 0..5, d the values 0..2
p = q = range(0, 6)
d = [0,1,2]

# Generate all different combinations of (p, d, q) triplets
pdq = list(itr.product(p, d, q))

# Only the non-seasonal specification is searched
seasonal_pdq = [[0,0,0,0]]

warnings.filterwarnings("ignore") # specify to ignore warning messages
minimum = np.inf    # lowest AIC seen so far
param_ideal = None  # stays None if every candidate fails to fit
for param in pdq:
    for param_seasonal in seasonal_pdq:
        try:
            mod = sm.tsa.statespace.SARIMAX(y_train,
                                            order=param,
                                            seasonal_order=param_seasonal)

            results = mod.fit()
        except Exception as exc:
            # Report which order failed and why instead of swallowing every
            # error (a bare except also caught KeyboardInterrupt).
            print('ARIMA{} - failed: {!r}'.format(param, exc))
            continue

        if results.aic < minimum:
            minimum = results.aic
            param_ideal = param

        print('ARIMA{} - AIC:{}'.format(param, results.aic))

if param_ideal is None:
    print('No ARIMA candidate could be fitted')
else:
    print('And the result is ARIMA{} - AIC:{}'.format(param_ideal, minimum))


In [None]:


# Refit a single SARIMAX on the training part with a fixed order.
# NOTE(review): (5, 0, 5) is hard-coded rather than taken from `param_ideal` computed by the
# grid search — presumably the order that search selected; confirm.
mod = sm.tsa.statespace.SARIMAX(y_train,
                                order=(5, 0, 5),
                                seasonal_order=(0, 0, 0, 0))
results = mod.fit()

In [None]:
pred = results.get_forecast(steps=28)
pred_ci = pred.conf_int()  # computed once, used for both interval bounds

ax = y_test.plot(figsize=(20, 10))

# Name the forecast explicitly. DataFrame(series, columns=['forecast']) looks up a
# column called 'forecast'; if predicted_mean carries a different name the result
# is all NaN and the forecast line is not drawn.
pd.Series(pred.predicted_mean, name='forecast').to_frame().plot(ax=ax)

# Shade the interval over the hold-out positions (the index of y_test).
ax.fill_between(y_test.index, pred_ci['lower noise'], pred_ci['upper noise'], color='b', alpha=.04)

In [None]:


# Add the fitted trend/seasonality of the hold-out window back onto the noise
# series, its forecast and the interval bounds.
holdout_index = y_test.index
holdout_trend = trend_seas[-len(y_test):]
pred_ci = pred.conf_int()  # computed once, used for both interval bounds

ax = pd.DataFrame(np.array(list(y_test['noise'])) + holdout_trend, index=holdout_index, columns=['sales']).plot(figsize=(20, 10))

# Name the forecast explicitly. DataFrame(series, columns=['forecast']) looks up a
# column called 'forecast'; if the series carries a different name the result is
# all NaN and the forecast line is not drawn.
pd.Series(pred.predicted_mean + holdout_trend, name='forecast').to_frame().plot(ax=ax)

ax.fill_between(holdout_index, pred_ci['lower noise'] + holdout_trend, pred_ci['upper noise'] + holdout_trend, color='b', alpha=.04)


In [None]:
# Model predictions for positions 0 through 1912.
# NOTE(review): `results` was fitted on y_train only, so positions past its end are presumably
# out-of-sample forecasts — confirm.
pred_all = results.predict(start=0, end=1912)

In [None]:
# Display the predictions computed by results.predict.
pred_all

In [None]:
# Reload the raw tables for the weekday-average baseline; `ss` is the sample submission.
stv = pd.read_csv('../input/kaggle-afc/sales_train_validation_afcs2020.csv')
cal = pd.read_csv('../input/kaggle-afc/calendar_afcs2020.csv')
sell_prices = pd.read_csv('../input/kaggle-afc/sell_prices_afcs2020.csv')
ss = pd.read_csv('../input/kaggle-afc/sample_submission_afcs2020.csv')


In [None]:
# Keep the first column plus the final 28 columns.
# np.r_ is taken from numpy directly: the `pd.np` alias is deprecated since pandas 1.0
# and no longer exists in pandas 2.
last_28 = stv.iloc[:, np.r_[0,-28:0]]
last_28.head()

In [None]:
# Long format: one row per (id, column label) pair, with the label in `d` and the value in `demand`.
last = pd.melt(last_28, id_vars=['id'], var_name='d', value_name='demand')
last.head()


In [None]:
# Attach the calendar columns; merge() with no arguments is an inner join on the columns both frames share.
last = last.merge(cal)
last.head()

In [None]:
# Mean demand per (id, wday) pair over the rows of `last`.
by_weekday = last.groupby(['id','wday'])['demand'].mean()

In [None]:


# Work on a copy of the sample submission so `ss` itself stays untouched.
sub = ss.copy()
# Relabel the columns as 'id' followed by the 28 day labels d_1914 .. d_1941.
sub.columns = ['id'] + [f'd_{day}' for day in range(1914, 1914 + 28)]
# Keep only the rows whose id contains 'validation'.
sub = sub[sub['id'].str.contains('validation')]



In [None]:


# melt this dataframe and merge it with the calendar so we can join it with by_weekday dataframe
sub = sub.melt('id', var_name='d', value_name='demand')
# Only id, d and wday are kept, so the melted placeholder `demand` column is dropped here.
sub = sub.merge(cal)[['id', 'd', 'wday']]
# Look up each row's (id, wday) in by_weekday; this adds the mean as the `demand` column.
df = sub.join(by_weekday, on=['id', 'wday'])
df.head()



In [None]:


# Back to wide format for submission: one row per id, one column per day label.
# reset_index() turns `id` from the index back into an ordinary column.
df = df.pivot(index='id', columns='d', values='demand').reset_index()
df.head()



In [None]:
# Check the dimensions (rows, columns) of the pivoted frame.
df.shape

In [None]:


# Start the submission from the id column of the sample submission (copied, so `ss` is not modified).
submission = ss[['id']].copy()



In [None]:


# Attach the pivoted forecasts. merge() defaults to an inner join on the shared column(s),
# so ids that are missing from df are dropped from submission.
submission = submission.merge(df)



In [None]:
#submission = pd.concat([submission, submission], axis=0)

In [None]:
# Check the dimensions (rows, columns) of the merged submission.
submission.shape

In [None]:
# Overwrite the id column with the sample submission's ids, position by position.
# NOTE(review): this needs submission to have exactly as many rows as ss and assumes the row
# order still matches ss after the merge — confirm (df only holds ids containing 'validation').
submission['id'] = ss.id.values


In [None]:
# Relabel the columns, by position, as 'id' followed by F1 .. F28.
submission.columns = ['id'] + [f'F{step}' for step in range(1, 29)]
submission.head()

In [None]:
# Write the weekday-average submission without the index column.
submission.to_csv('submission.csv', index=False)