In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
stores = pd.read_csv('data/stores.csv')
feature = pd.read_csv('data/features.csv')
sample = pd.read_csv('data/sampleSubmission.csv')

In [4]:
feature['Date'] = pd.to_datetime(feature['Date'])
train['Date'] = pd.to_datetime(train['Date'])
test['Date'] = pd.to_datetime(test['Date'])
test.insert(3, 'Weekly_Sales', 0)
mf = feature.merge(stores, how='inner', on=['Store'])

In [5]:
concated = pd.concat([train, test])

In [7]:
#merged = pd.concat([train, test], axis=0)
merged = concated.copy()
merged = merged.sort_values(by=['Store','Dept','Date'])
merged = pd.merge(merged, mf, how='inner', on=['Date','Store','IsHoliday'])
merged['week'] = merged['Date'].dt.week
merged['month'] = merged['Date'].dt.month
merged['year'] = merged['Date'].dt.year
merged['int_week'] = merged['Date'].dt.week
merged['int_week'].loc[merged[merged['year']==2011].index] += 52
merged['int_week'].loc[merged[merged['year']==2012].index] += 52+52

In [9]:
len(merged), len(concated)

(536634, 536634)

In [11]:
train_df = merged.drop(columns=['IsHoliday','Store','Dept','week','month','Type','Date','year',
                          'int_week','MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4','MarkDown5'])

In [12]:
hvec = pd.get_dummies(merged['IsHoliday'], prefix='h')
svec = pd.get_dummies(merged['Store'], prefix='s')
dvec = pd.get_dummies(merged['Dept'], prefix='d')
wvec = pd.get_dummies(merged['week'], prefix='w')
mvec = pd.get_dummies(merged['month'], prefix='m')
tvec = pd.get_dummies(merged['Type'], prefix='t')
train_df = pd.concat([train_df, hvec, svec, dvec, wvec, mvec, tvec], axis=1)

In [53]:
test_sample = train_df.drop(index=train.index).reset_index(drop=True)

In [20]:
### xgboost 

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
from xgboost import XGBRegressor

In [25]:
X = train_df.iloc[train.index].drop(columns=['Weekly_Sales'])
y = train_df.iloc[train.index]['Weekly_Sales']

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
xgb1 = XGBRegressor(n_estimators=500, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7)
xgb1.fit(X_train, y_train)
preds = xgb1.predict(X_train)

In [33]:
w = (X_train.h_True * 4 + 1).values
score = np.dot(w, np.abs(preds - y_train.values)) / w.sum()
print ("WMAE Value For XGBoost: ", score)

WMAE Value For XGBoost:  3217.820304103983


In [34]:
impordf = pd.DataFrame([[col, impo] for impo, col in zip(xgb1.feature_importances_.tolist(), X_train.columns)])

In [35]:
impordf.sort_values(1, ascending=False).head()

Unnamed: 0,0,1
2,CPI,0.216223
0,Temperature,0.119513
4,Size,0.109844
3,Unemployment,0.09271
1,Fuel_Price,0.079084


In [54]:
Xt = test_sample.drop(columns=['Weekly_Sales'])

In [55]:
yhat = xgb1.predict(Xt)

In [56]:
test_sample['Weekly_Sales'] = pd.Series(yhat)

In [57]:
def make_submission(x):
    return '{}_{}_{}'.format(x['Store'], x['Dept'], x['Date'].strftime('%Y-%m-%d'))

In [70]:
test_sample['Date'] = test['Date']
test_sample['Store'] = test['Store']
test_sample['Dept'] = test['Dept']

In [71]:
result = test_sample.apply(make_submission, axis=1)

In [74]:
result = pd.concat([result, test_sample['Weekly_Sales']], axis=1).rename(columns={0:'id'})

In [75]:
result.to_csv('data/sub.csv', index=False)

- Markdown1~5 데이터는 2011년 11월 4일 이전으로는 모두 null값으로 분석에서 제외했다
- Store, Dept, Type, IsHoliday, week, month를 onehotvector로 변경하여 새로운 변수를 추가했다
- Xgboost를 사용하여 기본적으로 분석해봤으나 결과는 너무 좋지 않다

score = 23451.53163 <br>
rate = 680 / 691 

### lag를 추가하여 다시한번 regression을 시도한다
- lag 4, 12, 26, 52 를 추가

In [236]:
test.Date.unique()

array(['2012-11-02', '2012-11-09', '2012-11-16', '2012-11-23',
       '2012-11-30', '2012-12-07', '2012-12-14', '2012-12-21',
       '2012-12-28', '2013-01-04', '2013-01-11', '2013-01-18',
       '2013-01-25', '2013-02-01', '2013-02-08', '2013-02-15',
       '2013-02-22', '2013-03-01', '2013-03-08', '2013-03-15',
       '2013-03-22', '2013-03-29', '2013-04-05', '2013-04-12',
       '2013-04-19', '2013-04-26', '2013-05-03', '2013-05-10',
       '2013-05-17', '2013-05-24', '2013-05-31', '2013-06-07',
       '2013-06-14', '2013-06-21', '2013-06-28', '2013-07-05',
       '2013-07-12', '2013-07-19', '2013-07-26'], dtype=object)

In [295]:
lag4 = merged.copy()
lag4.int_week += 4

In [296]:
lag4 = lag4.drop(columns=['IsHoliday','week','month','Type','Date','year',
                          'Size','MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4','MarkDown5'])

In [297]:
lag4 = lag4.rename(columns={ c:'{}_p4'.format(c) for c in lag4.columns if c not in ['Store','Dept','int_week']})

In [298]:
lag4.head()

Unnamed: 0,Store,Dept,Weekly_Sales_p4,Temperature_p4,Fuel_Price_p4,CPI_p4,Unemployment_p4,int_week
0,1,1,24924.5,42.31,2.572,211.096358,8.106,9
1,1,2,50605.27,42.31,2.572,211.096358,8.106,9
2,1,3,13740.12,42.31,2.572,211.096358,8.106,9
3,1,4,39954.04,42.31,2.572,211.096358,8.106,9
4,1,5,32229.38,42.31,2.572,211.096358,8.106,9


In [299]:
df = merged.drop(columns=['IsHoliday','week','month','Type','Date','year',
                          'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4','MarkDown5'])
hvec = pd.get_dummies(merged['IsHoliday'], prefix='h')
svec = pd.get_dummies(merged['Store'], prefix='s')
dvec = pd.get_dummies(merged['Dept'], prefix='d')
wvec = pd.get_dummies(merged['week'], prefix='w')
mvec = pd.get_dummies(merged['month'], prefix='m')
tvec = pd.get_dummies(merged['Type'], prefix='t')

In [300]:
tdf = pd.concat([df, hvec, svec, dvec, wvec, mvec, tvec], axis=1)

In [305]:
mlag = tdf.merge(lag4, how='inner', on=['int_week','Store','Dept'])

In [307]:
X = mlag.drop(columns=['Weekly_Sales'])
y = mlag['Weekly_Sales']

In [308]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [309]:
xgb = XGBRegressor(n_estimators=500, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7)
xgb.fit(X_train, y_train)
preds = xgb.predict(X_train)

In [232]:
w = (X_train.h_True * 4 + 1).values
score = np.dot(w, np.abs(preds - y_train.values)) / w.sum()
print ("WMAE Value For XGBoost: ", score)

WMAE Value For XGBoost:  2480.848035418852


In [313]:
impordf = pd.DataFrame([[col, impo] for impo, col in zip(xgb.feature_importances_.tolist(), X_train.columns)])

In [320]:
impordf.sort_values(1, ascending=False).head()

Unnamed: 0,0,1
203,Weekly_Sales_p4,0.17774
2,Temperature,0.085091
0,Store,0.056884
1,Dept,0.053179
3,Fuel_Price,0.047006


- lag4의 중요도가 가장 높다

In [319]:
merged_t

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Type,Size,week,month,year,int_week
0,1,1,2012-11-02,21467.876953,False,55.32,3.386,6766.44,5147.70,50.82,3639.90,2737.42,223.462779,6.573,A,151315,44,11,2012,148
1,1,2,2012-11-02,42491.878906,False,55.32,3.386,6766.44,5147.70,50.82,3639.90,2737.42,223.462779,6.573,A,151315,44,11,2012,148
2,1,3,2012-11-02,10866.918945,False,55.32,3.386,6766.44,5147.70,50.82,3639.90,2737.42,223.462779,6.573,A,151315,44,11,2012,148
3,1,4,2012-11-02,34289.167969,False,55.32,3.386,6766.44,5147.70,50.82,3639.90,2737.42,223.462779,6.573,A,151315,44,11,2012,148
4,1,5,2012-11-02,22878.363281,False,55.32,3.386,6766.44,5147.70,50.82,3639.90,2737.42,223.462779,6.573,A,151315,44,11,2012,148
5,1,6,2012-11-02,6838.884766,False,55.32,3.386,6766.44,5147.70,50.82,3639.90,2737.42,223.462779,6.573,A,151315,44,11,2012,148
6,1,7,2012-11-02,20804.560547,False,55.32,3.386,6766.44,5147.70,50.82,3639.90,2737.42,223.462779,6.573,A,151315,44,11,2012,148
7,1,8,2012-11-02,32551.882812,False,55.32,3.386,6766.44,5147.70,50.82,3639.90,2737.42,223.462779,6.573,A,151315,44,11,2012,148
8,1,9,2012-11-02,20209.263672,False,55.32,3.386,6766.44,5147.70,50.82,3639.90,2737.42,223.462779,6.573,A,151315,44,11,2012,148
9,1,10,2012-11-02,27872.023438,False,55.32,3.386,6766.44,5147.70,50.82,3639.90,2737.42,223.462779,6.573,A,151315,44,11,2012,148


In [None]:
merged_t['Weekly_Sales'] = pd.Series(yhat)