In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the training data, the test data and the sample submission in one
# unpacking; the order of the paths matches the order of the target names.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/train.csv',
        '../input/test.csv',
        '../input/sample_submission.csv',
    )
)

In [None]:
# Print the column/dtype/non-null summary of the sample submission frame.
submission.info()

In [None]:
# Show the first rows of the sample submission to see the expected output layout.
submission.head()

In [None]:
# Print the column/dtype/non-null summary of the training frame.
train.info()

In [None]:
# Show the first rows of the training frame.
train.head()

In [None]:
# Summary statistics for the numeric columns of the training frame.
train.describe()

In [None]:
# Print the column/dtype/non-null summary of the test frame.
test.info()
# test.head()

In [None]:
# plot monthly sales per store
#
# Builds:
#   dates          - unique 'YYYY-MM' keys, in order of first appearance
#   all_storesales - {store id: [total sales per entry of `dates`]}
#
# The month key is derived once; the previous version re-sliced every date
# string in the frame for each (store, month) pair.
month_keys = train['date'].str[:-3]  # drop the trailing '-DD'
dates = month_keys.unique()

# A single grouped pass yields every store/month total. Reindexing keeps the
# original contract: stores 1-10, months ordered like `dates`, and a total of
# 0 for any combination that has no rows.
monthly_sales_table = (
    train.groupby(['store', month_keys])['sales']
    .sum()
    .unstack(fill_value=0)
    .reindex(index=range(1, 11), columns=dates, fill_value=0)
)

all_storesales = {}
for store_id in range(1, 11):
    print('calculating store', store_id)
    all_storesales[store_id] = monthly_sales_table.loc[store_id].tolist()

In [None]:
# One figure per store: monthly sales totals over the training period.
for store_id in range(1, 11):
    fig, ax = plt.subplots(figsize=(20, 10))
    ax.plot(dates, all_storesales[store_id])
    ax.set_title('Store ' + str(store_id) + ' Sales', fontsize=30)
    # Month labels are long; rotate them so they do not overlap.
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# plot total sales per item, per store
#
# Builds:
#   items         - the item ids 1..50
#   all_itemsales - {store id: [total sales of item 1, item 2, ..., item 50]}
#
# One grouped pass replaces 500 separate boolean filters over the full frame.
items = range(1, 51)

# Reindexing guarantees every store 1-10 and item 1-50 is present, with a
# total of 0 where the training data has no rows (matching an empty .sum()).
item_sales_table = (
    train.groupby(['store', 'item'])['sales']
    .sum()
    .unstack(fill_value=0)
    .reindex(index=range(1, 11), columns=list(items), fill_value=0)
)

all_itemsales = {}
for store_id in range(1, 11):
    print('calculating store', store_id)
    all_itemsales[store_id] = item_sales_table.loc[store_id].tolist()

In [None]:
# Average total sales per item across the ten stores (element-wise over the
# per-store item totals).
mean_item_sales = np.sum(
    [all_itemsales[store_id] for store_id in range(1, 11)], axis=0
) / 10

# One bar chart per store, with the cross-store average overlaid in red.
for store_id in range(1, 11):
    fig, ax = plt.subplots(figsize=(20, 10))
    ax.bar(items, all_itemsales[store_id])
    ax.plot(items, mean_item_sales, color='red')
    ax.set_title('Store ' + str(store_id) + ' Sales By Item', fontsize=30)
    ax.set_xticks(items)
    plt.show()

# Basic Linear Projection

In [None]:
# Project 2018 monthly sales per store by linear extrapolation.
#
# For each store and calendar month, take that month's total in each of the
# years 2013-2017, compute the mean year-over-year change, and add it to the
# 2017 value.
#
# Builds:
#   dates_projected          - ['2018-01', ..., '2018-12']
#   all_storesales_projected - {store id: [projected total for Jan..Dec 2018]}
dates_projected = ['2018-{:02d}'.format(mo) for mo in range(1, 13)]

history_years = range(2013, 2018)
history_keys = [
    '{}-{:02d}'.format(year, month)
    for year in history_years
    for month in range(1, 13)
]

# Aggregate once instead of filtering the full frame for every
# (store, month, year) triple. Missing combinations count as 0 sales, which is
# what summing an empty selection produced before.
monthly_history = (
    train.groupby(['store', train['date'].str[:-3]])['sales']
    .sum()
    .unstack(fill_value=0)
    .reindex(index=range(1, 11), columns=history_keys, fill_value=0)
)

all_storesales_projected = {}
for store_id in range(1, 11):
    print('calculating store', store_id)
    all_storesales_projected[store_id] = []
    for month in range(1, 13):
        # sales for the same calendar month in each past year, oldest first
        month_pts = [
            monthly_history.at[store_id, '{}-{:02d}'.format(year, month)]
            for year in history_years
        ]
        # Mean of the consecutive year-over-year differences. The previous
        # loop indexed month_pts[idx-1] while enumerating month_pts[1:], which
        # wrapped around to the last element on the first step and compared
        # points two years apart afterwards.
        mean_diff = np.diff(month_pts).mean()
        next_pt = month_pts[-1] + mean_diff
        all_storesales_projected[store_id].append(next_pt)

In [None]:
# One figure per store: historical monthly sales with the 2018 projection
# appended in red.
for store_id in range(1, 11):
    fig, ax = plt.subplots(figsize=(20, 10))
    ax.plot(dates, all_storesales[store_id])
    ax.plot(dates_projected, all_storesales_projected[store_id], color='red')
    ax.set_title('Store ' + str(store_id) + ' Sales', fontsize=30)
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# print(all_storesales_projected)
# print(all_itemsales)

In [None]:
# Turn the projected monthly store totals into a daily per-item prediction:
#   projected store total for the month
#   * the item's share of that store's historical sales
#   / number of days in the month
# NOTE(review): 28.25 for February presumably averages leap and non-leap
# years - confirm this is intended for the prediction period.
days_in_month = { 1: 31, 2: 28.25, 3: 31, 4: 30, 5: 31, 6: 30, 7: 31, 8: 31, 9: 30, 10: 31, 11: 30, 12: 31 }

# Each store's total is the same for every row, so compute it once rather
# than re-summing the item list for every test row.
store_sales_totals = {
    store_id: sum(item_totals) for store_id, item_totals in all_itemsales.items()
}

# Month number taken from the 'YYYY-MM-DD' date string.
test_months = test['date'].str.split('-').str[1].astype(int)

predicted_sales = []
# Iterate over plain column values; iterrows() builds a Series per row.
for month, store_id, item in zip(test_months, test['store'], test['item']):
    total_month_sales_projected = all_storesales_projected[store_id][month-1]

    item_sales_fraction = float(all_itemsales[store_id][item-1]) / store_sales_totals[store_id]

    item_sales_projected = total_month_sales_projected*item_sales_fraction / days_in_month[month]
    predicted_sales.append(item_sales_projected)

submission['sales'] = predicted_sales
submission.to_csv('submission_basic.csv', index=False)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Feature frame for the tree model: replace the 'YYYY-MM-DD' date string with
# integer year / month / day columns.
train_date_parts = train['date'].str.split('-', expand=True).astype('int64')

train_rf = train.drop(columns='date').assign(
    year=train_date_parts[0],
    month=train_date_parts[1],
    day=train_date_parts[2],
)

In [None]:
# Fit a random forest on every non-target column of train_rf.
# random_state pins the bootstrap/feature sampling so a re-run reproduces the
# same predictions; n_jobs=-1 builds the trees on all available cores.
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(train_rf.drop('sales',axis=1), train_rf['sales'])

In [None]:
# Give the test frame the same year / month / day features as train_rf, then
# drop the raw date string and the row id so only model inputs remain.
test_date_parts = test['date'].str.split('-', expand=True).astype('int64')

test = test.assign(
    year=test_date_parts[0],
    month=test_date_parts[1],
    day=test_date_parts[2],
).drop(columns=['date', 'id'])

In [None]:
# Check the columns and dtypes of the test frame before it is passed to the model.
test.info()

In [None]:
# Predict sales for every test row with the fitted model and print the
# resulting array.
pred = model.predict(test)
print(pred)

In [None]:
# Overwrite the 'sales' column of the shared submission frame with the current
# predictions and write it out without the index column.
submission['sales'] = pred
submission.to_csv('submission_rf.csv', index=False)

# Linear Regression

In [None]:
import sklearn
from sklearn.linear_model import LinearRegression

def custom_normalize(df):
    """Scale each column of ``df`` to unit norm.

    Thin wrapper around ``sklearn.preprocessing.normalize`` with ``axis=0``,
    i.e. every feature (column) is normalized independently.

    Args:
        df: 2-D array-like / DataFrame of numeric features.

    Returns:
        Whatever ``sklearn.preprocessing.normalize`` returns for ``df``.
    """
    # Import the submodule explicitly: a bare `import sklearn` does not
    # guarantee that `sklearn.preprocessing` has been loaded.
    from sklearn.preprocessing import normalize

    return normalize(df, axis=0)

#one hot
store_dummies_df = pd.get_dummies(train_rf['store'])
store_dummies_df.columns = [ 'store'+str(col) for col in store_dummies_df.columns ]

item_dummies_df = pd.get_dummies(train_rf['item'])
item_dummies_df.columns = [ 'item'+str(col) for col in item_dummies_df.columns ]

train_lr = pd.concat([store_dummies_df, item_dummies_df, train_rf.drop(['store','item','sales'], axis=1)], axis=1)
# train_lr = custom_normalize(train_lr)

In [None]:
# Fit ordinary least squares on the one-hot design matrix and print the
# learned intercept and coefficients.
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, so passing it raises TypeError on current releases. Dropping it keeps
# the cell runnable on old and new versions alike.
# NOTE(review): for plain least squares the rescaling is not expected to change
# the predictions - confirm if exact parity with older runs matters.
model = LinearRegression()
model.fit(train_lr, train_rf['sales'])
print(model.intercept_, model.coef_)

In [None]:
#one hot
store_dummies_df = pd.get_dummies(test['store'])
store_dummies_df.columns = [ 'store'+str(col) for col in store_dummies_df.columns ]

item_dummies_df = pd.get_dummies(test['item'])
item_dummies_df.columns = [ 'item'+str(col) for col in item_dummies_df.columns ]

test_lr = pd.concat([store_dummies_df, item_dummies_df, test.drop(['store','item'], axis=1)], axis=1)

# test_lr = custom_normalize(test_lr)

pred = model.predict(test_lr)

In [None]:
# print(test_norm)
# Show the predictions followed by their smallest and largest value.
print(pred)
lowest_pred, highest_pred = min(pred), max(pred)
print(lowest_pred, highest_pred)
# train_rf
# train_norm = custom_normalize(train_rf)
# train_norm[:-10]
# print('The scikit-learn version is {}.'.format(sklearn.__version__))

In [None]:
# print(pred[:100])
# Overwrite the 'sales' column of the shared submission frame with the current
# predictions and write it out without the index column.
submission['sales'] = pred
submission.to_csv('submission_lr.csv', index=False)

In [None]:
# standardize not normalize
import sklearn

In [None]:
# Display the installed scikit-learn version string.
sklearn.__version__