In [1]:
# Connect Google Drive: mount it at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
# Directory on the mounted Drive that holds the competition CSV files.
# Keep the trailing slash: file paths are built elsewhere as DATA_PATH + '<file>.csv'.
DATA_PATH = '/content/drive/MyDrive/2023_Yonsei_IT/kaggle/Competition1/store-sales-time-series-forecasting/'

In [3]:
# 패키지 불러오기

# 기본패키지
import numpy as np
import pandas as pd
import os
import gc
import warnings
import calendar
import datetime
from pandas import date_range

# 데이터 분석 패키지
import statsmodels.api as sm
from pathlib import Path
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from statsmodels.tsa.stattools import adfuller
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.neighbors import KNeighborsRegressor
from statsmodels.graphics.tsaplots import plot_pacf
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.multioutput import RegressorChain

# 시각화 패키지
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.colors import ListedColormap
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
import plotly.offline as offline
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Make Kaggle's learntools package importable on Google Colab:
#   1. clone the repository (the checkout directory is named `learntools`),
#   2. rename that checkout to `learntools_dir`,
#   3. move the inner `learntools` package directory up into the working directory.
!git clone https://github.com/Kaggle/learntools.git
!mv learntools learntools_dir
!mv learntools_dir/learntools learntools
from learntools.deep_learning import decode_predictions
# NOTE(review): wildcard import - the names (and any import-time side effects) it brings in
# are not visible from this cell; confirm what the rest of the notebook relies on.
from learntools.time_series.style import *

# Display settings
pd.set_option('display.max_columns', None)  # show every column when a frame is displayed
# Fixed-point floats with two decimals instead of scientific notation.
# Mind the dot: '{:2f}' only sets a minimum field width of 2 and keeps six decimals.
pd.options.display.float_format = '{:.2f}'.format
warnings.filterwarnings('ignore')  # silence all warning messages

Cloning into 'learntools'...
remote: Enumerating objects: 18431, done.[K
remote: Counting objects: 100% (811/811), done.[K
remote: Compressing objects: 100% (422/422), done.[K
remote: Total 18431 (delta 461), reused 696 (delta 388), pack-reused 17620[K
Receiving objects: 100% (18431/18431), 105.10 MiB | 15.63 MiB/s, done.
Resolving deltas: 100% (13573/13573), done.


In [4]:
def sales_summury(values=(54, 33, 16, 56),
                  labels=("Stores", "Products", "States", "Months")):
  """Show a "Store Sales Summary" headline card built with Plotly.

  Each entry of ``values`` is drawn as a large bold number with the matching
  entry of ``labels`` underneath it, laid out left to right between two
  horizontal rules. Axes and legend are hidden.

  Args:
    values: Headline figures, one per column. Defaults to the counts that
      were previously hard-coded (54, 33, 16, 56).
    labels: Captions shown under each figure; must have the same length as
      ``values``.

  Returns:
    Whatever ``fig.show()`` returns (the figure is rendered as a side effect).

  Raises:
    ValueError: If ``values`` and ``labels`` differ in length.
  """
  if len(values) != len(labels):
    raise ValueError("values and labels must have the same length")

  # One x slot per column: 0, 1, 2, ...
  x_positions = list(range(len(labels)))
  headline_text = [
      f"<span style='font-size:33px'><b>{value}</b></span>" for value in values
  ]

  fig = go.Figure()
  # Upper row: the big numbers.
  fig.add_trace(go.Scatter(
      x=x_positions,
      y=[1.6] * len(x_positions),
      mode="text",
      text=headline_text,
      textposition="bottom center"
  ))
  # Lower row: the captions.
  fig.add_trace(go.Scatter(
      x=x_positions,
      y=[1.1] * len(x_positions),
      mode="text",
      text=list(labels),
      textposition="bottom center"
  ))
  # Horizontal rules above and below the text.
  fig.add_hline(y=2.2, line_width=5, line_color='gray')
  fig.add_hline(y=0.3, line_width=3, line_color='gray')
  fig.update_yaxes(visible=False)
  fig.update_xaxes(visible=False)
  fig.update_layout(showlegend=False, height=300, width=700,
                    title='Store Sales Summary', title_x=0.5, title_y=0.9,
                    # Same padding as before: [-0.5, 3.6] for four columns.
                    xaxis_range=[-0.5, len(x_positions) - 0.4], yaxis_range=[-0.2, 2.2],
                    plot_bgcolor='#fafafa', paper_bgcolor='#fafafa',
                    font=dict(size=23, color='#323232'),
                    title_font=dict(size=35, color='#222'),
                    margin=dict(t=90, l=70, b=0, r=70),
      )
  return fig.show()

In [5]:
# Render the "Store Sales Summary" card.
sales_summury()

In [6]:
# Load the competition tables from DATA_PATH.
train, test, stores, oil, holidays = (
    pd.read_csv(DATA_PATH + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'stores.csv', 'oil.csv', 'holidays_events.csv')
)

# Transactions are kept ordered by store number, then by date.
transactions = pd.read_csv(DATA_PATH + 'transactions.csv')
transactions = transactions.sort_values(by=["store_nbr", "date"])

In [7]:
# Build the analysis frame: left-join every auxiliary table onto train, give the two
# colliding `type` columns explicit names, then derive calendar columns from `date`.
# NOTE(review): a left merge repeats train rows wherever `holidays` has more than one
# row for the same date, which would weight those days more heavily in any average
# computed from df_train - confirm `holidays` is unique per date.
df_train = (
    train
    .merge(holidays, how='left', on='date')
    .merge(oil, how='left', on='date')
    .merge(stores, how='left', on='store_nbr')
    .merge(transactions, how='left', on=['date', 'store_nbr'])
    .rename(columns={"type_x": "holiday_type", "type_y": "store_type"})
    .assign(
        # `date` is converted first; the later entries read the converted column.
        date=lambda frame: pd.to_datetime(frame['date']),
        year=lambda frame: frame['date'].dt.year,
        month=lambda frame: frame['date'].dt.month,
        week=lambda frame: frame['date'].dt.isocalendar().week,
        quarter=lambda frame: frame['date'].dt.quarter,
        day_of_week=lambda frame: frame['date'].dt.day_name(),
    )
)

In [9]:
# Overview figure: average sales per month (horizontal bars), per quarter (donut)
# and per ISO week (filled line), combined in one 2x2 subplot grid.

# data
df_m_sa = df_train.groupby('month').agg({"sales" : "mean"}).reset_index()
df_m_sa['sales'] = df_m_sa['sales'].round(2)
df_m_sa['month_text'] = df_m_sa['month'].apply(lambda x: calendar.month_abbr[x])
df_m_sa['text'] = df_m_sa['month_text'] + ' - ' + df_m_sa['sales'].astype(str)

df_w_sa = df_train.groupby('week').agg({"sales" : "mean"}).reset_index()
df_q_sa = df_train.groupby('quarter').agg({"sales" : "mean"}).reset_index()

# chart color: every month in the light shade except the last row, which keeps the accent.
# A single .loc assignment replaces the chained `df['color'][:-1] = ...`, which writes
# through a temporary Series and is not guaranteed to reach the frame.
df_m_sa['color'] = '#496595'
df_m_sa.loc[df_m_sa.index[:-1], 'color'] = '#c6ccd8'
df_w_sa['color'] = '#c6ccd8'

# chart
fig = make_subplots(rows=2, cols=2, vertical_spacing=0.08,
                    row_heights=[0.7, 0.3],
                    specs=[[{"type": "bar"}, {"type": "pie"}],
                           [{"colspan": 2}, None]],
                    column_widths=[0.7, 0.3],
                    subplot_titles=("Month wise Avg Sales Analysis", "Quarter wise Avg Sales Analysis",
                                    "Week wise Avg Sales Analysis"))

fig.add_trace(go.Bar(x=df_m_sa['sales'], y=df_m_sa['month'], marker=dict(color= df_m_sa['color']),
                     text=df_m_sa['text'],textposition='auto',
                     name='Month', orientation='h'),
                     row=1, col=1)
fig.add_trace(go.Pie(values=df_q_sa['sales'], labels=df_q_sa['quarter'], name='Quarter',
                     marker=dict(colors=['#334668','#496595','#6D83AA','#91A2BF','#C8D0DF']), hole=0.7,
                     hoverinfo='label+percent+value', textinfo='label+percent'),
                     row=1, col=2)
fig.add_trace(go.Scatter(x=df_w_sa['week'], y=df_w_sa['sales'], mode='lines+markers', fill='tozeroy', fillcolor='#c6ccd8',
                     marker=dict(color= '#496595'), name='Week'),
                     row=2, col=1)

# styling
fig.update_yaxes(visible=False, row=1, col=1)
fig.update_xaxes(visible=False, row=1, col=1)
# Tick labels are derived from the same week values as the tick positions, so the two
# lists always have equal length (a fixed range(1, 53) falls one short if week 53 occurs).
fig.update_xaxes(tickmode = 'array', tickvals=df_w_sa['week'],
                 ticktext=[str(week_no) for week_no in df_w_sa['week']],
                 row=2, col=1)
fig.update_yaxes(visible=False, row=2, col=1)
fig.update_layout(height=750, bargap=0.15,
                  margin=dict(b=0,r=20,l=20),
                  title_text="Average Sales Analysis",
                  template="plotly_white",
                  title_font=dict(size=25, color='#8a8d93', family="Lato, sans-serif"),
                  font=dict(color='#8a8d93'),
                  hoverlabel=dict(bgcolor="#f2f2f2", font_size=13, font_family="Lato, sans-serif"),
                  showlegend=False)
fig.show()

In [67]:
# Monthly: horizontal bar chart of the mean sales per calendar month.
# data
df_m_sa = df_train.groupby('month').agg({"sales" : "mean"}).reset_index()
df_m_sa['sales'] = df_m_sa['sales'].round(2)
df_m_sa['month_text'] = df_m_sa['month'].apply(lambda x: calendar.month_abbr[x])
df_m_sa['text'] = df_m_sa['month_text'] + ' - ' + df_m_sa['sales'].astype(str)

# chart color: light shade everywhere except the last row, which keeps the accent colour.
# .loc writes into the frame directly; the chained `df['color'][:-1] = ...` form does not
# reliably do so.
df_m_sa['color'] = '#496595'
df_m_sa.loc[df_m_sa.index[:-1], 'color'] = '#c6ccd8'

# chart
# subplot_titles must be a sequence with one entry per subplot; a bare string is indexed
# character by character, so only its first letter would be used as the title.
fig = make_subplots(rows=1, cols=1,
                    specs=[[{"type": "bar"}]],
                    subplot_titles=("Monthly Avg Sales",))

fig.add_trace(go.Bar(x=df_m_sa['sales'], y=df_m_sa['month_text'], marker=dict(color= df_m_sa['color']),
                     text=df_m_sa['text'],textposition='auto',
                     name='Month', orientation='h'))

# styling
fig.update_yaxes(visible=True)
fig.update_xaxes(visible=True)
fig.update_layout(height=750, bargap=0.15,
                  margin=dict(b=0,r=20,l=20),
                  title_text="Average Sales Analysis",
                  template="plotly_white",
                  title_font=dict(size=25, color='#8a8d93', family="Lato, sans-serif"),
                  font=dict(color='#8a8d93'),
                  hoverlabel=dict(bgcolor="#f2f2f2", font_size=13, font_family="Lato, sans-serif"),
                  showlegend=False)
fig.show()

In [82]:
# Week: horizontal bar chart of the mean sales per day of the week.
# data
# groupby sorts its keys, so the rows come out ordered alphabetically by day name.
df_w_sa = df_train.groupby('day_of_week').agg({"sales" : "mean"}).reset_index()
df_w_sa['sales'] = df_w_sa['sales'].round(2)
df_w_sa['week_text'] = df_w_sa['day_of_week']
df_w_sa['text'] = df_w_sa['week_text'] + ' - ' + df_w_sa['sales'].astype(str)

# chart color: light shade by default; the rows at positions 3:-3 get the accent colour
# (with all seven day names present that is the single row at position 3).
# .loc writes into the frame directly; the chained `df['color'][3:-3] = ...` form does
# not reliably do so.
df_w_sa['color'] = '#c6ccd8'
df_w_sa.loc[df_w_sa.index[3:-3], 'color'] = '#496595'

# chart
# subplot_titles must be a sequence with one entry per subplot; a bare string is indexed
# character by character, so only its first letter would be used as the title.
fig = make_subplots(rows=1, cols=1,
                    specs=[[{"type": "bar"}]],
                    subplot_titles=("Weekly Avg Sales",))

fig.add_trace(go.Bar(x=df_w_sa['sales'], y=df_w_sa['week_text'], marker=dict(color= df_w_sa['color']),
                     text=df_w_sa['text'],textposition='auto',
                     name='day_of_week', orientation='h'))

# styling
fig.update_yaxes(visible=True)
fig.update_xaxes(visible=True)
fig.update_layout(height=750, bargap=0.15,
                  margin=dict(b=0,r=20,l=20),
                  title_text="Average Sales Analysis",
                  template="plotly_white",
                  title_font=dict(size=25, color='#8a8d93', family="Lato, sans-serif"),
                  font=dict(color='#8a8d93'),
                  hoverlabel=dict(bgcolor="#f2f2f2", font_size=13, font_family="Lato, sans-serif"),
                  showlegend=False)
fig.show()

In [66]:
# Quarter: horizontal bar chart of the mean sales per calendar quarter.
# data
df_q_sa = df_train.groupby('quarter').agg({"sales" : "mean"}).reset_index()
df_q_sa['sales'] = df_q_sa['sales'].round(2)
df_q_sa['text'] ='Q' + df_q_sa['quarter'].astype(str)  + ' - ' + df_q_sa['sales'].astype(str)

# chart color: light shade everywhere except the last row, which keeps the accent colour.
# .loc writes into the frame directly; the chained `df['color'][:-1] = ...` form does not
# reliably do so.
df_q_sa['color'] = '#496595'
df_q_sa.loc[df_q_sa.index[:-1], 'color'] = '#c6ccd8'

# chart
# subplot_titles must be a sequence with one entry per subplot; a bare string is indexed
# character by character, so only its first letter would be used as the title.
fig = make_subplots(rows=1, cols=1,
                    specs=[[{"type": "bar"}]],
                    subplot_titles=("Quarter Avg Sales",))

fig.add_trace(go.Bar(x=df_q_sa['sales'], y=df_q_sa['quarter'], marker=dict(color= df_q_sa['color']),
                     text=df_q_sa['text'],textposition='auto',
                     name='quarter', orientation='h'))

# styling
fig.update_yaxes(visible=True, dtick=1)  # one tick per quarter number
fig.update_xaxes(visible=True)
fig.update_layout(height=750, bargap=0.15,
                  margin=dict(b=0,r=20,l=20),
                  title_text="Average Sales Analysis",
                  template="plotly_white",
                  title_font=dict(size=25, color='#8a8d93', family="Lato, sans-serif"),
                  font=dict(color='#8a8d93'),
                  hoverlabel=dict(bgcolor="#f2f2f2", font_size=13, font_family="Lato, sans-serif"),
                  showlegend=False)
fig.show()

In [102]:
# Store: horizontal bar chart of the mean sales per store number.
# data
df_s_sa = df_train.groupby('store_nbr').agg({"sales" : "mean"}).reset_index()
df_s_sa['sales'] = df_s_sa['sales'].round(2)
df_s_sa['text'] ='Store NO.' + df_s_sa['store_nbr'].astype(str)  + ' - ' + df_s_sa['sales'].astype(str)

# chart color: light shade by default; the row at position 43 gets the accent colour.
# NOTE(review): 43 is a hard-coded row position in the sorted group result - confirm it
# is still the store meant to be highlighted if the data changes.
# .loc writes into the frame directly; the chained `df['color'][43:44] = ...` form does
# not reliably do so.
df_s_sa['color'] = '#c6ccd8'
df_s_sa.loc[df_s_sa.index[43:44], 'color'] = '#496595'

# chart
# subplot_titles must be a sequence with one entry per subplot; a bare string is indexed
# character by character, so only its first letter would be used as the title.
fig = make_subplots(rows=1, cols=1,
                    specs=[[{"type": "bar"}]],
                    subplot_titles=("Store Avg Sales",))

fig.add_trace(go.Bar(x=df_s_sa['sales'], y=df_s_sa['store_nbr'], marker=dict(color= df_s_sa['color']),
                     text=df_s_sa['text'],textposition='auto',
                     name='store_nbr', orientation='h'))

# styling
fig.update_yaxes(visible=True, dtick=1)  # one tick per store number
fig.update_xaxes(visible=True)
fig.update_layout(height=750, bargap=0.15,
                  margin=dict(b=0,r=20,l=20),
                  title_text="Average Sales Analysis",
                  template="plotly_white",
                  title_font=dict(size=25, color='#8a8d93', family="Lato, sans-serif"),
                  font=dict(color='#8a8d93'),
                  hoverlabel=dict(bgcolor="#f2f2f2", font_size=13, font_family="Lato, sans-serif"),
                  showlegend=False)
fig.show()

In [101]:
# City: horizontal bar chart of the mean sales per city.
# data
df_s_sa = df_train.groupby('city').agg({"sales" : "mean"}).reset_index()
df_s_sa['sales'] = df_s_sa['sales'].round(2)
df_s_sa['text'] =df_s_sa['city'].astype(str)  + ' - ' + df_s_sa['sales'].astype(str)

# chart color: light shade by default; the row at position 18 gets the accent colour.
# NOTE(review): 18 is a hard-coded row position in the alphabetically sorted group
# result - confirm it is still the city meant to be highlighted if the data changes.
# .loc writes into the frame directly; the chained `df['color'][18:19] = ...` form does
# not reliably do so.
df_s_sa['color'] = '#c6ccd8'
df_s_sa.loc[df_s_sa.index[18:19], 'color'] = '#496595'

# chart
# subplot_titles must be a sequence with one entry per subplot; a bare string is indexed
# character by character, so only its first letter would be used as the title.
# The title names the city grouping this chart actually shows.
fig = make_subplots(rows=1, cols=1,
                    specs=[[{"type": "bar"}]],
                    subplot_titles=("City Avg Sales",))

fig.add_trace(go.Bar(x=df_s_sa['sales'], y=df_s_sa['city'], marker=dict(color= df_s_sa['color']),
                     text=df_s_sa['text'],textposition='auto',
                     name='city', orientation='h'))

# styling
fig.update_yaxes(visible=True)
fig.update_xaxes(visible=True)
fig.update_layout(height=750, bargap=0.15,
                  margin=dict(b=0,r=20,l=20),
                  title_text="Average Sales Analysis",
                  template="plotly_white",
                  title_font=dict(size=25, color='#8a8d93', family="Lato, sans-serif"),
                  font=dict(color='#8a8d93'),
                  hoverlabel=dict(bgcolor="#f2f2f2", font_size=13, font_family="Lato, sans-serif"),
                  showlegend=False)
fig.show()