In [2]:
import warnings
warnings.filterwarnings("ignore")

# loading packages
# basic + dates 
import numpy as np
import pandas as pd
from pandas import datetime


# data visualization
import matplotlib.pyplot as plt
import seaborn as sns # advanced vizs
%matplotlib inline

# statistics
from statsmodels.distributions.empirical_distribution import ECDF

#SKlearn Modelling and data transform

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor


# time series analysis
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# prophet by Facebook
# from fbprophet import Prophet

In [None]:
https://www.kaggle.com/cast42/xgboost-in-python-with-rmspe-v2
https://www.kaggle.com/elenapetrova/time-series-analysis-and-forecasts-with-prophet

In [None]:
train = pd.read_csv("/kaggle/input/rossmann-store-sales/train.csv", low_memory=False)
test = pd.read_csv("/kaggle/input/rossmann-store-sales/test.csv", low_memory=False)
store = pd.read_csv("/kaggle/input/rossmann-store-sales/store.csv", low_memory=False)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

### Join store data with Train and Test 

In [None]:
train = pd.merge(train, store, on='Store')
test = pd.merge(test, store, on='Store')

In [None]:
print("Data types in Train Column\n")
print(train.dtypes)
print("\nInformation on Train Column\n")
print(train.info())
print("\nSum of Null values in Train Column\n")
print(train.isnull().sum())

Data types in Train Column

Store                          int64
DayOfWeek                      int64
Date                          object
Sales                          int64
Customers                      int64
Open                           int64
Promo                          int64
StateHoliday                  object
SchoolHoliday                  int64
StoreType                     object
Assortment                    object
CompetitionDistance          float64
CompetitionOpenSinceMonth    float64
CompetitionOpenSinceYear     float64
Promo2                         int64
Promo2SinceWeek              float64
Promo2SinceYear              float64
PromoInterval                 object
dtype: object

Information on Train Column

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1017209 entries, 0 to 1017208
Data columns (total 18 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   Store                      1

In [None]:
print("First five rows of our train data\n")
print(train.head())
print("\nStatistical description for train numerical data\n")
print(train.describe())

First five rows of our train data

   Store  DayOfWeek        Date  Sales  Customers  Open  Promo StateHoliday  \
0      1          5  2015-07-31   5263        555     1      1            0   
1      1          4  2015-07-30   5020        546     1      1            0   
2      1          3  2015-07-29   4782        523     1      1            0   
3      1          2  2015-07-28   5011        560     1      1            0   
4      1          1  2015-07-27   6102        612     1      1            0   

   SchoolHoliday StoreType Assortment  CompetitionDistance  \
0              1         c          a               1270.0   
1              1         c          a               1270.0   
2              1         c          a               1270.0   
3              1         c          a               1270.0   
4              1         c          a               1270.0   

   CompetitionOpenSinceMonth  CompetitionOpenSinceYear  Promo2  \
0                        9.0                    200

In [None]:
def create_feature(dataset):
    """
    The create feature function would help create features using the date
    column and also other features 
    """
    
    #i would map some categorical variable to numerical if
    #that variable present it is replaced
    mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}
    dataset.StoreType.replace(mappings, inplace=True)
    dataset.Assortment.replace(mappings, inplace=True)
    dataset.StateHoliday.replace(mappings, inplace=True)

    #converting some categorical variables to object
    categ =['DayOfWeek','Open','Promo','StateHoliday','SchoolHoliday']
    for i in categ:
        dataset = dataset.astype({i:'object'})
    
    
    #convert date column to datetime
    dataset['Date']= pd.to_datetime(dataset.Date)

    #Feature creation
    dataset['Year'] = dataset.Date.dt.year
    dataset['Month'] = dataset.Date.dt.month
    dataset['Day'] = dataset.Date.dt.day
    dataset['DayOfWeek'] = dataset.Date.dt.dayofweek
    dataset['WeekOfYear'] = dataset.Date.dt.weekofyear
    dataset = dataset.set_index('Date')
    dataset = dataset.sort_index()
    return dataset


In [None]:
train = create_feature(train)

In [None]:
X = train.drop('')

In [None]:
train.dtypes

Unnamed: 0_level_0,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,...,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Year,Month,Day,WeekOfYear
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-01,1115,1,0,0,0,0,1,1,4,3,...,,,1,22.0,2012.0,"Mar,Jun,Sept,Dec",2013,1,1,1
2013-01-01,746,1,0,0,0,0,1,1,4,3,...,2.0,2011.0,1,35.0,2011.0,"Mar,Jun,Sept,Dec",2013,1,1,1
2013-01-01,171,1,0,0,0,0,1,1,1,1,...,,,0,,,,2013,1,1,1
2013-01-01,694,1,0,0,0,0,1,1,1,3,...,11.0,2012.0,1,40.0,2014.0,"Jan,Apr,Jul,Oct",2013,1,1,1
2013-01-01,396,1,0,0,0,0,1,1,1,3,...,,,0,,,,2013,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-07-31,612,4,8161,493,1,1,0,0,4,3,...,11.0,2012.0,1,31.0,2009.0,"Jan,Apr,Jul,Oct",2015,7,31,31
2015-07-31,235,4,6756,548,1,1,0,1,1,1,...,3.0,2012.0,1,37.0,2009.0,"Jan,Apr,Jul,Oct",2015,7,31,31
2015-07-31,1078,4,9732,646,1,1,0,1,4,3,...,,,1,40.0,2011.0,"Jan,Apr,Jul,Oct",2015,7,31,31
2015-07-31,845,4,5151,378,1,1,0,1,4,1,...,11.0,2005.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct",2015,7,31,31


In [None]:
def preprocess(dataset, label):
    
    """
    The preprocess function takes as primary argument the data 
    and peform the following stepwise transformations to it:
    
    1. impute missing values of numerical and categorical columns 
    using median and constant values respectively
    
    2. scales dataset using the RobustScaler (robust to outlier values present in this dataset)
    
    3. Encodes categorical values to numerical values
    """
    
    # split data to numerical
    numeric_features = dataset.select_dtypes(include=[
        'int64', 'float64']).columns
    
    for n in numeric_feature:
        dataset = dataset.astype({n:'float64'})
        
    
    
    #split data to categories
    categorical_features = dataset.select_dtypes(include=[
        'object']).columns
    
    

    
    # build pipeline to proprocess
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean'))
        ('scaler', StandardScaler())])
    # for categorical variable create new category called missing
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant',
                                  fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])
    
    #transform columns we do not need to use linear transformation for 
    #this since we are feeding this to random forest
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])
    
    rf = Pipeline([
        ('preprocess', preprocessor),
        ('classifier', RandomForestRegressor(random_state=42))
    
    

SyntaxError: unexpected EOF while parsing (<ipython-input-12-afad1701a89b>, line 72)

In [None]:
preprocessor= preprocess(train)

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 Index(['Store', 'Sales', 'Customers', 'StoreType', 'Assortment',
       'CompetitionDistance', 'CompetitionOpenSinceMonth',
       'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek',
       'Promo2SinceYear'],
      dtype='object')),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value='missing',
                                                                strategy='constant')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 I

In [None]:
train.dtypes

Store                          int64
DayOfWeek                      int64
Date                          object
Sales                          int64
Customers                      int64
Open                           int64
Promo                          int64
StateHoliday                   int64
SchoolHoliday                  int64
StoreType                      int64
Assortment                     int64
CompetitionDistance          float64
CompetitionOpenSinceMonth    float64
CompetitionOpenSinceYear     float64
Promo2                         int64
Promo2SinceWeek              float64
Promo2SinceYear              float64
PromoInterval                 object
dtype: object