In [4]:
import pandas as pd

### 1. Data Loading

In [34]:
train_df = pd.read_csv('../data/raw/train.csv',parse_dates=['date'])
holiday_df = pd.read_csv('../data/raw/holidays_events.csv',parse_dates=['date'])
oil_df = pd.read_csv('../data/raw/oil.csv',parse_dates=['date'])
stores_df = pd.read_csv('../data/raw/stores.csv')
transactions_df = pd.read_csv('../data/raw/transactions.csv',parse_dates=['date'])
test_df = pd.read_csv('../data/raw/test.csv',parse_dates=['date'])

In [6]:
print("=== DATASET OVERVIEW ===")
print(f"Train dataset shape: {train_df.shape}")
print(f"Holiday dataset shape: {holiday_df.shape}")
print(f"Oil dataset shape: {oil_df.shape}")
print(f"Stores dataset shape: {stores_df.shape}")
print(f"Transactions dataset shape: {transactions_df.shape}")

=== DATASET OVERVIEW ===
Train dataset shape: (3000888, 6)
Holiday dataset shape: (350, 6)
Oil dataset shape: (1218, 2)
Stores dataset shape: (54, 5)
Transactions dataset shape: (83488, 3)


#### 1.1 train

In [7]:
# check for data types
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000888 entries, 0 to 3000887
Data columns (total 6 columns):
 #   Column       Dtype         
---  ------       -----         
 0   id           int64         
 1   date         datetime64[ns]
 2   store_nbr    int64         
 3   family       object        
 4   sales        float64       
 5   onpromotion  int64         
dtypes: datetime64[ns](1), float64(1), int64(3), object(1)
memory usage: 137.4+ MB


In [8]:
# TODO: sales range is very large, check for outliers
train_df.describe()

Unnamed: 0,id,date,store_nbr,sales,onpromotion
count,3000888.0,3000888,3000888.0,3000888.0,3000888.0
mean,1500444.0,2015-04-24 08:27:04.703088384,27.5,357.7757,2.60277
min,0.0,2013-01-01 00:00:00,1.0,0.0,0.0
25%,750221.8,2014-02-26 18:00:00,14.0,0.0,0.0
50%,1500444.0,2015-04-24 12:00:00,27.5,11.0,0.0
75%,2250665.0,2016-06-19 06:00:00,41.0,195.8473,0.0
max,3000887.0,2017-08-15 00:00:00,54.0,124717.0,741.0
std,866281.9,,15.58579,1101.998,12.21888


In [9]:
# get top 5 rows of max sales and top 5 rows of min sales
print("\n=== TOP 5 MAX SALES ===")
print(train_df.nlargest(5, 'sales'))
print("\n=== TOP 5 MIN SALES ===")
print(train_df.nsmallest(5, 'sales'))


=== TOP 5 MAX SALES ===
              id       date  store_nbr     family       sales  onpromotion
2163723  2163723 2016-05-02          2  GROCERY I  124717.000           59
2445984  2445984 2016-10-07         39      MEATS   89576.360            0
2144154  2144154 2016-04-21         20  GROCERY I   87438.516           53
2139699  2139699 2016-04-18         45  GROCERY I   76090.000           38
2153031  2153031 2016-04-26          2  GROCERY I   63434.000           30

=== TOP 5 MIN SALES ===
   id       date  store_nbr      family  sales  onpromotion
0   0 2013-01-01          1  AUTOMOTIVE    0.0            0
1   1 2013-01-01          1   BABY CARE    0.0            0
2   2 2013-01-01          1      BEAUTY    0.0            0
3   3 2013-01-01          1   BEVERAGES    0.0            0
4   4 2013-01-01          1       BOOKS    0.0            0


In [10]:
# check for duplicates
train_df.duplicated().sum()

np.int64(0)

In [11]:
# check for missing values
print("Missing values in train_df:\n", train_df.isnull().sum())

Missing values in train_df:
 id             0
date           0
store_nbr      0
family         0
sales          0
onpromotion    0
dtype: int64


In [12]:
# check for number of unique values
print("Unique values in train_df:\n", train_df.nunique())

Unique values in train_df:
 id             3000888
date              1684
store_nbr           54
family              33
sales           379610
onpromotion        362
dtype: int64


In [13]:
# check for negative values in 'sales'
print("Negative values in 'sales':", (train_df['sales'] < 0).sum())

Negative values in 'sales': 0


In [14]:
print(f"Date range: {train_df['date'].min()} to {train_df['date'].max()}")
print(f"Total days: {(train_df['date'].max() - train_df['date'].min()).days}")
print(f"Years covered: {sorted(train_df['date'].dt.year.unique())}")

Date range: 2013-01-01 00:00:00 to 2017-08-15 00:00:00
Total days: 1687
Years covered: [np.int32(2013), np.int32(2014), np.int32(2015), np.int32(2016), np.int32(2017)]


In [30]:
# check all dates are present
# TODO: handle missing dates
all_dates = pd.date_range(start=train_df['date'].min(), end=train_df['date'].max())
missing_dates = all_dates.difference(train_df['date'])
if missing_dates.empty:
    print("All dates are present in the train dataset.")
else:
    print(f"Missing dates in train dataset: {missing_dates}")

Missing dates in train dataset: DatetimeIndex(['2013-12-25', '2014-12-25', '2015-12-25', '2016-12-25'], dtype='datetime64[ns]', freq=None)


#### 1.2 holiday

In [15]:
# check for data types
holiday_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date         350 non-null    datetime64[ns]
 1   type         350 non-null    object        
 2   locale       350 non-null    object        
 3   locale_name  350 non-null    object        
 4   description  350 non-null    object        
 5   transferred  350 non-null    bool          
dtypes: bool(1), datetime64[ns](1), object(4)
memory usage: 14.1+ KB


In [16]:
holiday_df.describe()

Unnamed: 0,date
count,350
mean,2015-04-24 00:45:15.428571392
min,2012-03-02 00:00:00
25%,2013-12-23 06:00:00
50%,2015-06-08 00:00:00
75%,2016-07-03 00:00:00
max,2017-12-26 00:00:00


In [17]:
# check for duplicates
holiday_df.duplicated().sum()

np.int64(0)

In [18]:
# check for missing values
print("Missing values in holiday_df:\n", holiday_df.isnull().sum())

Missing values in holiday_df:
 date           0
type           0
locale         0
locale_name    0
description    0
transferred    0
dtype: int64


In [19]:
# check for number of unique values
print("Unique values in holiday_df:\n", holiday_df.nunique())

Unique values in holiday_df:
 date           312
type             6
locale           3
locale_name     24
description    103
transferred      2
dtype: int64


#### 1.3 oil

In [20]:
# check dtypes
oil_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1218 entries, 0 to 1217
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   date        1218 non-null   datetime64[ns]
 1   dcoilwtico  1175 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 19.2 KB


In [21]:
oil_df.describe()

Unnamed: 0,date,dcoilwtico
count,1218,1175.0
mean,2015-05-02 12:00:00,67.714366
min,2013-01-01 00:00:00,26.19
25%,2014-03-03 06:00:00,46.405
50%,2015-05-02 12:00:00,53.19
75%,2016-06-30 18:00:00,95.66
max,2017-08-31 00:00:00,110.62
std,,25.630476


In [22]:
# check for duplicates
oil_df.duplicated().sum()

np.int64(0)

In [23]:
# check for missing values
# TODO: handle missing values in oil_df
print("Missing values in oil_df:\n", oil_df.isnull().sum())

Missing values in oil_df:
 date           0
dcoilwtico    43
dtype: int64


In [24]:
# check for number of unique values
print("Unique values in oil_df:\n", oil_df.nunique())

Unique values in oil_df:
 date          1218
dcoilwtico     998
dtype: int64


In [32]:
# check all dates are present
# TODO: handle missing dates
all_dates = pd.date_range(start=oil_df['date'].min(), end=oil_df['date'].max())
missing_dates = all_dates.difference(oil_df['date'])
if missing_dates.empty:
    print("All dates are present in the train dataset.")
else:
    print(f"Missing dates in train dataset: {missing_dates}")

Missing dates in train dataset: DatetimeIndex(['2013-01-05', '2013-01-06', '2013-01-12', '2013-01-13',
               '2013-01-19', '2013-01-20', '2013-01-26', '2013-01-27',
               '2013-02-02', '2013-02-03',
               ...
               '2017-07-29', '2017-07-30', '2017-08-05', '2017-08-06',
               '2017-08-12', '2017-08-13', '2017-08-19', '2017-08-20',
               '2017-08-26', '2017-08-27'],
              dtype='datetime64[ns]', length=486, freq=None)


#### 1.4 stores

In [25]:
# check for data types
stores_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   store_nbr  54 non-null     int64 
 1   city       54 non-null     object
 2   state      54 non-null     object
 3   type       54 non-null     object
 4   cluster    54 non-null     int64 
dtypes: int64(2), object(3)
memory usage: 2.2+ KB


In [26]:
stores_df.describe()

Unnamed: 0,store_nbr,cluster
count,54.0,54.0
mean,27.5,8.481481
std,15.732133,4.693395
min,1.0,1.0
25%,14.25,4.0
50%,27.5,8.5
75%,40.75,13.0
max,54.0,17.0


In [27]:
# check for duplicates
stores_df.duplicated().sum()

np.int64(0)

In [28]:
# check for missing values
print("Missing values in stores_df:\n", stores_df.isnull().sum())

Missing values in stores_df:
 store_nbr    0
city         0
state        0
type         0
cluster      0
dtype: int64


In [29]:
# check for number of unique values
print("Unique values in stores_df:\n", stores_df.nunique())

Unique values in stores_df:
 store_nbr    54
city         22
state        16
type          5
cluster      17
dtype: int64


#### 1.5 test

In [35]:
# check for data types
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28512 entries, 0 to 28511
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           28512 non-null  int64         
 1   date         28512 non-null  datetime64[ns]
 2   store_nbr    28512 non-null  int64         
 3   family       28512 non-null  object        
 4   onpromotion  28512 non-null  int64         
dtypes: datetime64[ns](1), int64(3), object(1)
memory usage: 1.1+ MB


In [36]:
test_df.describe()

Unnamed: 0,id,date,store_nbr,onpromotion
count,28512.0,28512,28512.0,28512.0
mean,3015144.0,2017-08-23 12:00:00,27.5,6.965383
min,3000888.0,2017-08-16 00:00:00,1.0,0.0
25%,3008016.0,2017-08-19 18:00:00,14.0,0.0
50%,3015144.0,2017-08-23 12:00:00,27.5,0.0
75%,3022271.0,2017-08-27 06:00:00,41.0,6.0
max,3029399.0,2017-08-31 00:00:00,54.0,646.0
std,8230.85,,15.586057,20.683952


In [37]:
# check for duplicates
test_df.duplicated().sum()

np.int64(0)

In [38]:
# check for missing values
print("Missing values in stores_df:\n", test_df.isnull().sum())

Missing values in stores_df:
 id             0
date           0
store_nbr      0
family         0
onpromotion    0
dtype: int64


In [39]:
# check for number of unique values
print("Unique values in stores_df:\n", test_df.nunique())

Unique values in stores_df:
 id             28512
date              16
store_nbr         54
family            33
onpromotion      212
dtype: int64


In [40]:
# check all dates are present
all_dates = pd.date_range(start=test_df['date'].min(), end=test_df['date'].max())
missing_dates = all_dates.difference(test_df['date'])
if missing_dates.empty:
    print("All dates are present in the train dataset.")
else:
    print(f"Missing dates in train dataset: {missing_dates}")

All dates are present in the train dataset.
