In [21]:
# Data manipulation
# ==============================================================================
import pandas as pd
import numpy as np
from pathlib import Path

# DateTime
# ==============================================================================
import datetime as dt

# Plots
# ==============================================================================
import matplotlib 
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
plt.rcParams['lines.linewidth'] = 1.5
%matplotlib inline

# Sklearn
# ==============================================================================
from sklearn import preprocessing
from sklearn import model_selection
from math import sqrt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

# Seaborn
# ==============================================================================
import seaborn as sns

# Modeling and Forecasting
# ==============================================================================
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.ForecasterAutoregCustom import ForecasterAutoregCustom
from skforecast.ForecasterAutoregDirect import ForecasterAutoregDirect
from skforecast.model_selection import grid_search_forecaster
from skforecast.model_selection import backtesting_forecaster
from skforecast.utils import save_forecaster
from skforecast.utils import load_forecaster

# Warnings configuration
# ==============================================================================
import warnings
# warnings.filterwarnings('ignore')

### Load the dataset.

In [2]:
# Load the CSV file as a Pandas DataFrame and preview the DataFrame.
nordstrom_df = pd.read_csv('Resources/nordstrom_product_raw_data.csv')

# Print shape and info of DataFrame.
print(nordstrom_df.shape)
print(nordstrom_df.info())
                           
# Preview DataFrame.
nordstrom_df.head(10)

  nordstrom_df = pd.read_csv('Resources/nordstrom_product_raw_data.csv')


(288409, 15)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 288409 entries, 0 to 288408
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   DEPARTMENT        288409 non-null  object 
 1   CATEGORY          288409 non-null  object 
 2   SUBCATEGORY       288409 non-null  object 
 3   SKU               288409 non-null  int64  
 4   PRODUCT_NAME      288392 non-null  object 
 5   BRAND             288295 non-null  object 
 6   GENDER            288409 non-null  object 
 7   PRICE_RETAIL      288409 non-null  float64
 8   PRICE_CURRENT     288409 non-null  float64
 9   REVIEW_RATING     288409 non-null  float64
 10  REVIEW_COUNT      288409 non-null  int64  
 11  PROMOTION         16382 non-null   object 
 12  COLOR             288409 non-null  object 
 13  RunDate           288409 non-null  object 
 14  InsertUpdateTime  288409 non-null  object 
dtypes: float64(3), int64(2), object(10)
memory usage: 33.0+

Unnamed: 0,DEPARTMENT,CATEGORY,SUBCATEGORY,SKU,PRODUCT_NAME,BRAND,GENDER,PRICE_RETAIL,PRICE_CURRENT,REVIEW_RATING,REVIEW_COUNT,PROMOTION,COLOR,RunDate,InsertUpdateTime
0,Women,Clothing,"Blazers, Suits & Separates",4578258,Etiennette B Good Wool Suit Jacket,Theory,Women,345.0,345.0,4.1,85,,498,2022-05-15 08:03:21,2022-05-15 08:19:11
1,Women,Clothing,"Blazers, Suits & Separates",4954210,Demitria 2 Stretch Good Wool Suit Pants,Theory,Women,215.0,215.0,4.2,71,,20,2022-05-15 08:03:21,2022-05-15 08:19:11
2,Women,Clothing,"Blazers, Suits & Separates",5125568,Bermuda Shorts,Vince,Women,175.0,175.0,4.0,41,,1,2022-05-15 08:03:21,2022-05-15 08:19:11
3,Women,Clothing,"Blazers, Suits & Separates",5184553,Drape Collar Knit Blazer,Caslon®,Women,69.0,69.0,4.3,590,,401,2022-05-15 08:03:21,2022-05-15 08:19:11
4,Women,Clothing,"Blazers, Suits & Separates",5268077,Grace Jacket,NIC+ZOE,Women,148.0,148.0,3.9,51,,4,2022-05-15 08:03:21,2022-05-15 08:19:11
5,Women,Clothing,"Blazers, Suits & Separates",5323310,Pintuck Stretch Crepe Skinny Pants,Vince Camuto,Women,89.0,89.0,3.5,56,,6,2022-05-15 08:03:21,2022-05-15 08:19:11
6,Women,Clothing,"Blazers, Suits & Separates",5353634,Flare Ponte Pants,SPANX®,Women,148.0,148.0,3.8,19,,1,2022-05-15 08:03:21,2022-05-15 08:19:11
7,Women,Clothing,"Blazers, Suits & Separates",5436249,Nina Notched Collar Blazer,Vince Camuto,Women,129.0,129.0,4.0,30,,1,2022-05-15 08:03:21,2022-05-15 08:19:11
8,Women,Clothing,"Blazers, Suits & Separates",5727231,Everyday Cotton Blend Blazer,Nordstrom,Women,129.0,129.0,0.0,0,,1,2022-05-15 08:03:21,2022-05-15 08:19:11
9,Women,Clothing,"Blazers, Suits & Separates",5743270,Twill Blazer,Halogen®,Women,99.0,99.0,3.0,3,,1,2022-05-15 08:03:21,2022-05-15 08:19:11


In [3]:
# View duplicates in the DataFrame.
nordstrom_df.duplicated().sum()

72300

In [4]:
# View the rows that are duplicated.
nordstrom_df[nordstrom_df.duplicated()]

Unnamed: 0,DEPARTMENT,CATEGORY,SUBCATEGORY,SKU,PRODUCT_NAME,BRAND,GENDER,PRICE_RETAIL,PRICE_CURRENT,REVIEW_RATING,REVIEW_COUNT,PROMOTION,COLOR,RunDate,InsertUpdateTime
199538,Women,Maternity,Tops,5899844,Linen Button-Up Maternity Shirt,Angel Maternity,Women,69.95,69.95,0.0,0,,650,2022-05-15 08:03:21,2022-05-16 10:04:31
199539,Women,Maternity,Tops,5899844,Linen Button-Up Maternity Shirt,Angel Maternity,Women,69.95,69.95,0.0,0,,650,2022-05-15 08:03:21,2022-05-16 10:04:31
199540,Women,Maternity,Tops,5899844,Linen Button-Up Maternity Shirt,Angel Maternity,Women,69.95,69.95,0.0,0,,650,2022-05-15 08:03:21,2022-05-16 10:04:31
199541,Women,Maternity,Tops,5899844,Linen Button-Up Maternity Shirt,Angel Maternity,Women,69.95,69.95,0.0,0,,650,2022-05-15 08:03:21,2022-05-16 10:04:31
199542,Women,Maternity,Tops,5899844,Linen Button-Up Maternity Shirt,Angel Maternity,Women,69.95,69.95,0.0,0,,650,2022-05-15 08:03:21,2022-05-16 10:04:31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288396,Women,Clothing,Tops,6755174,Women's Green Michigan State Spartans Loud n P...,SPIRIT JERSEY,Women,69.99,69.99,0.0,0,,355,2022-05-15 08:03:21,2022-05-16 21:38:10
288398,Women,Clothing,Tops,6716488,Women's Soft as a Grape Royal Texas Rangers Mu...,SOFT AS A GRAPE,Women,27.99,27.99,0.0,0,,431,2022-05-15 08:03:21,2022-05-16 21:38:10
288399,Women,Clothing,Tops,6716488,Women's Soft as a Grape Royal Texas Rangers Mu...,SOFT AS A GRAPE,Women,27.99,27.99,0.0,0,,431,2022-05-15 08:03:21,2022-05-16 21:38:10
288403,Women,Clothing,Tops,6752852,Women's Pressbox Black Tennessee Volunteers Wi...,PRESSBOX,Women,31.99,31.99,0.0,0,,001,2022-05-15 08:03:21,2022-05-16 21:38:50


In [5]:
# Drop duplicate values with keeping the first iteration.
nordstrom_df = nordstrom_df.drop_duplicates()

#Verify.
nordstrom_df.duplicated().sum()

0

In [6]:
# For loop to find unique values in the DataFrame.
for i in nordstrom_df.columns:
    print(i, nordstrom_df[i].nunique())

DEPARTMENT 1
CATEGORY 7
SUBCATEGORY 81
SKU 129343
PRODUCT_NAME 114112
BRAND 2029
GENDER 1
PRICE_RETAIL 2127
PRICE_CURRENT 3900
REVIEW_RATING 42
REVIEW_COUNT 789
PROMOTION 75
COLOR 963
RunDate 1
InsertUpdateTime 9072


In [7]:
# Reorder columns. 
nordstrom_df = nordstrom_df[['RunDate','InsertUpdateTime','DEPARTMENT','CATEGORY','SUBCATEGORY','PRODUCT_NAME','SKU','BRAND','GENDER','PRICE_RETAIL','PRICE_CURRENT','REVIEW_RATING','REVIEW_COUNT','PROMOTION','COLOR']]

# Preview DataFrame.
nordstrom_df.head(10)

Unnamed: 0,RunDate,InsertUpdateTime,DEPARTMENT,CATEGORY,SUBCATEGORY,PRODUCT_NAME,SKU,BRAND,GENDER,PRICE_RETAIL,PRICE_CURRENT,REVIEW_RATING,REVIEW_COUNT,PROMOTION,COLOR
0,2022-05-15 08:03:21,2022-05-15 08:19:11,Women,Clothing,"Blazers, Suits & Separates",Etiennette B Good Wool Suit Jacket,4578258,Theory,Women,345.0,345.0,4.1,85,,498
1,2022-05-15 08:03:21,2022-05-15 08:19:11,Women,Clothing,"Blazers, Suits & Separates",Demitria 2 Stretch Good Wool Suit Pants,4954210,Theory,Women,215.0,215.0,4.2,71,,20
2,2022-05-15 08:03:21,2022-05-15 08:19:11,Women,Clothing,"Blazers, Suits & Separates",Bermuda Shorts,5125568,Vince,Women,175.0,175.0,4.0,41,,1
3,2022-05-15 08:03:21,2022-05-15 08:19:11,Women,Clothing,"Blazers, Suits & Separates",Drape Collar Knit Blazer,5184553,Caslon®,Women,69.0,69.0,4.3,590,,401
4,2022-05-15 08:03:21,2022-05-15 08:19:11,Women,Clothing,"Blazers, Suits & Separates",Grace Jacket,5268077,NIC+ZOE,Women,148.0,148.0,3.9,51,,4
5,2022-05-15 08:03:21,2022-05-15 08:19:11,Women,Clothing,"Blazers, Suits & Separates",Pintuck Stretch Crepe Skinny Pants,5323310,Vince Camuto,Women,89.0,89.0,3.5,56,,6
6,2022-05-15 08:03:21,2022-05-15 08:19:11,Women,Clothing,"Blazers, Suits & Separates",Flare Ponte Pants,5353634,SPANX®,Women,148.0,148.0,3.8,19,,1
7,2022-05-15 08:03:21,2022-05-15 08:19:11,Women,Clothing,"Blazers, Suits & Separates",Nina Notched Collar Blazer,5436249,Vince Camuto,Women,129.0,129.0,4.0,30,,1
8,2022-05-15 08:03:21,2022-05-15 08:19:11,Women,Clothing,"Blazers, Suits & Separates",Everyday Cotton Blend Blazer,5727231,Nordstrom,Women,129.0,129.0,0.0,0,,1
9,2022-05-15 08:03:21,2022-05-15 08:19:11,Women,Clothing,"Blazers, Suits & Separates",Twill Blazer,5743270,Halogen®,Women,99.0,99.0,3.0,3,,1


In [8]:
# Determine value counts on Promotion column.
nordstrom_df['PROMOTION'].value_counts()

40% off    4451
60% off    2260
30% off    1784
50% off    1364
20% off     715
           ... 
2% off        2
1% off        2
5% off        2
71% off       1
8% off        1
Name: PROMOTION, Length: 75, dtype: int64

In [9]:
# Removing characters "% off" from the Promotion column.
nordstrom_df['PROMOTION'] = nordstrom_df['PROMOTION'].str.replace("% off","")

# Preview DataFrame.
nordstrom_df.head(10)

Unnamed: 0,RunDate,InsertUpdateTime,DEPARTMENT,CATEGORY,SUBCATEGORY,PRODUCT_NAME,SKU,BRAND,GENDER,PRICE_RETAIL,PRICE_CURRENT,REVIEW_RATING,REVIEW_COUNT,PROMOTION,COLOR
0,2022-05-15 08:03:21,2022-05-15 08:19:11,Women,Clothing,"Blazers, Suits & Separates",Etiennette B Good Wool Suit Jacket,4578258,Theory,Women,345.0,345.0,4.1,85,,498
1,2022-05-15 08:03:21,2022-05-15 08:19:11,Women,Clothing,"Blazers, Suits & Separates",Demitria 2 Stretch Good Wool Suit Pants,4954210,Theory,Women,215.0,215.0,4.2,71,,20
2,2022-05-15 08:03:21,2022-05-15 08:19:11,Women,Clothing,"Blazers, Suits & Separates",Bermuda Shorts,5125568,Vince,Women,175.0,175.0,4.0,41,,1
3,2022-05-15 08:03:21,2022-05-15 08:19:11,Women,Clothing,"Blazers, Suits & Separates",Drape Collar Knit Blazer,5184553,Caslon®,Women,69.0,69.0,4.3,590,,401
4,2022-05-15 08:03:21,2022-05-15 08:19:11,Women,Clothing,"Blazers, Suits & Separates",Grace Jacket,5268077,NIC+ZOE,Women,148.0,148.0,3.9,51,,4
5,2022-05-15 08:03:21,2022-05-15 08:19:11,Women,Clothing,"Blazers, Suits & Separates",Pintuck Stretch Crepe Skinny Pants,5323310,Vince Camuto,Women,89.0,89.0,3.5,56,,6
6,2022-05-15 08:03:21,2022-05-15 08:19:11,Women,Clothing,"Blazers, Suits & Separates",Flare Ponte Pants,5353634,SPANX®,Women,148.0,148.0,3.8,19,,1
7,2022-05-15 08:03:21,2022-05-15 08:19:11,Women,Clothing,"Blazers, Suits & Separates",Nina Notched Collar Blazer,5436249,Vince Camuto,Women,129.0,129.0,4.0,30,,1
8,2022-05-15 08:03:21,2022-05-15 08:19:11,Women,Clothing,"Blazers, Suits & Separates",Everyday Cotton Blend Blazer,5727231,Nordstrom,Women,129.0,129.0,0.0,0,,1
9,2022-05-15 08:03:21,2022-05-15 08:19:11,Women,Clothing,"Blazers, Suits & Separates",Twill Blazer,5743270,Halogen®,Women,99.0,99.0,3.0,3,,1


In [10]:
# Determine value counts on Promotion column.
nordstrom_df['PROMOTION'].value_counts()

40    4451
60    2260
30    1784
50    1364
20     715
      ... 
2        2
1        2
5        2
71       1
8        1
Name: PROMOTION, Length: 75, dtype: int64

In [11]:
# Replace NaN Values with Zeros in the DataFrame.
nordstrom_df['PROMOTION'] = nordstrom_df['PROMOTION'].fillna(0)

# Preview DataFrame.
nordstrom_df.head(10)

Unnamed: 0,RunDate,InsertUpdateTime,DEPARTMENT,CATEGORY,SUBCATEGORY,PRODUCT_NAME,SKU,BRAND,GENDER,PRICE_RETAIL,PRICE_CURRENT,REVIEW_RATING,REVIEW_COUNT,PROMOTION,COLOR
0,2022-05-15 08:03:21,2022-05-15 08:19:11,Women,Clothing,"Blazers, Suits & Separates",Etiennette B Good Wool Suit Jacket,4578258,Theory,Women,345.0,345.0,4.1,85,0,498
1,2022-05-15 08:03:21,2022-05-15 08:19:11,Women,Clothing,"Blazers, Suits & Separates",Demitria 2 Stretch Good Wool Suit Pants,4954210,Theory,Women,215.0,215.0,4.2,71,0,20
2,2022-05-15 08:03:21,2022-05-15 08:19:11,Women,Clothing,"Blazers, Suits & Separates",Bermuda Shorts,5125568,Vince,Women,175.0,175.0,4.0,41,0,1
3,2022-05-15 08:03:21,2022-05-15 08:19:11,Women,Clothing,"Blazers, Suits & Separates",Drape Collar Knit Blazer,5184553,Caslon®,Women,69.0,69.0,4.3,590,0,401
4,2022-05-15 08:03:21,2022-05-15 08:19:11,Women,Clothing,"Blazers, Suits & Separates",Grace Jacket,5268077,NIC+ZOE,Women,148.0,148.0,3.9,51,0,4
5,2022-05-15 08:03:21,2022-05-15 08:19:11,Women,Clothing,"Blazers, Suits & Separates",Pintuck Stretch Crepe Skinny Pants,5323310,Vince Camuto,Women,89.0,89.0,3.5,56,0,6
6,2022-05-15 08:03:21,2022-05-15 08:19:11,Women,Clothing,"Blazers, Suits & Separates",Flare Ponte Pants,5353634,SPANX®,Women,148.0,148.0,3.8,19,0,1
7,2022-05-15 08:03:21,2022-05-15 08:19:11,Women,Clothing,"Blazers, Suits & Separates",Nina Notched Collar Blazer,5436249,Vince Camuto,Women,129.0,129.0,4.0,30,0,1
8,2022-05-15 08:03:21,2022-05-15 08:19:11,Women,Clothing,"Blazers, Suits & Separates",Everyday Cotton Blend Blazer,5727231,Nordstrom,Women,129.0,129.0,0.0,0,0,1
9,2022-05-15 08:03:21,2022-05-15 08:19:11,Women,Clothing,"Blazers, Suits & Separates",Twill Blazer,5743270,Halogen®,Women,99.0,99.0,3.0,3,0,1


In [12]:
# Determine value counts on Promotion column.
nordstrom_df['PROMOTION'].value_counts()

# Print shape and info of DataFrame.
print(nordstrom_df.shape)
print(nordstrom_df.info())

(216109, 15)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 216109 entries, 0 to 288408
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   RunDate           216109 non-null  object 
 1   InsertUpdateTime  216109 non-null  object 
 2   DEPARTMENT        216109 non-null  object 
 3   CATEGORY          216109 non-null  object 
 4   SUBCATEGORY       216109 non-null  object 
 5   PRODUCT_NAME      216092 non-null  object 
 6   SKU               216109 non-null  int64  
 7   BRAND             216044 non-null  object 
 8   GENDER            216109 non-null  object 
 9   PRICE_RETAIL      216109 non-null  float64
 10  PRICE_CURRENT     216109 non-null  float64
 11  REVIEW_RATING     216109 non-null  float64
 12  REVIEW_COUNT      216109 non-null  int64  
 13  PROMOTION         216109 non-null  object 
 14  COLOR             216109 non-null  object 
dtypes: float64(3), int64(2), object(10)
memory usage: 26.4+

In [13]:
# Covert Promotion column from object to int.
nordstrom_df['PROMOTION'] = nordstrom_df['PROMOTION'].astype('int64')

# Divide Promotion column by 100 to convert the new int values as a percentage.
nordstrom_df['PROMOTION'] = nordstrom_df['PROMOTION']/100

In [14]:
# See if there any unique values in the SKU column.
nordstrom_df['SKU'].value_counts()

# See if there any unique values in the PRODUCT_NAME column.
nordstrom_df['PRODUCT_NAME'].value_counts()

# Print value counts of selected columns. 
print(nordstrom_df['SKU'].value_counts(),nordstrom_df['PRODUCT_NAME'].value_counts())

3091339    27
3395135    24
5886302    20
6775636    20
5228873    19
           ..
6806509     1
6806952     1
6834352     1
6837459     1
5092685     1
Name: SKU, Length: 129343, dtype: int64 Initial Pendant Necklace            57
Logo Slide Sandal                   54
Slide Sandal                        52
Cubic Zirconia Stud Earrings        52
Hoop Earrings                       50
                                    ..
Women's Yin Yang Cotton Shorts       1
Holmden Sweater Shorts               1
Maya High Waist Shorts               1
Aldi Belted High Waist Shorts        1
Qaravan Platform Slip-On Sneaker     1
Name: PRODUCT_NAME, Length: 114112, dtype: int64


In [15]:
# Drop RunDate and InsertUpdateTime columns. Since they are not unique, they will not be used for ML purposes or Time series forecasting.
nordstrom_df = nordstrom_df.drop(['RunDate', 'InsertUpdateTime'], axis=1)

# View updated DataFrame. 
nordstrom_df.head(10)

Unnamed: 0,DEPARTMENT,CATEGORY,SUBCATEGORY,PRODUCT_NAME,SKU,BRAND,GENDER,PRICE_RETAIL,PRICE_CURRENT,REVIEW_RATING,REVIEW_COUNT,PROMOTION,COLOR
0,Women,Clothing,"Blazers, Suits & Separates",Etiennette B Good Wool Suit Jacket,4578258,Theory,Women,345.0,345.0,4.1,85,0.0,498
1,Women,Clothing,"Blazers, Suits & Separates",Demitria 2 Stretch Good Wool Suit Pants,4954210,Theory,Women,215.0,215.0,4.2,71,0.0,20
2,Women,Clothing,"Blazers, Suits & Separates",Bermuda Shorts,5125568,Vince,Women,175.0,175.0,4.0,41,0.0,1
3,Women,Clothing,"Blazers, Suits & Separates",Drape Collar Knit Blazer,5184553,Caslon®,Women,69.0,69.0,4.3,590,0.0,401
4,Women,Clothing,"Blazers, Suits & Separates",Grace Jacket,5268077,NIC+ZOE,Women,148.0,148.0,3.9,51,0.0,4
5,Women,Clothing,"Blazers, Suits & Separates",Pintuck Stretch Crepe Skinny Pants,5323310,Vince Camuto,Women,89.0,89.0,3.5,56,0.0,6
6,Women,Clothing,"Blazers, Suits & Separates",Flare Ponte Pants,5353634,SPANX®,Women,148.0,148.0,3.8,19,0.0,1
7,Women,Clothing,"Blazers, Suits & Separates",Nina Notched Collar Blazer,5436249,Vince Camuto,Women,129.0,129.0,4.0,30,0.0,1
8,Women,Clothing,"Blazers, Suits & Separates",Everyday Cotton Blend Blazer,5727231,Nordstrom,Women,129.0,129.0,0.0,0,0.0,1
9,Women,Clothing,"Blazers, Suits & Separates",Twill Blazer,5743270,Halogen®,Women,99.0,99.0,3.0,3,0.0,1


In [16]:
# Rename columns using a mapping.
nordstrom_df.rename({'DEPARTMENT':'Department','CATEGORY':'Category','SUBCATEGORY':'Subcategory','PRODUCT_NAME':'Product Name'
          ,'SKU':'Stock-Keeping Unit (SKU)','BRAND':'Brand','GENDER':'Gender','PRICE_RETAIL':'Retail Price','PRICE_CURRENT':'Current Price'
          ,'REVIEW_RATING':'Review Rating','REVIEW_COUNT':'Review Count','PROMOTION':'Promotion','COLOR':'Color'}, axis=1, inplace=True)

# Preview updated DataFrame. 
nordstrom_df

Unnamed: 0,Department,Category,Subcategory,Product Name,Stock-Keeping Unit (SKU),Brand,Gender,Retail Price,Current Price,Review Rating,Review Count,Promotion,Color
0,Women,Clothing,"Blazers, Suits & Separates",Etiennette B Good Wool Suit Jacket,4578258,Theory,Women,345.00,345.00,4.1,85,0.0,498
1,Women,Clothing,"Blazers, Suits & Separates",Demitria 2 Stretch Good Wool Suit Pants,4954210,Theory,Women,215.00,215.00,4.2,71,0.0,020
2,Women,Clothing,"Blazers, Suits & Separates",Bermuda Shorts,5125568,Vince,Women,175.00,175.00,4.0,41,0.0,001
3,Women,Clothing,"Blazers, Suits & Separates",Drape Collar Knit Blazer,5184553,Caslon®,Women,69.00,69.00,4.3,590,0.0,401
4,Women,Clothing,"Blazers, Suits & Separates",Grace Jacket,5268077,NIC+ZOE,Women,148.00,148.00,3.9,51,0.0,004
...,...,...,...,...,...,...,...,...,...,...,...,...,...
288402,Women,Clothing,Tops,Women's Pressbox Black Tennessee Volunteers Wi...,6752852,PRESSBOX,Women,31.99,31.99,0.0,0,0.0,001
288404,Women,Clothing,Tops,Women's Pressbox Gray Tennessee Volunteers Spa...,6697914,PRESSBOX,Women,39.99,39.99,0.0,0,0.0,020
288405,Women,Clothing,Tops,Women's Pressbox Gray Tennessee Volunteers Spa...,6697914,PRESSBOX,Women,44.99,44.99,0.0,0,0.0,020
288406,Women,Clothing,Tops,Women's Red/White Maryland Terrapins Ombre Spi...,6700243,SPIRIT JERSEY,Women,64.99,64.99,0.0,0,0.0,600


In [17]:
# Descriptive statistics. 
# Ignore Stock-Keeping Unit (SKU) for it is a product identifer relating to the Product Name column.
nordstrom_df.describe()

Unnamed: 0,Stock-Keeping Unit (SKU),Retail Price,Current Price,Review Rating,Review Count,Promotion
count,216109.0,216109.0,216109.0,216109.0,216109.0,216109.0
mean,6247794.0,434.395834,426.630567,1.27658,10.526452,0.031459
std,666788.8,1123.460101,1121.328311,1.950997,128.954016,0.115203
min,2763447.0,5.0,3.5,0.0,0.0,0.0
25%,5871594.0,68.0,60.0,0.0,0.0,0.0
50%,6526636.0,129.95,124.0,0.0,0.0,0.0
75%,6731609.0,348.0,330.0,3.3,1.0,0.0
max,7023883.0,49850.0,49850.0,5.0,11766.0,0.8


In [18]:
# Import pandas interactive HTML report.
import pandas_profiling

nordstrom_profile = nordstrom_df.profile_report(title='Nordstrom Products Profiling Report')
nordstrom_profile.to_file(output_file='nordstrom_pandas_profiling_html')

visions.backends INFO  Pandas backend loaded 1.5.1
visions.backends INFO  Numpy backend loaded 1.21.5
visions.backends INFO  Pyspark backend NOT loaded
visions.backends INFO  Python backend loaded


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

  return func(*args, **kwargs)
  return func(*args, **kwargs)
  (2 * xtie * ytie) / m + x0 * y0 / (9 * m * (size - 2)))


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [19]:
# Saving cleaned file to csv for Tableau purposes.
nordstrom_df.to_csv('Resources/cleaned_nordstrom_data.csv')