# Import all packages/libraries.

In [159]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Data Preprocessing

Read the raw data file.

In [160]:
# Load the synthetic retail sales file exactly as it is stored on disk.
raw_df = pd.read_csv('./data/retail_sales_synthetic.csv')
# Every later step works on this deep copy, so `raw_df` itself is never modified.
df = raw_df.copy(deep=True)

In [161]:
# Printing df.head() or df.describe() in one piece hides the middle columns,
# so the number of columns is stored here and the frame is printed in slices instead.
n_col = df.shape[1]

In [162]:
# Show the first rows in two halves so that no column is hidden by truncation.
half = n_col // 2
preview = df.head()
print('\nInitial Dataframe Head:')
print(preview.iloc[:, :half])
print(preview.iloc[:, half:])


Initial Dataframe Head:
         date  store_id store_type region    city  store_area_sqft product_id  \
0  2024-09-05  store_09          C   East  city_9             2287   prod_031   
1  2022-10-24  store_02          C   East  city_2             2627   prod_041   
2  2023-04-19  store_06          B   West  city_6             2547   prod_022   
3  2024-06-22  store_06          B   West  city_6             2547   prod_037   
4  2024-07-20  store_02          C   East  city_2             2627   prod_018   

   category  base_price  final_price  discount_pct  
0      Home       43.01        43.33           0.0  
1    Sports      121.31        86.51          30.0  
2  Clothing       11.10        11.05           0.0  
3    Beauty      272.28       284.06           0.0  
4      Home       56.11        56.58           0.0  
   promotion  is_holiday  day_of_week  weekend  units_sold  returns  \
0          0           0            3        0           8        1   
1          0           1    

In [163]:
# Check the raw data information before preprocessing.
print('\nInitial Dataframe Information:')
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line to the output.
df.info()


Initial Dataframe Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164400 entries, 0 to 164399
Data columns (total 22 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   date             164400 non-null  object 
 1   store_id         164400 non-null  object 
 2   store_type       164400 non-null  object 
 3   region           164400 non-null  object 
 4   city             164400 non-null  object 
 5   store_area_sqft  164400 non-null  int64  
 6   product_id       164400 non-null  object 
 7   category         164400 non-null  object 
 8   base_price       164400 non-null  float64
 9   final_price      164400 non-null  float64
 10  discount_pct     164400 non-null  float64
 11  promotion        164400 non-null  int64  
 12  is_holiday       164400 non-null  int64  
 13  day_of_week      164400 non-null  int64  
 14  weekend          164400 non-null  int64  
 15  units_sold       164400 non-null  int64  
 16  return

In [164]:
# Summary statistics of the numeric columns before preprocessing.
# The table is computed once and printed in two slices to avoid truncation.
numeric_stats = df.describe()
split_at = n_col // 3
print('\nInitial Dataframe Numeric Stats:')
print(numeric_stats.iloc[:, :split_at])
print(numeric_stats.iloc[:, split_at:])


Initial Dataframe Numeric Stats:
       store_area_sqft     base_price    final_price   discount_pct  \
count    164400.000000  164400.000000  164400.000000  164400.000000   
mean       2440.700000      48.682199      46.781269       3.884519   
std         317.638196      49.139032      47.473471       7.073051   
min        1734.000000       5.800000       3.860000       0.000000   
25%        2287.000000      17.750000      16.880000       0.000000   
50%        2570.000000      31.030000      31.100000       0.000000   
75%        2611.000000      59.100000      56.670000       5.000000   
max        2904.000000     272.280000     293.690000      30.000000   

           promotion     is_holiday    day_of_week  
count  164400.000000  164400.000000  164400.000000  
mean        0.079453       0.013686       3.000000  
std         0.270445       0.116185       2.002286  
min         0.000000       0.000000       0.000000  
25%         0.000000       0.000000       1.000000  
50%     

As shown above, the data is already clean and ready to use.

# EDA (Exploratory Data Analysis)

## 1. Does the presence of a holiday affect overall sales and revenue, both daily and monthly?

Get the necessary columns from the preprocessed data.

In [165]:
# Only the date, the holiday flag and the two net measures are needed here; ID columns are left out.
# Working on a copy keeps every change made in this section away from the main data.
holiday_cols = ['date', 'is_holiday', 'net_units', 'net_revenue']
df_1 = df.loc[:, holiday_cols].copy()
print(df_1.head())

         date  is_holiday  net_units  net_revenue
0  2024-09-05           0          7       303.32
1  2022-10-24           1          5       432.56
2  2023-04-19           0          2        22.09
3  2024-06-22           0          1       284.06
4  2024-07-20           0          2       113.16


### Holiday Effect Towards Daily Sales and Revenue

Sum sales (units) and revenue of all stores and products each day.

In [166]:
# Daily totals over all stores and products.
df_1 = df_1.groupby('date').sum()
# The sum also added up the per-row holiday flags. For day-by-day data the flag is a
# yes/no marker, so any positive count is set back to 1.
holiday_days = df_1['is_holiday'] > 0
df_1.loc[holiday_days, 'is_holiday'] = 1
print(df_1.head())

            is_holiday  net_units  net_revenue
date                                          
2022-01-01           0        633     32625.87
2022-01-02           0        636     31625.80
2022-01-03           0        574     30254.72
2022-01-04           0        532     26296.65
2022-01-05           0        568     25624.26


Find the correlation coefficients between the variables (`is_holiday`, `net_units`, and `net_revenue`) to analyze the effect of holidays on daily sales and revenue.

In [167]:
# Pairwise correlation matrix of the daily holiday flag, units and revenue.
daily_corr_cols = ['is_holiday', 'net_units', 'net_revenue']
corr_1_1 = df_1.loc[:, daily_corr_cols].corr()
print(corr_1_1)

             is_holiday  net_units  net_revenue
is_holiday     1.000000   0.169136     0.141445
net_units      0.169136   1.000000     0.919873
net_revenue    0.141445   0.919873     1.000000


Based on the results, the presence of a holiday is only weakly correlated with the overall daily sales (about 0.17) and revenue (about 0.14), so it doesn't significantly affect them.

### Holiday Effect Towards Monthly Sales and Revenue

Every date in the current data is already unique, so we can drop the day from the `date` values to support the monthly analysis.

In [168]:
# Reduce every 'YYYY-MM-DD' label to 'YYYY-MM' for the monthly analysis.
# The pattern is anchored on the full date shape and keeps the 'YYYY-MM' part through a
# backreference, so running this cell again leaves 'YYYY-MM' labels unchanged instead of
# cutting them down to 'YYYY' (which the unanchored r'-\d{2}$' pattern did on a re-run).
df_1.index = df_1.index.astype(str).str.replace(r'^(\d{4}-\d{2})-\d{2}$', r'\1', regex=True)
print(df_1.head())

         is_holiday  net_units  net_revenue
date                                       
2022-01           0        633     32625.87
2022-01           0        636     31625.80
2022-01           0        574     30254.72
2022-01           0        532     26296.65
2022-01           0        568     25624.26


Group (sum) the numeric data by `date` to obtain the monthly net sales and revenue.

In [169]:
# Monthly totals. Here `is_holiday` is deliberately kept as a sum rather than a flag:
# it represents the number of holiday days in each month.
df_1 = df_1.groupby(level=0).sum()
print(df_1.head())

         is_holiday  net_units  net_revenue
date                                       
2022-01           1      18498    860972.53
2022-02           0      16240    743945.03
2022-03           1      18163    833257.90
2022-04           0      17575    830541.04
2022-05           0      18032    811823.17


Find the correlation coefficients between variables (`is_holiday`, `net_units`, and `net_revenue`).

In [170]:
# Pairwise correlation matrix of the monthly holiday-day count, units and revenue.
monthly_corr_cols = ['is_holiday', 'net_units', 'net_revenue']
corr_1_2 = df_1.loc[:, monthly_corr_cols].corr()
print(corr_1_2)

             is_holiday  net_units  net_revenue
is_holiday     1.000000   0.218785     0.222019
net_units      0.218785   1.000000     0.994426
net_revenue    0.222019   0.994426     1.000000


Based on the results, the presence of holidays doesn't significantly affect the overall monthly sales and revenue. However, the correlation (about 0.22) is slightly stronger than in the daily analysis in the previous section.

## 2. Is there any change in the product category trend between no-holiday months and holiday months?

Get the necessary columns from the preprocessed data.

In [171]:
# Only the date, the holiday flag, the category and the net units are needed here; ID columns are left out.
# Working on a copy keeps every change made in this section away from the main data.
category_cols = ['date', 'is_holiday', 'category', 'net_units']
df_2 = df.loc[:, category_cols].copy()
print(df_2.head())

         date  is_holiday  category  net_units
0  2024-09-05           0      Home          7
1  2022-10-24           1    Sports          5
2  2023-04-19           0  Clothing          2
3  2024-06-22           0    Beauty          1
4  2024-07-20           0      Home          2


Every date in the current data is already paired with a category, so we can drop the day from the `date` values to support the monthly analysis.

In [172]:
# Reduce every 'YYYY-MM-DD' value to 'YYYY-MM' for the monthly analysis.
# The pattern is anchored on the full date shape and keeps the 'YYYY-MM' part through a
# backreference, so running this cell again leaves 'YYYY-MM' values unchanged instead of
# cutting them down to 'YYYY' (which the unanchored r'-\d{2}$' pattern did on a re-run).
df_2['date'] = df_2['date'].astype(str).str.replace(r'^(\d{4}-\d{2})-\d{2}$', r'\1', regex=True)
print(df_2.head())

      date  is_holiday  category  net_units
0  2024-09           0      Home          7
1  2022-10           1    Sports          5
2  2023-04           0  Clothing          2
3  2024-06           0    Beauty          1
4  2024-07           0      Home          2


Sum `net_units` by month and product category while keeping the `is_holiday` property.

In [173]:
# Totals of `is_holiday` and `net_units` for every (month, category) pair.
month_category = ['date', 'category']
df_2 = df_2.groupby(month_category, as_index=False)[['is_holiday', 'net_units']].sum()
# This section needs `is_holiday` as a yes/no marker for the month,
# so any positive sum is set back to 1.
holiday_months = df_2['is_holiday'] > 0
df_2.loc[holiday_months, 'is_holiday'] = 1
print(df_2.head())

      date     category  is_holiday  net_units
0  2022-01       Beauty           1       1670
1  2022-01     Clothing           1       6177
2  2022-01  Electronics           1       3936
3  2022-01         Home           1       5372
4  2022-01       Sports           1       1343


Keep only the best-selling product category for each month.

In [174]:
# For every month, look up the row label with the largest `net_units` and keep just those rows.
top_row_labels = df_2.groupby('date')['net_units'].idxmax()
df_2 = df_2.loc[top_row_labels]
print(df_2.head())

       date  category  is_holiday  net_units
1   2022-01  Clothing           1       6177
6   2022-02  Clothing           0       5343
11  2022-03  Clothing           1       6203
16  2022-04  Clothing           0       6069
21  2022-05  Clothing           0       6079


Take the mode of the best-selling product category across all no-holiday months and across all holiday months.

In [175]:
# Most frequent top category within the no-holiday months (0) and the holiday months (1).
# mode() can return several values; the first one is taken.
df_2 = df_2.groupby('is_holiday')['category'].agg(lambda cats: cats.mode().iloc[0])
print(df_2.head())

is_holiday
0    Clothing
1    Clothing
Name: category, dtype: object


As seen in the two latest outputs, there's no change in the product category trend between no-holiday months and holiday months. In both cases the leading product category is Clothing.

## 3. Does the weekend status affect overall daily sales and revenue?

Get the necessary columns from the preprocessed data.

In [176]:
# Only the date, the weekend flag and the two net measures are needed here; ID columns are left out.
# Working on a copy keeps every change made in this section away from the main data.
weekend_cols = ['date', 'weekend', 'net_units', 'net_revenue']
df_3 = df.loc[:, weekend_cols].copy()
print(df_3.head())

         date  weekend  net_units  net_revenue
0  2024-09-05        0          7       303.32
1  2022-10-24        0          5       432.56
2  2023-04-19        0          2        22.09
3  2024-06-22        1          1       284.06
4  2024-07-20        1          2       113.16


Sum sales (units) and revenue of all stores and products each day.

In [177]:
# Daily totals over all stores and products.
df_3 = df_3.groupby('date').sum()
# The sum also added up the per-row weekend flags. For day-by-day data the flag is a
# yes/no marker, so any positive count is set back to 1.
weekend_days = df_3['weekend'] > 0
df_3.loc[weekend_days, 'weekend'] = 1
print(df_3.head())

            weekend  net_units  net_revenue
date                                       
2022-01-01        1        633     32625.87
2022-01-02        1        636     31625.80
2022-01-03        0        574     30254.72
2022-01-04        0        532     26296.65
2022-01-05        0        568     25624.26


Find the correlation coefficients between variables (`weekend`, `net_units`, and `net_revenue`) to analyze the effect of weekend to daily sales and revenue.

In [186]:
# Pairwise correlation matrix of the daily weekend flag, units and revenue.
weekend_corr_cols = ['weekend', 'net_units', 'net_revenue']
corr_2 = df_3.loc[:, weekend_corr_cols].corr()
print(corr_2)

              weekend  net_units  net_revenue
weekend      1.000000   0.511264     0.462136
net_units    0.511264   1.000000     0.919873
net_revenue  0.462136   0.919873     1.000000


Based on the results, the weekend status quite significantly affects the overall daily sales and revenue.

## 4. How is the overall day-by-day sales and revenue trend during a week?

Get the necessary columns from the preprocessed data.

In [184]:
# This section analyzes the weekday (`day_of_week`) rather than the calendar date,
# so `date` is left out together with the ID columns.
# Working on a copy keeps every change made in this section away from the main data.
weekday_cols = ['day_of_week', 'net_units', 'net_revenue']
df_4 = df.loc[:, weekday_cols].copy()
print(df_4.head())

   day_of_week  net_units  net_revenue
0            3          7       303.32
1            0          5       432.56
2            2          2        22.09
3            5          1       284.06
4            5          2       113.16


Find the average values of sales and revenue for each day of the week.

In [185]:
# Mean `net_units` and `net_revenue` over all rows that share the same day of the week.
df_4 = df_4.groupby('day_of_week').mean()
print(df_4)

             net_units  net_revenue
day_of_week                        
0             3.825563   177.054874
1             3.819278   175.540487
2             3.849615   176.279568
3             3.831197   175.841911
4             3.873120   178.832584
5             4.691125   214.380471
6             4.735329   219.188544


The result shows that sales and revenue are higher during the weekend. This is consistent with the previous section: the weekend status quite significantly affects the overall daily sales and revenue.

## 5. Do the store type and area affect customer experience, and in turn the store's sales and revenue?

Get the necessary columns from the preprocessed data.

In [196]:
# Only the store attributes, the rating and the two net measures are needed here;
# `date` and the ID columns are left out.
# Working on a copy keeps every change made in this section away from the main data.
store_cols = ['store_type', 'store_area_sqft', 'avg_rating', 'net_units', 'net_revenue']
df_5 = df.loc[:, store_cols].copy()
print(df_5.head())

  store_type  store_area_sqft  avg_rating  net_units  net_revenue
0          C             2287        3.41          7       303.32
1          C             2627        3.59          5       432.56
2          B             2547        3.86          2        22.09
3          B             2547        4.71          1       284.06
4          C             2627        3.86          2       113.16


Find the average values of customer experiences, sales, and revenue for each store type and area.

In [197]:
# Mean rating, units and revenue for every (store type, store area) combination.
store_keys = ['store_type', 'store_area_sqft']
store_measures = ['avg_rating', 'net_units', 'net_revenue']
df_5 = df_5.groupby(store_keys, as_index=False)[store_measures].mean()
print(df_5)

  store_type  store_area_sqft  avg_rating  net_units  net_revenue
0          A             1734    3.972809   4.283577   196.241209
1          A             2593    3.964456   4.324088   200.346071
2          A             2596    3.968857   4.059367   186.223064
3          B             2547    3.965058   3.888564   175.711395
4          C             2055    3.968860   4.095438   190.132041
5          C             2287    3.968108   3.859124   178.184290
6          C             2453    3.968673   4.051886   186.245386
7          C             2611    3.970072   3.870073   177.580783
8          C             2627    3.965832   4.363808   202.385544
9          C             2904    3.974178   4.103771   188.853874


### Store Type Effect Towards Average Customer Experiences, Sales, and Revenue

Find the average values of customer experiences, sales, and revenue for each store type only.

In [200]:
# Average the (store type, store area) rows of `df_5` by store type.
# NOTE: this is a mean of means - each row of `df_5` counts equally,
# regardless of how many sales records were averaged into it.
df_5_type = df_5.drop(columns='store_area_sqft').groupby('store_type').mean()
print(df_5_type)

            avg_rating  net_units  net_revenue
store_type                                    
A             3.968707   4.222344   194.270115
B             3.965058   3.888564   175.711395
C             3.969287   4.057350   187.230320


The result above shows that store type, though it doesn't significantly affect customer experiences, quite significantly affects the store's sales and revenue. Store type C has the highest rating, but store type A has the highest net sales and revenue.

### Store Area (in sqft) Effect Towards Average Customer Experiences, Sales, and Revenue

Find the correlation coefficients between `store_area_sqft`, `avg_rating`, `net_units`, and `net_revenue`.

In [201]:
# Pairwise correlation matrix computed on the aggregated (store type, store area) rows of `df_5`.
area_corr_cols = ['store_area_sqft', 'avg_rating', 'net_units', 'net_revenue']
df_5_area = df_5.loc[:, area_corr_cols].corr()
print(df_5_area)

                 store_area_sqft  avg_rating  net_units  net_revenue
store_area_sqft         1.000000   -0.174085  -0.132697    -0.125751
avg_rating             -0.174085    1.000000  -0.065062    -0.070525
net_units              -0.132697   -0.065062   1.000000     0.990896
net_revenue            -0.125751   -0.070525   0.990896     1.000000


The result above shows that store area doesn't significantly affect customer experiences, sales, or revenue. However, it shows a notable pattern: all three correlations are negative, so a bigger store area goes with a lower rating, lower sales, and lower revenue. In addition, surprisingly, customer experience also doesn't significantly affect sales or revenue and is negatively correlated with both, which means a higher rating goes with lower sales and revenue.

## 6. Which category of product is the most popular in each city month-by-month?

Get the necessary columns from the preprocessed data.

In [217]:
# Only the date, the city, the category and the net units are needed here; ID columns are left out.
# Working on a copy keeps every change made in this section away from the main data.
city_cols = ['date', 'city', 'category', 'net_units']
df_6 = df.loc[:, city_cols].copy()
print(df_6.head())

         date    city  category  net_units
0  2024-09-05  city_9      Home          7
1  2022-10-24  city_2    Sports          5
2  2023-04-19  city_6  Clothing          2
3  2024-06-22  city_6    Beauty          1
4  2024-07-20  city_2      Home          2


Every date in the current data is already paired with a category, so we can drop the day from the `date` values to support the monthly analysis.

In [218]:
# Reduce every 'YYYY-MM-DD' value to 'YYYY-MM' for the monthly analysis.
# The pattern is anchored on the full date shape and keeps the 'YYYY-MM' part through a
# backreference, so running this cell again leaves 'YYYY-MM' values unchanged instead of
# cutting them down to 'YYYY' (which the unanchored r'-\d{2}$' pattern did on a re-run).
df_6['date'] = df_6['date'].astype(str).str.replace(r'^(\d{4}-\d{2})-\d{2}$', r'\1', regex=True)
print(df_6)

           date    city  category  net_units
0       2024-09  city_9      Home          7
1       2022-10  city_2    Sports          5
2       2023-04  city_6  Clothing          2
3       2024-06  city_6    Beauty          1
4       2024-07  city_2      Home          2
...         ...     ...       ...        ...
164395  2024-03  city_2    Sports          6
164396  2023-11  city_3      Home          2
164397  2024-05  city_6      Home          4
164398  2024-09  city_2    Sports          1
164399  2024-03  city_1    Beauty          5

[164400 rows x 4 columns]


Count `net_units` by month and product's category while keeping the `city` data.

In [219]:
# Total `net_units` for every unique (month, category, city) combination.
month_category_city = ['date', 'category', 'city']
df_6 = df_6.groupby(month_category_city, as_index=False)[['net_units']].sum()
print(df_6.head())

      date category     city  net_units
0  2022-01   Beauty   city_1        176
1  2022-01   Beauty  city_10        151
2  2022-01   Beauty   city_2        150
3  2022-01   Beauty   city_3        163
4  2022-01   Beauty   city_4        181


Separate data by city to support city-based analysis process.

In [222]:
def _rows_for_city(city_name):
    """Return the rows of `df_6` that belong to `city_name`, without the `city` column."""
    return df_6[df_6['city'] == city_name].drop(columns='city')


# One frame per city, bound to the df_6_<n> names that the following cells use.
df_6_1 = _rows_for_city('city_1')
df_6_2 = _rows_for_city('city_2')
df_6_3 = _rows_for_city('city_3')
df_6_4 = _rows_for_city('city_4')
df_6_5 = _rows_for_city('city_5')
df_6_6 = _rows_for_city('city_6')
df_6_7 = _rows_for_city('city_7')
df_6_8 = _rows_for_city('city_8')
df_6_9 = _rows_for_city('city_9')
df_6_10 = _rows_for_city('city_10')

# Preview the first rows of each city's frame, in city order.
_city_frames = [df_6_1, df_6_2, df_6_3, df_6_4, df_6_5,
                df_6_6, df_6_7, df_6_8, df_6_9, df_6_10]
for _city_no, _city_frame in enumerate(_city_frames, start=1):
    print(f'City {_city_no}:\n', _city_frame.head())

City 1:
        date     category  net_units
0   2022-01       Beauty        176
10  2022-01     Clothing        647
20  2022-01  Electronics        404
30  2022-01         Home        561
40  2022-01       Sports         92
City 2:
        date     category  net_units
2   2022-01       Beauty        150
12  2022-01     Clothing        678
22  2022-01  Electronics        414
32  2022-01         Home        549
42  2022-01       Sports        147
City 3:
        date     category  net_units
3   2022-01       Beauty        163
13  2022-01     Clothing        536
23  2022-01  Electronics        370
33  2022-01         Home        556
43  2022-01       Sports        121
City 4:
        date     category  net_units
4   2022-01       Beauty        181
14  2022-01     Clothing        613
24  2022-01  Electronics        339
34  2022-01         Home        510
44  2022-01       Sports        154
City 5:
        date     category  net_units
5   2022-01       Beauty        174
15  2022-01     Clo

Return the best-selling product category for each pair of month and city.

In [236]:
def _top_category_per_month(city_frame):
    """Return, for each month in `city_frame`, the row with the largest `net_units`."""
    top_row_labels = city_frame.groupby(['date'])['net_units'].idxmax()
    return city_frame.loc[top_row_labels]


# One result per city, bound to the df_6_<n>_series names that the following cells use.
df_6_1_series = _top_category_per_month(df_6_1)
df_6_2_series = _top_category_per_month(df_6_2)
df_6_3_series = _top_category_per_month(df_6_3)
df_6_4_series = _top_category_per_month(df_6_4)
df_6_5_series = _top_category_per_month(df_6_5)
df_6_6_series = _top_category_per_month(df_6_6)
df_6_7_series = _top_category_per_month(df_6_7)
df_6_8_series = _top_category_per_month(df_6_8)
df_6_9_series = _top_category_per_month(df_6_9)
df_6_10_series = _top_category_per_month(df_6_10)

# Print every city's table; all but the first are preceded by a blank line.
_city_tops = [df_6_1_series, df_6_2_series, df_6_3_series, df_6_4_series, df_6_5_series,
              df_6_6_series, df_6_7_series, df_6_8_series, df_6_9_series, df_6_10_series]
for _city_no, _city_top in enumerate(_city_tops, start=1):
    _lead = '' if _city_no == 1 else '\n'
    print(f'{_lead}City {_city_no}:\n', _city_top)

City 1:
          date  category  net_units
10    2022-01  Clothing        647
60    2022-02  Clothing        579
110   2022-03  Clothing        697
160   2022-04  Clothing        625
210   2022-05  Clothing        571
260   2022-06  Clothing        530
310   2022-07  Clothing        531
360   2022-08  Clothing        633
410   2022-09  Clothing        632
460   2022-10  Clothing        639
510   2022-11  Clothing        871
560   2022-12  Clothing        860
610   2023-01  Clothing        661
660   2023-02  Clothing        573
710   2023-03  Clothing        575
760   2023-04  Clothing        609
810   2023-05  Clothing        558
860   2023-06  Clothing        551
910   2023-07  Clothing        588
960   2023-08  Clothing        561
1010  2023-09  Clothing        578
1060  2023-10  Clothing        623
1130  2023-11      Home        719
1180  2023-12      Home        916
1210  2024-01  Clothing        596
1260  2024-02  Clothing        529
1310  2024-03  Clothing        622
1360  2024-

To make the pattern analysis easier, we can group consecutive months that share the same top category into segments, as below.

In [237]:
def series_group(df):
    """Collapse consecutive rows that share a category into one row per run.

    The rows are ordered by `date` first. Each run of neighbouring rows with the
    same `category` is then summarised by its first and last date and by the sum
    of its `net_units`.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns `date`, `category` and `net_units`. The `date` values
        are joined with ' - ', so they have to support string concatenation.

    Returns
    -------
    pandas.DataFrame
        Columns `date` ('<first date> - <last date>' of the run), `category`
        and `total_units`, one row per run in date order. The input frame is
        not modified.
    """
    # sort_values returns a new frame, so the helper column added below
    # never reaches the caller's frame.
    ordered = df.sort_values('date').reset_index(drop=True)

    # A new run starts wherever the category differs from the row before it;
    # the running count of those starts gives each run its own number.
    run_starts = ordered['category'].ne(ordered['category'].shift())
    ordered['group'] = run_starts.cumsum()

    runs = ordered.groupby(['group', 'category']).agg(
        start_date=('date', 'first'),
        end_date=('date', 'last'),
        total_units=('net_units', 'sum'),
    )
    runs = runs.reset_index()

    runs['date'] = runs['start_date'] + ' - ' + runs['end_date']
    return runs[['date', 'category', 'total_units']]

In [238]:
# Replace each city's month-by-month table with its segment summary.
# NOTE: the df_6_<n>_series names are overwritten with frames that no longer have a
# `net_units` column, so the previous cell has to be run again before this one is re-run.
df_6_1_series = series_group(df_6_1_series)
df_6_2_series = series_group(df_6_2_series)
df_6_3_series = series_group(df_6_3_series)
df_6_4_series = series_group(df_6_4_series)
df_6_5_series = series_group(df_6_5_series)
df_6_6_series = series_group(df_6_6_series)
df_6_7_series = series_group(df_6_7_series)
df_6_8_series = series_group(df_6_8_series)
df_6_9_series = series_group(df_6_9_series)
df_6_10_series = series_group(df_6_10_series)

# Print every city's segment table, in city order.
_city_segments = [df_6_1_series, df_6_2_series, df_6_3_series, df_6_4_series, df_6_5_series,
                  df_6_6_series, df_6_7_series, df_6_8_series, df_6_9_series, df_6_10_series]
for _city_no, _segments in enumerate(_city_segments, start=1):
    print(f'City {_city_no}:\n', _segments)

City 1:
                 date  category  total_units
0  2022-01 - 2023-10  Clothing        13692
1  2023-11 - 2023-12      Home         1635
2  2024-01 - 2024-07  Clothing         4166
3  2024-08 - 2024-08      Home          557
4  2024-09 - 2024-12  Clothing         2909
City 2:
                 date  category  total_units
0  2022-01 - 2023-05  Clothing        11722
1  2023-06 - 2023-06      Home          546
2  2023-07 - 2023-08  Clothing         1194
3  2023-09 - 2023-09      Home          660
4  2023-10 - 2024-11  Clothing         9768
5  2024-12 - 2024-12      Home          947
City 3:
                 date  category  total_units
0  2022-01 - 2022-01      Home          556
1  2022-02 - 2022-04  Clothing         1559
2  2022-05 - 2022-06      Home         1018
3  2022-07 - 2023-02  Clothing         4961
4  2023-03 - 2023-03      Home          561
5  2023-04 - 2023-11  Clothing         4745
6  2023-12 - 2023-12      Home          788
7  2024-01 - 2024-07  Clothing         4046
8  20

As shown above, each city varies in trend and interest over the months. However, the top product category across those months and cities is always either **Clothing** or **Home**.

## 7. How does discount percentages on products affect store's sales and revenue?

Get the necessary columns from the preprocessed data.

In [254]:
# A discount percentage applies to one specific product, and a product is not always discounted,
# so the product ID is kept and the effect is analyzed product by product. `date` is not needed here.
# Working on a copy keeps every change made in this section away from the main data.
discount_cols = ['product_id', 'discount_pct', 'net_units', 'net_revenue']
df_7 = df.loc[:, discount_cols].copy()
print(df_7.head())

  product_id  discount_pct  net_units  net_revenue
0   prod_031           0.0          7       303.32
1   prod_041          30.0          5       432.56
2   prod_022           0.0          2        22.09
3   prod_037           0.0          1       284.06
4   prod_018           0.0          2       113.16


Separate data by product ID to support product-based analysis process.

In [255]:
# Sort data first for neat visuals.
df_7 = df_7.sort_values('product_id').reset_index(drop=True)

# Build {product_id: rows of that product without the `product_id` column}.
# A single groupby pass hands over each product's rows together, replacing one
# full boolean-mask scan of the whole frame per product. Keys come out in sorted
# product order and every sub-frame keeps its row labels from the sorted `df_7`.
disc_dfs = {
    prod_id: df_disc.drop(columns='product_id')
    for prod_id, df_disc in df_7.groupby('product_id')
}

In [256]:
# Show the first rows for three sample products.
for _sample_id in ('prod_031', 'prod_041', 'prod_022'):
    print(f'{_sample_id}:\n', disc_dfs[_sample_id].head())

prod_031:
        discount_pct  net_units  net_revenue
98618           0.0          1        44.42
98619           0.0          5       214.38
98620           0.0          4       173.08
98621           0.0         10       429.38
98622           0.0          4       177.32
prod_041:
         discount_pct  net_units  net_revenue
131632          30.0          4       344.74
131633           0.0          3       364.89
131634           0.0          3       366.98
131635           0.0          2       247.80
131636          15.0          3       314.60
prod_022:
        discount_pct  net_units  net_revenue
68855           0.0          6        66.35
68856           0.0          4        44.94
68857           5.0          1        10.55
68858           0.0          3        35.00
68859           0.0          6        66.67


Find the correlation coefficients between `discount_pct`, `net_units`, and `net_revenue` for each product.

In [266]:
# Print one correlation matrix per product.
discount_corr_cols = ['discount_pct', 'net_units', 'net_revenue']
for prod_id, df_disc in disc_dfs.items():
    # `df_disc` is rebound to the correlation matrix; the entry stored in `disc_dfs` is untouched.
    df_disc = df_disc.loc[:, discount_corr_cols].corr()
    print('\n', prod_id, ':\n', df_disc)


 prod_001 :
               discount_pct  net_units  net_revenue
discount_pct      1.000000   0.023069    -0.122153
net_units         0.023069   1.000000     0.985899
net_revenue      -0.122153   0.985899     1.000000

 prod_002 :
               discount_pct  net_units  net_revenue
discount_pct      1.000000   0.053468    -0.060733
net_units         0.053468   1.000000     0.989991
net_revenue      -0.060733   0.989991     1.000000

 prod_003 :
               discount_pct  net_units  net_revenue
discount_pct      1.000000   0.020858    -0.111075
net_units         0.020858   1.000000     0.987651
net_revenue      -0.111075   0.987651     1.000000

 prod_004 :
               discount_pct  net_units  net_revenue
discount_pct       1.00000   0.045510    -0.093370
net_units          0.04551   1.000000     0.986788
net_revenue       -0.09337   0.986788     1.000000

 prod_005 :
               discount_pct  net_units  net_revenue
discount_pct      1.000000   0.014563    -0.130000
net_units   

As shown above, discount percentages only slightly impact sales and revenue. The higher the discount percentage, the higher the sales. Revenue, however, moves in the opposite direction: it is negatively correlated with the discount percentage.

## 8. How does product's promotion affect store's sales and revenue?

Get the necessary columns from the preprocessed data.

In [262]:
# A promotion applies to one specific product, and a product is not always on promotion,
# so the product ID is kept and the effect is analyzed product by product. `date` is not needed here.
# Working on a copy keeps every change made in this section away from the main data.
promotion_cols = ['product_id', 'promotion', 'net_units', 'net_revenue']
df_8 = df.loc[:, promotion_cols].copy()
print(df_8.head())

  product_id  promotion  net_units  net_revenue
0   prod_031          0          7       303.32
1   prod_041          0          5       432.56
2   prod_022          0          2        22.09
3   prod_037          0          1       284.06
4   prod_018          0          2       113.16


Separate data by product ID to support product-based analysis process.

In [263]:
# Sort data first for neat visuals.
df_8 = df_8.sort_values('product_id').reset_index(drop=True)

# Build {product_id: rows of that product without the `product_id` column}.
# A single groupby pass hands over each product's rows together, replacing one
# full boolean-mask scan of the whole frame per product. Keys come out in sorted
# product order and every sub-frame keeps its row labels from the sorted `df_8`.
promo_dfs = {
    prod_id: df_promo.drop(columns='product_id')
    for prod_id, df_promo in df_8.groupby('product_id')
}

In [264]:
# Show the first rows for three sample products.
for _sample_id in ('prod_031', 'prod_041', 'prod_022'):
    print(f'{_sample_id}:\n', promo_dfs[_sample_id].head())

prod_031:
        promotion  net_units  net_revenue
98618          0          1        44.42
98619          0          5       214.38
98620          0          4       173.08
98621          1         10       429.38
98622          0          4       177.32
prod_041:
         promotion  net_units  net_revenue
131632          0          4       344.74
131633          0          3       364.89
131634          0          3       366.98
131635          0          2       247.80
131636          0          3       314.60
prod_022:
        promotion  net_units  net_revenue
68855          0          6        66.35
68856          0          4        44.94
68857          0          1        10.55
68858          0          3        35.00
68859          0          6        66.67


Find the correlation coefficients between `promotion`, `net_units`, and `net_revenue` for each product.

In [267]:
# Print one correlation matrix per product.
promotion_corr_cols = ['promotion', 'net_units', 'net_revenue']
for prod_id, df_promo in promo_dfs.items():
    # `df_promo` is rebound to the correlation matrix; the entry stored in `promo_dfs` is untouched.
    df_promo = df_promo.loc[:, promotion_corr_cols].corr()
    print(prod_id, ':\n', df_promo)

prod_001 :
              promotion  net_units  net_revenue
promotion     1.000000   0.335476     0.328968
net_units     0.335476   1.000000     0.985899
net_revenue   0.328968   0.985899     1.000000
prod_002 :
              promotion  net_units  net_revenue
promotion     1.000000   0.302539     0.296924
net_units     0.302539   1.000000     0.989991
net_revenue   0.296924   0.989991     1.000000
prod_003 :
              promotion  net_units  net_revenue
promotion     1.000000   0.270640     0.263367
net_units     0.270640   1.000000     0.987651
net_revenue   0.263367   0.987651     1.000000
prod_004 :
              promotion  net_units  net_revenue
promotion     1.000000   0.284677     0.288223
net_units     0.284677   1.000000     0.986788
net_revenue   0.288223   0.986788     1.000000
prod_005 :
              promotion  net_units  net_revenue
promotion     1.000000   0.294855     0.290646
net_units     0.294855   1.000000     0.986114
net_revenue   0.290646   0.986114     1.000000


As shown above, promotion slightly affects sales and revenue. However, the impact tends to be greater than the impact of discount percentages. Moreover, promotion is positively correlated with both sales and revenue: the presence of a promotion goes with higher sales and revenue.

## 9. Does the combination of discount and promotion have a different effect on the store's sales and revenue?