In [25]:
# Importing packages
import os

import pandas as pd
import plotly_express as px
import streamlit as st


In [13]:
# Reading the CSV and understanding the basic structure
# The location can be overridden with the SALES_DATA_CSV environment variable so
# the notebook is not tied to one machine's folder layout; when the variable is
# unset, the original absolute path is used unchanged.
file_path = os.environ.get(
    'SALES_DATA_CSV',
    '/Users/danfriedman/Dropbox/TripleTen Data Science/Projects/sales-project/2023_sales_data.csv',
)
df = pd.read_csv(file_path)

# Preview the first ten rows to check the columns and value formats.
df.head(10)

Unnamed: 0,Order ID,Date,Product,Category,Quantity,Price,Total Sales
0,1,2023-05-21,Desk Chair,Furniture,2,120,240
1,2,2023-02-23,Desk Chair,Furniture,4,120,480
2,3,2023-01-28,Backpack,Accessories,8,40,320
3,4,2023-11-15,Desk Chair,Furniture,10,120,1200
4,5,2023-08-12,Monitor,Electronics,2,200,400
5,6,2023-09-25,Smartphone,Electronics,6,600,3600
6,7,2023-12-17,Coffee Maker,Appliances,8,90,720
7,8,2023-12-29,Laptop,Electronics,4,800,3200
8,9,2023-04-07,Tablet,Electronics,10,300,3000
9,10,2023-12-12,Coffee Maker,Appliances,2,90,180


In [14]:
# Print each column's dtype and non-null count for df
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Order ID     40000 non-null  int64 
 1   Date         40000 non-null  object
 2   Product      40000 non-null  object
 3   Category     40000 non-null  object
 4   Quantity     40000 non-null  int64 
 5   Price        40000 non-null  int64 
 6   Total Sales  40000 non-null  int64 
dtypes: int64(4), object(3)
memory usage: 2.1+ MB


In [15]:
# Parse the 'Date' strings (year-month-day) into pandas datetimes,
# then re-check the dtypes to confirm the conversion took effect.
parsed_dates = pd.to_datetime(df['Date'], format='%Y-%m-%d')
df['Date'] = parsed_dates
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Order ID     40000 non-null  int64         
 1   Date         40000 non-null  datetime64[ns]
 2   Product      40000 non-null  object        
 3   Category     40000 non-null  object        
 4   Quantity     40000 non-null  int64         
 5   Price        40000 non-null  int64         
 6   Total Sales  40000 non-null  int64         
dtypes: datetime64[ns](1), int64(4), object(2)
memory usage: 2.1+ MB


In [16]:
# Summary statistics (count, mean, min, quartiles, max, std) for df's columns
df.describe()

Unnamed: 0,Order ID,Date,Quantity,Price,Total Sales
count,40000.0,40000,40000.0,40000.0,40000.0
mean,20000.5,2023-07-02 13:06:16.560000,5.496525,248.615,1372.803
min,1.0,2023-01-01 00:00:00,1.0,30.0,30.0
25%,10000.75,2023-04-03 00:00:00,3.0,50.0,250.0
50%,20000.5,2023-07-02 00:00:00,6.0,200.0,720.0
75%,30000.25,2023-10-02 00:00:00,8.0,300.0,1800.0
max,40000.0,2024-01-01 00:00:00,10.0,800.0,8000.0
std,11547.14972,,2.865117,246.903778,1702.250357


In [17]:
# Derive the calendar month number of every order from its datetime 'Date'
# and store it in a new 'Month' column.
order_month = df['Date'].dt.month
df['Month'] = order_month
df.head(5)

Unnamed: 0,Order ID,Date,Product,Category,Quantity,Price,Total Sales,Month
0,1,2023-05-21,Desk Chair,Furniture,2,120,240,5
1,2,2023-02-23,Desk Chair,Furniture,4,120,480,2
2,3,2023-01-28,Backpack,Accessories,8,40,320,1
3,4,2023-11-15,Desk Chair,Furniture,10,120,1200,11
4,5,2023-08-12,Monitor,Electronics,2,200,400,8


In [18]:
# Collect the rows whose order date falls in calendar year 2024 for inspection.
order_year = df['Date'].dt.year
twenty_four_records = df[order_year == 2024]
twenty_four_records

Unnamed: 0,Order ID,Date,Product,Category,Quantity,Price,Total Sales,Month
110,111,2024-01-01,Monitor,Electronics,6,200,1200,1
588,589,2024-01-01,Office Desk,Furniture,10,250,2500,1
721,722,2024-01-01,Backpack,Accessories,10,40,400,1
919,920,2024-01-01,Smartphone,Electronics,3,600,1800,1
1238,1239,2024-01-01,Coffee Maker,Appliances,4,90,360,1
...,...,...,...,...,...,...,...,...
38019,38020,2024-01-01,Monitor,Electronics,8,200,1600,1
38393,38394,2024-01-01,Coffee Maker,Appliances,8,90,720,1
38763,38764,2024-01-01,Office Desk,Furniture,2,250,500,1
38776,38777,2024-01-01,Backpack,Accessories,1,40,40,1


Of the 40,000 rows, 103 have 2024 as the year (all dated 2024-01-01). This was an error in generating the file, so I want to remove these rows from the DataFrame.

In [19]:
# amend dataframe to remove rows with year of 2024
# A boolean filter is used instead of df.drop(twenty_four_records.index) so the
# cell can be re-run safely: dropping labels that are already gone raises
# KeyError, whereas filtering an already-clean frame simply changes nothing.
df = df[df['Date'].dt.year != 2024]

# Re-check the summary; the max of 'Date' should now fall within 2023.
df.describe()

Unnamed: 0,Order ID,Date,Quantity,Price,Total Sales,Month
count,39897.0,39897,39897.0,39897.0,39897.0,39897.0
mean,20005.227686,2023-07-02 01:47:59.404466432,5.496554,248.653533,1373.086197,6.53004
min,1.0,2023-01-01 00:00:00,1.0,30.0,30.0,1.0
25%,10006.0,2023-04-02 00:00:00,3.0,50.0,250.0,4.0
50%,20009.0,2023-07-02 00:00:00,6.0,200.0,720.0,7.0
75%,30002.0,2023-10-01 00:00:00,8.0,300.0,1800.0,10.0
max,40000.0,2023-12-31 00:00:00,10.0,800.0,8000.0,12.0
std,11546.697517,,2.864643,246.930204,1702.486355,3.446839


The maximum date of 2023-12-31 confirms that the drop removed the rows dated in 2024.

In [26]:
# Streamlit app title
# NOTE(review): the recorded output of this cell shows a `streamlit run ...`
# command hint, which suggests Streamlit calls are meant to be executed via
# `streamlit run` rather than from the notebook kernel — confirm.
st.title("Sales Data Dashboard")

2025-03-25 11:08:44.914 
  command:

    streamlit run /Users/danfriedman/Library/Python/3.11/lib/python/site-packages/ipykernel_launcher.py [ARGUMENTS]


DeltaGenerator()

In [27]:
# Optional category filter: the category selectbox is only created once the
# checkbox is ticked, and df is then narrowed to the chosen category.
show_category = st.checkbox("Filter by Category")
if show_category:
    category_options = df["Category"].unique()
    category_choice = st.selectbox("Select a category:", category_options)
    in_chosen_category = df["Category"] == category_choice
    df = df[in_chosen_category]



In [28]:
# Checkbox for enabling Month/Year filter
# When ticked, the user picks whether to filter by month or by year and df is
# narrowed to the rows matching the selected value.
show_time_filter = st.checkbox("Filter by Month/Year")
if show_time_filter:
    filter_type = st.radio("Select filter type:", ("Month", "Year"))
    if filter_type == "Month":
        month_choice = st.selectbox("Select a month:", sorted(df["Month"].unique()))
        df = df[df["Month"] == month_choice]
    else:
        # The frame has no 'Year' column at this point (only 'Month' was
        # derived from 'Date'), so df["Year"] raised KeyError. Take the year
        # straight from the datetime 'Date' column instead.
        order_years = df["Date"].dt.year
        year_choice = st.selectbox("Select a year:", sorted(order_years.unique()))
        df = df[order_years == year_choice]

