In [5]:
import os

import pandas as pd

In [6]:
# Location of the Superstore sales CSV.
# The default is the original hard-coded Windows path, so existing behaviour is
# unchanged; set the SUPERSTORE_SALES_CSV environment variable to point the
# notebook at a copy of the file on another machine without editing this cell.
file_path = os.environ.get(
    "SUPERSTORE_SALES_CSV",
    "E:\\Superstore_Sales\\csv_files\\Superstore_Sales_Dataset.csv",
)

In [7]:
# Read the CSV at `file_path` into the DataFrame that the following cells analyse
data = pd.read_csv(filepath_or_buffer=file_path)

In [8]:
# Show basic information about the dataset: index range, column names,
# non-null counts and dtypes
dataset_info_header = "\n--- Dataset Information: ---"
print(dataset_info_header)
data.info()


--- Dataset Information: ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9800 entries, 0 to 9799
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Row ID         9800 non-null   int64  
 1   Order ID       9800 non-null   object 
 2   Order Date     9800 non-null   object 
 3   Ship Date      9800 non-null   object 
 4   Ship Mode      9800 non-null   object 
 5   Customer ID    9800 non-null   object 
 6   Customer Name  9800 non-null   object 
 7   Segment        9800 non-null   object 
 8   Country        9800 non-null   object 
 9   City           9800 non-null   object 
 10  State          9800 non-null   object 
 11  Postal Code    9789 non-null   float64
 12  Region         9800 non-null   object 
 13  Product ID     9800 non-null   object 
 14  Category       9800 non-null   object 
 15  Sub-Category   9800 non-null   object 
 16  Product Name   9800 non-null   object 
 17  Sales          9800 no

In [9]:
# Preview the first five rows of the dataset
first_rows = data.head(n=5)
print('\n--- First 5 rows of the dataset: ---')
print(first_rows)


--- First 5 rows of the dataset: ---
   Row ID        Order ID  Order Date   Ship Date       Ship Mode Customer ID  \
0       1  CA-2017-152156  08/11/2017  11/11/2017    Second Class    CG-12520   
1       2  CA-2017-152156  08/11/2017  11/11/2017    Second Class    CG-12520   
2       3  CA-2017-138688  12/06/2017  16/06/2017    Second Class    DV-13045   
3       4  US-2016-108966  11/10/2016  18/10/2016  Standard Class    SO-20335   
4       5  US-2016-108966  11/10/2016  18/10/2016  Standard Class    SO-20335   

     Customer Name    Segment        Country             City       State  \
0      Claire Gute   Consumer  United States        Henderson    Kentucky   
1      Claire Gute   Consumer  United States        Henderson    Kentucky   
2  Darrin Van Huff  Corporate  United States      Los Angeles  California   
3   Sean O'Donnell   Consumer  United States  Fort Lauderdale     Florida   
4   Sean O'Donnell   Consumer  United States  Fort Lauderdale     Florida   

   Postal Co

In [10]:
# Count the missing (null) entries in every column
missing_per_column = data.isnull().sum()
print('\n--- Missing Values per column: ---')
print(missing_per_column)


--- Missing Values per column: ---
Row ID            0
Order ID          0
Order Date        0
Ship Date         0
Ship Mode         0
Customer ID       0
Customer Name     0
Segment           0
Country           0
City              0
State             0
Postal Code      11
Region            0
Product ID        0
Category          0
Sub-Category      0
Product Name      0
Sales             0
dtype: int64


In [11]:
# Summary statistics restricted to the numeric columns.
# The date columns are not covered here: they would have to be converted to a
# datetime dtype first before describe() could summarise them as dates.
numeric_summary = data.describe(include=['number'])
print('\n--- Descriptive statistics (Numerical Columns): ---')
print(numeric_summary)


--- Descriptive statistics (Numerical Columns): ---
            Row ID   Postal Code         Sales
count  9800.000000   9789.000000   9800.000000
mean   4900.500000  55273.322403    230.769059
std    2829.160653  32041.223413    626.651875
min       1.000000   1040.000000      0.444000
25%    2450.750000  23223.000000     17.248000
50%    4900.500000  58103.000000     54.490000
75%    7350.250000  90008.000000    210.605000
max    9800.000000  99301.000000  22638.480000


In [12]:
# Summary (count / unique / top / freq) restricted to the object-dtype columns
object_summary = data.describe(include=['object'])
print('\n--- Descriptive statistics (Object Columns): ---')
print(object_summary)


--- Descriptive statistics (Object Columns): ---
              Order ID  Order Date   Ship Date       Ship Mode Customer ID  \
count             9800        9800        9800            9800        9800   
unique            4922        1230        1326               4         793   
top     CA-2018-100111  05/09/2017  26/09/2018  Standard Class    WB-21850   
freq                14          38          34            5859          35   

        Customer Name   Segment        Country           City       State  \
count            9800      9800           9800           9800        9800   
unique            793         3              1            529          49   
top     William Brown  Consumer  United States  New York City  California   
freq               35      5101           9800            891        1946   

       Region       Product ID         Category Sub-Category     Product Name  
count    9800             9800             9800         9800             9800  
unique      4

In [13]:
# Banner introducing the raw-data business overview section:
# a blank line, a 50-character rule, the title, and a closing rule
section_rule = "=" * 50
print("\n" + section_rule)
print("Initial Business Performance Overview (Raw Data)")
print(section_rule)


Initial Business Performance Overview (Raw Data)


In [15]:
# Total Sales: sum of the 'Sales' column over every row of the raw data
sales_column = data['Sales']
total_sales_raw = sales_column.sum()
print(f"\n[PDF Q1] Total Sales (Raw Data): {total_sales_raw:,.2f}")


[PDF Q1] Total Sales (Raw Data): 2,261,536.78


In [16]:
# Total Orders: number of distinct 'Order ID' values.
# If the column is absent, report the row count instead and leave
# total_orders_raw undefined.
has_order_id = 'Order ID' in data.columns
if not has_order_id:
    print("[PDF Q1] Cannot calculate unique orders without 'Order ID'. Total rows:", len(data))
else:
    total_orders_raw = data['Order ID'].nunique()
    print(f"[PDF Q1] Total Unique Orders (Raw Data): {total_orders_raw}")


[PDF Q1] Total Unique Orders (Raw Data): 4922


In [14]:
# Unique Customers: number of distinct 'Customer ID' values.
# If the column is absent, only a notice is printed and
# total_customers_raw stays undefined.
has_customer_id = 'Customer ID' in data.columns
if not has_customer_id:
    print("[PDF Q1] Cannot calculate unique customers without 'Customer ID'.")
else:
    total_customers_raw = data['Customer ID'].nunique()
    print(f"[PDF Q1] Total Unique Customers (Raw Data): {total_customers_raw}")

[PDF Q1] Total Unique Customers (Raw Data): 793


In [17]:
# Average Order Value: total sales divided by the number of distinct orders.
# Uses total_sales_raw and total_orders_raw, which must already have been
# computed by the Total Sales and Total Orders cells.
if 'Order ID' not in data.columns:
    # Without order identifiers, fall back to the mean sale per row
    print("[PDF Q1] Cannot reliably calculate Average Order Value without unique 'Order ID'.")
    avg_sale_per_row = data['Sales'].mean()
    print(f"Average Sale per Row (Raw Data): {avg_sale_per_row:,.2f}")
else:
    # Avoid dividing by zero when there are no orders at all
    if total_orders_raw > 0:
        avg_order_value_raw = total_sales_raw / total_orders_raw
    else:
        avg_order_value_raw = 0
    print(f"[PDF Q1] Average Order Value (Raw Data): {avg_order_value_raw:,.2f}")

[PDF Q1] Average Order Value (Raw Data): 459.48
