### ctrl + p
#python:>create enviroment
#Activating a virtual enviroment
#pip install -r requirements.txt

## Activating Virtual Enviroment on windows
 .venv\Scripts\activate

## Data Pipeline steps
1. Data Extraction
2. Quality Assessment
3. Data Cleaning
4. Data Validation
5. Featue Engineering
6. Loading
7. Analysis and insights

## Import libraries and select the kernel for virtual enviroment

In [50]:
#importing libraries
import pandas as pd
import numpy as np
from sqlalchemy import create_engine    
from dotenv import load_dotenv
import psycopg2


In [2]:

# Define proper data types for key columns like 'Customer_ID', 'Order_ID', 'Order_Date', 'Product_Category', 'Product_Sub_Category' , etc.
dtypes_dict = {
    'customer_id': 'str',
    'order_id': 'str',
    'order_date': 'str',
    'product_category': 'str',
    'product_sub_category': 'str'
  
   
}

In [13]:
df = pd.read_csv(r'C:\Users\David Ibanga\Data Engineering practicals\Globex_Retail\data\globex_retail_data.csv', dtype=dtypes_dict)

In [14]:
df.head()

Unnamed: 0,Customer_ID,Order_ID,Order_Date,Product_Category,Product_Sub_Category,Quantity,Price,Discount,Customer_Location,Revenue
0,CUST_013738,ORD_00102406,2023-01-01,Home & Garden,Gardening Tools,1,419.19,0.0,TN,419.19
1,CUST_011726,ORD_00102902,2023-01-01,Electronics,Laptops,1,222.37,0.09,TN,202.3567
2,CUST_010891,ORD_00103864,2023-01-01,Electronics,Laptops,6,1107.65,0.0,IN,6645.9
3,CUST_011452,ORD_00103560,2023-01-01,Electronics,Gaming Consoles,5,288.84,0.0,MA,1444.2
4,CUST_010886,ORD_00100632,2023-01-02,Electronics,Headphones,1,191.27,0.0,AZ,191.27


### Data Structure

In [15]:
df.columns

Index(['Customer_ID', 'Order_ID', 'Order_Date', 'Product_Category',
       'Product_Sub_Category', 'Quantity', 'Price', 'Discount',
       'Customer_Location', 'Revenue'],
      dtype='object')

### Step2: Data Quality Assessment checks
## Check for uniqueness, completeness and data consistency
### Checking for missing values

In [16]:
#missing values in the dataset
missing_values = df.isnull().sum()
missing_values

Customer_ID             0
Order_ID                0
Order_Date              0
Product_Category        0
Product_Sub_Category    0
Quantity                0
Price                   0
Discount                0
Customer_Location       0
Revenue                 0
dtype: int64

### Checking for duplicates of data


In [17]:
duplicates = df.duplicated().sum()
duplicates


np.int64(0)

In [18]:
if duplicates > 0:
    duplicated_rows = df[df.duplicated(keep=False)]
    print(duplicated_rows.head())

In [29]:
# Convert Order_Date to datetime
df['Order_Date'] = pd.to_datetime(df['Order_Date'], errors='coerce')
print("\nAfter converting Order_Date to datetime:")
print(df[['Order_Date']].head())



After converting Order_Date to datetime:
  Order_Date
0 2023-01-01
1 2023-01-01
2 2023-01-01
3 2023-01-01
4 2023-01-02


In [21]:
# Engineer Order_Month
df['Order_Month'] = df['Order_Date'].dt.to_period('M').astype(str)
print("\nAfter engineering Order_Month:")
print(df[['Order_Date', 'Order_Month']].head())


After engineering Order_Month:
  Order_Date Order_Month
0 2023-01-01     2023-01
1 2023-01-01     2023-01
2 2023-01-01     2023-01
3 2023-01-01     2023-01
4 2023-01-02     2023-01


In [22]:
# Calculate average order value per customer
customer_avg_order = df.groupby('Customer_ID')['Revenue'].mean().reset_index()
customer_avg_order.columns = ['Customer_ID', 'Avg_Order_Value']
print("\nAverage order value per customer (sample):")
print(customer_avg_order.head())


Average order value per customer (sample):
   Customer_ID  Avg_Order_Value
0  CUST_010001      1124.238500
1  CUST_010002       981.440000
2  CUST_010003       466.543500
3  CUST_010006       452.943867
4  CUST_010007       206.688000


In [31]:
#Validate revenue calculation matches Quantity * Price * (1 - Discount)
df['Calculated_Revenue'] = df['Quantity'] * df['Price'] * (1 - df['Discount'])
revenue_diff = (df['Revenue'] - df['Calculated_Revenue']).abs().mean()
print(f"\n✅ Average Revenue Calculation Difference: {revenue_diff:.4f}")




✅ Average Revenue Calculation Difference: 0.0000


In [32]:
# Engineer Avg_Order_Value per customer
customer_avg_order = df.groupby('Customer_ID')['Revenue'].mean().reset_index()
customer_avg_order.columns = ['Customer_ID', 'Avg_Order_Value']

In [33]:
# Merge back to main df
df = df.merge(customer_avg_order, on='Customer_ID', how='left')

In [34]:
# Create Order_Value column explicitly
df['Order_Value'] = df['Revenue']

print("\n✅ Avg_Order_Value merged. Sample:")
print(df[['Customer_ID', 'Order_Value', 'Avg_Order_Value']].head())



✅ Avg_Order_Value merged. Sample:
   Customer_ID  Order_Value  Avg_Order_Value
0  CUST_013738     419.1900       571.198267
1  CUST_011726     202.3567       202.356700
2  CUST_010891    6645.9000      6645.900000
3  CUST_011452    1444.2000       760.540000
4  CUST_010886     191.2700      1125.855000


In [None]:
#Segment customers into High-Value (top 20% average order value) vs Regular
high_value_threshold = customer_avg_order['Avg_Order_Value'].quantile(0.8)
df['Customer_Segment'] = df['Avg_Order_Value'].apply(
    lambda x: 'High-Value' if x >= high_value_threshold else 'Regular'
)
print(f"\n High-Value customer threshold: {high_value_threshold:.2f}")
print(df[['Customer_ID', 'Avg_Order_Value', 'Customer_Segment']].head())



✅ High-Value customer threshold: 1049.82
   Customer_ID  Avg_Order_Value Customer_Segment
0  CUST_013738       571.198267          Regular
1  CUST_011726       202.356700          Regular
2  CUST_010891      6645.900000       High-Value
3  CUST_011452       760.540000          Regular
4  CUST_010886      1125.855000       High-Value


### Answer insight questions

In [54]:
#Question 1: Product category revenue
category_revenue = df.groupby('Product_Category')['Revenue'].sum().sort_values(ascending=False)
print("\n 1) Revenue by Product Category:")
print(category_revenue)


 1) Revenue by Product Category:
Product_Category
Electronics      2.079574e+06
Home & Garden    6.814707e+05
Sports           4.744529e+05
Clothing         2.819485e+05
Beauty           1.371077e+05
Books            6.522193e+04
Name: Revenue, dtype: float64


In [55]:
#Question 2: Product subcategory revenue
subcategory_revenue = df.groupby('Product_Sub_Category')['Revenue'].sum().sort_values(ascending=False)
print("\n 2) Revenue by Product Subcategory:")
print(subcategory_revenue.head(10))


 2) Revenue by Product Subcategory:
Product_Sub_Category
Smart Watches         403335.4208
Smartphones           365802.8764
Tablets               348740.8234
Gaming Consoles       332296.2353
Headphones            318398.7129
Laptops               310999.7473
Gardening Tools       129829.3451
Kitchen Appliances    125048.8293
Storage               110330.1389
Home Decor            109826.6465
Name: Revenue, dtype: float64


In [56]:
## Question 3: Highest average order value by segment
segment_avg_order = df.groupby('Customer_Segment')['Order_Value'].mean().sort_values(ascending=False)
print("\n 3) Average Order Value by Customer Segment:")
print(segment_avg_order)


 3) Average Order Value by Customer Segment:
Customer_Segment
High-Value    2287.693006
Regular        352.214206
Name: Order_Value, dtype: float64


In [57]:
## Question 4: Average discount applied to high-value customers
high_value_discount = df[df['Customer_Segment'] == 'High-Value']['Discount'].mean()
print(f"\n 4) Average Discount for High-Value Customers: {high_value_discount:.2%}")



 4) Average Discount for High-Value Customers: 4.91%


In [58]:
## Question 5: Geographic locations with most high-value customers
high_value_customers = df[df['Customer_Segment'] == 'High-Value']
location_counts = high_value_customers['Customer_Location'].value_counts()
print("\n 5) High-Value Customer Locations (Top 5):")
print(location_counts.head(5))


 5) High-Value Customer Locations (Top 5):
Customer_Location
CO    48
LA    47
MD    42
IL    41
NC    41
Name: count, dtype: int64


In [59]:
## Question 6: Discount impact by product category
discount_impact = df.groupby('Product_Category').apply(
    lambda x: pd.Series({
        'Avg_Discount': x['Discount'].mean(),
        'Total_Revenue': x['Revenue'].sum()
    })
).sort_values('Total_Revenue', ascending=False)
print("\n 6) Discount Impact by Product Category:")
print(discount_impact)


 6) Discount Impact by Product Category:
                  Avg_Discount  Total_Revenue
Product_Category                             
Electronics           0.058537   2.079574e+06
Home & Garden         0.063398   6.814707e+05
Sports                0.069806   4.744529e+05
Clothing              0.063377   2.819485e+05
Beauty                0.061279   1.371077e+05
Books                 0.065040   6.522193e+04


  discount_impact = df.groupby('Product_Category').apply(


In [60]:
## Question 7: Monthly purchase trends
monthly_trends = df.groupby('Order_Month')['Revenue'].sum().sort_index()
print("\n 7) Monthly Revenue Trends:")
print(monthly_trends)



 7) Monthly Revenue Trends:
Order_Month
2023-01    177437.6013
2023-02    173608.8882
2023-03    151549.1629
2023-04    160107.8262
2023-05    167926.8782
2023-06    134903.8958
2023-07    202245.1529
2023-08    102365.7004
2023-09    153440.8570
2023-10    160409.8015
2023-11    137063.2309
2023-12    154119.1079
2024-01    187767.1758
2024-02    185474.4577
2024-03    156389.5161
2024-04    133587.1201
2024-05    164528.5586
2024-06    140662.3017
2024-07    165830.4983
2024-08    130879.2232
2024-09    147267.8345
2024-10    133072.4755
2024-11    153979.2064
2024-12    145159.1055
Name: Revenue, dtype: float64


#### Recommendations
#### Focus on Electronics: Since Electronics generates the most revenue, ensure optimal inventory levels for tablets, smartphones, and laptops.

#### Target High-Value Customers: Develop loyalty programs for high-value customers, especially in CA, NY, and TX.

#### Strategic Discounting: Use discounts more strategically in categories where they drive significant volume increases (like Clothing), rather than applying them uniformly.

### Seasonal Planning: Prepare inventory and marketing campaigns for peak seasons identified in the monthly trends.

### Subcategory Optimization: Within top categories, focus on the highest-performing subcategories while evaluating underperforming ones.