# Retail Transaction Insights Analysis

This notebook provides a comprehensive analysis of retail transaction data, covering:
- Data loading and preparation
- Exploratory data analysis
- Customer behavior insights
- Promotion effectiveness analysis
- Seasonal trends and patterns

---

## 1. Setup and Imports

In [None]:
# Standard library imports
import sys
import warnings
from pathlib import Path

# Data manipulation and analysis
import pandas as pd
import numpy as np

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Custom modules
sys.path.append('..')
from modules import data_processor, analysis, visualizations

# Configuration
# NOTE(review): this silences every warning category for the whole session,
# including deprecation notices from pandas/seaborn; consider narrowing it.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)  # never truncate wide frames
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', lambda x: '%.2f' % x)  # two-decimal floats

# Setup plot style (defined in the project's visualizations module)
visualizations.setup_plot_style()

# The check mark was stored as mis-decoded UTF-8; emit the real character.
print("✓ All libraries imported successfully")

In [None]:
# Configuration parameters: input CSV, folder that receives saved figures,
# and the name of the transaction-date column.
DATA_FILE = '../Retail_Transactions_Dataset.csv'
OUTPUT_DIR = '../outputs'
DATE_COLUMN = 'Date'

# Create output directory if it doesn't exist
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

# Check mark repaired from mis-decoded UTF-8; no f-prefix needed without fields.
print("✓ Configuration set")
print(f"  Data file: {DATA_FILE}")
print(f"  Output directory: {OUTPUT_DIR}")

---
## 2. Data Loading and Preparation

### 2.1 Load Dataset

In [None]:
# Load the retail transactions dataset via the project's loader.
df = data_processor.load_data(DATA_FILE)
# Status glyphs (check mark, multiplication sign) repaired from mojibake.
print("✓ Dataset loaded successfully")
print(f"  Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")

### 2.2 Initial Data Exploration

In [None]:
# Banner first, then let DataFrame.info() write its own column/dtype summary.
print("Dataset Information:", "=" * 80, sep="\n")
df.info()

In [None]:
# Preview the first five raw rows; the trailing expression is rendered by the notebook.
print("\nFirst 5 rows of the dataset:")
df.head(n=5)

In [None]:
# Summary statistics of the raw frame; the trailing expression is what the
# notebook renders as the cell output.
print("\nBasic Statistics:")
df.describe()

### 2.3 Data Cleaning

In [None]:
# Clean the data. A copy is passed so the raw frame `df` stays untouched
# and this cell can be re-run safely.
print("Cleaning data...\n")
df_clean = data_processor.clean_data(df.copy())
# Status glyphs repaired from mis-decoded UTF-8.
print("\n✓ Data cleaning complete")
print(f"  Shape after cleaning: {df_clean.shape[0]:,} rows × {df_clean.shape[1]} columns")

### 2.4 Extract Date Features

In [None]:
# Extract temporal features from date column.
# NOTE(review): this rebinds `df_clean` to the helper's result; re-running the
# cell feeds the already-augmented frame back in — presumably harmless, confirm.
df_clean = data_processor.extract_date_features(df_clean, DATE_COLUMN)
print("✓ Date features extracted")
print("  New columns: Year, Month, Day, DayOfWeek, Quarter")
print(f"\nDate range: {df_clean[DATE_COLUMN].min()} to {df_clean[DATE_COLUMN].max()}")

### 2.5 Data Quality Summary

In [None]:
# Display data quality summary
print("Data Quality Report:")
print("=" * 80)

# Missing values: per-column null count and its share of all rows
missing = df_clean.isnull().sum()
missing_pct = (missing / len(df_clean)) * 100
missing_df = pd.DataFrame({
    'Missing Count': missing,
    'Percentage': missing_pct
})
# Keep only columns that actually have gaps, worst first
missing_df = missing_df[missing_df['Missing Count'] > 0].sort_values('Missing Count', ascending=False)

if not missing_df.empty:
    print("\nMissing Values:")
    print(missing_df)
else:
    # Check mark repaired from mis-decoded UTF-8.
    print("\n✓ No missing values found")

# Duplicates: rows identical across every column
duplicates = df_clean.duplicated().sum()
print(f"\nDuplicate Rows: {duplicates}")

# Data types
print("\nData Types:")
print(df_clean.dtypes)

In [None]:
# Show the first ten cleaned rows, including the derived date columns.
print("\nCleaned Data Sample (with date features):")
df_clean.head(n=10)

---
## 3. Exploratory Data Analysis

In this section, we explore the dataset to understand key patterns and distributions.

### 3.1 Descriptive Statistics

In [None]:
# Headline figures come from the project's descriptive_statistics helper,
# which returns a mapping keyed by metric name.
stats = analysis.descriptive_statistics(df_clean)

print("Descriptive Statistics Summary:", "=" * 80, sep="\n")

# Volume and value
print(f"\nTotal Transactions: {stats['total_transactions']:,}")
print(f"Total Revenue: ${stats['total_revenue']:,.2f}")
print(f"Average Transaction Value: ${stats['avg_transaction_value']:.2f}")

# Cardinalities
print(f"\nUnique Customers: {stats['unique_customers']:,}")
print(f"Unique Products: {stats['unique_products']:,}")
print(f"Unique Cities: {stats['unique_cities']:,}")

### 3.2 Top Products Analysis

In [None]:
# Rank products through the project helper (top 10 requested) and show the table.
top_products = analysis.top_products_analysis(df_clean, top_n=10)

print("Top 10 Products by Revenue:", "=" * 80, sep="\n")
display(top_products)

In [None]:
# Visualize top products and persist the figure next to the other outputs.
visualizations.plot_top_products(top_products, metric='TotalRevenue', top_n=10)
plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/top_products.png', dpi=300, bbox_inches='tight')
plt.show()

# Glyphs below were stored as mis-decoded UTF-8; emit the real characters.
print("\n📊 Key Insights:")
# Reads the first row of the table — assumes the helper returns it ranked by revenue.
print(f"  • The top product generates ${top_products.iloc[0]['TotalRevenue']:,.2f} in revenue")
print("  • Top 10 products account for a significant portion of total sales")
print("  • Product mix shows diverse performance across categories")

### 3.3 Top Cities Analysis

In [None]:
# Rank cities through the project helper (top 10 requested) and show the table.
top_cities = analysis.top_cities_analysis(df_clean, top_n=10)

print("Top 10 Cities by Revenue:", "=" * 80, sep="\n")
display(top_cities)

In [None]:
# Visualize top cities and persist the figure.
visualizations.plot_top_cities(top_cities, metric='TotalRevenue', top_n=10)
plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/top_cities.png', dpi=300, bbox_inches='tight')
plt.show()

# Glyphs below were stored as mis-decoded UTF-8; emit the real characters.
print("\n📊 Key Insights:")
# Reads the first row of the table — assumes the helper returns it ranked by revenue.
print(f"  • {top_cities.iloc[0]['City']} is the top performing city with ${top_cities.iloc[0]['TotalRevenue']:,.2f} in revenue")
print("  • Geographic distribution shows concentration in certain markets")
print("  • Average transaction values vary across cities")

### 3.4 Category Distribution Analysis

In [None]:
# Per-category totals: transaction count, revenue and units, highest revenue first.
# Named aggregation yields the final column names directly.
category_stats = (
    df_clean.groupby('Category')
    .agg(
        TransactionCount=('TransactionID', 'count'),
        TotalRevenue=('TotalAmount', 'sum'),
        TotalQuantity=('Quantity', 'sum'),
    )
    .sort_values('TotalRevenue', ascending=False)
)

print("Category Performance:", "=" * 80, sep="\n")
display(category_stats)

In [None]:
# Visualize category distribution: two side-by-side bar charts.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Revenue by category
category_stats['TotalRevenue'].plot(kind='bar', ax=axes[0], color='steelblue')
axes[0].set_title('Revenue by Category', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Category')
axes[0].set_ylabel('Total Revenue ($)')
axes[0].tick_params(axis='x', rotation=45)

# Transaction count by category
category_stats['TransactionCount'].plot(kind='bar', ax=axes[1], color='coral')
axes[1].set_title('Transaction Count by Category', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Category')
axes[1].set_ylabel('Number of Transactions')
axes[1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/category_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

# Glyphs below were stored as mis-decoded UTF-8; emit the real characters.
print("\n📊 Key Insights:")
# category_stats is sorted by revenue descending, so index[0] is the leader.
print(f"  • {category_stats.index[0]} is the top category by revenue")
print(f"  • Total categories: {len(category_stats)}")
print("  • Category performance varies significantly")

### 3.5 Store Type Distribution Analysis

In [None]:
# Per-store-type totals and mean ticket, rounded to cents, highest revenue first.
# Named aggregation produces the flat column names in one step.
store_stats = (
    df_clean.groupby('StoreType')
    .agg(
        TransactionCount=('TransactionID', 'count'),
        TotalRevenue=('TotalAmount', 'sum'),
        AvgTransactionValue=('TotalAmount', 'mean'),
        TotalQuantity=('Quantity', 'sum'),
    )
    .round(2)
    .sort_values('TotalRevenue', ascending=False)
)

print("Store Type Performance:", "=" * 80, sep="\n")
display(store_stats)

In [None]:
# Visualize store type comparison; reset_index() turns StoreType back into a column.
visualizations.plot_store_type_comparison(store_stats.reset_index())
plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/store_type_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

# Glyphs below were stored as mis-decoded UTF-8; emit the real characters.
print("\n📊 Key Insights:")
# store_stats is sorted by revenue descending, so index[0] is the leader.
print(f"  • {store_stats.index[0]} stores generate the highest revenue")
print("  • Average transaction values differ across store types")
print("  • Store type preferences indicate customer shopping behavior patterns")

---
## 4. Customer Insights Analysis

Understanding customer behavior, spending patterns, and preferences.

### 4.1 Customer Spending Analysis

In [None]:
# Customer-level spending metrics come from the project helper as a mapping.
customer_analysis = analysis.customer_spending_analysis(df_clean)

print("Customer Spending Analysis:", "=" * 80, sep="\n")

# Per-transaction view
print(f"\nAverage Transaction Value: ${customer_analysis['avg_transaction_value']:.2f}")
print(f"Median Transaction Value: ${customer_analysis['median_transaction_value']:.2f}")

# Per-customer view
print(f"\nTotal Customers: {customer_analysis['total_customers']:,}")
print(f"Average Spending per Customer: ${customer_analysis['avg_spending_per_customer']:.2f}")
print(f"Average Transactions per Customer: {customer_analysis['avg_transactions_per_customer']:.2f}")

### 4.2 Customer Segmentation

In [None]:
# Pull the segment table out of the spending analysis and render it.
segments = customer_analysis['customer_segments']

print("\nCustomer Segmentation:", "=" * 80, sep="\n")
display(segments)

In [None]:
# Visualize customer segments and persist the figure.
visualizations.plot_customer_segments(segments)
plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/customer_segments.png', dpi=300, bbox_inches='tight')
plt.show()

# Glyphs below were stored as mis-decoded UTF-8; emit the real characters.
print("\n📊 Key Insights:")
# Guard the lookup: .iloc[0] on an empty selection raises IndexError, which
# would abort the cell if no customer landed in the High-Value segment.
high_value_rows = segments[segments['Segment'] == 'High-Value']
if high_value_rows.empty:
    print("  • No customers were assigned to the High-Value segment")
else:
    high_value = high_value_rows.iloc[0]
    print(f"  • High-value customers: {high_value['CustomerCount']:,} ({high_value['Percentage']:.1f}%)")
    print(f"  • High-value segment generates ${high_value['TotalRevenue']:,.2f} in revenue")
    print(f"  • Average spending per high-value customer: ${high_value['AvgSpending']:.2f}")

### 4.3 Spending Distribution

In [None]:
# Visualize spending distribution from the per-customer spending frame.
visualizations.plot_spending_distribution(customer_analysis['customer_spending_df'])
plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/spending_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

# Glyphs below were stored as mis-decoded UTF-8; emit the real characters.
print("\n📊 Key Insights:")
print("  • Spending distribution shows customer value concentration")
print("  • Most customers fall within the medium spending range")
print("  • Opportunity to move medium-value customers to high-value segment")

### 4.4 Store Type Preferences by Customer Segment

In [None]:
# Store-type preference table comes from the project helper.
store_preferences = analysis.store_type_preference_analysis(df_clean)

print("Store Type Preference Analysis:", "=" * 80, sep="\n")
display(store_preferences)

In [None]:
# Visualize store type preferences as grouped bars (one bar per column).
# NOTE(review): three colours are supplied — presumably the table has three
# metric columns; confirm against store_type_preference_analysis.
fig, ax = plt.subplots(figsize=(10, 6))
store_preferences.plot(kind='bar', ax=ax, color=['steelblue', 'coral', 'lightgreen'])
ax.set_title('Store Type Performance Metrics', fontsize=14, fontweight='bold')
ax.set_xlabel('Store Type')
ax.set_ylabel('Value')
ax.legend(title='Metrics')
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/store_preferences.png', dpi=300, bbox_inches='tight')
plt.show()

# Glyphs below were stored as mis-decoded UTF-8; emit the real characters.
print("\n📊 Key Insights:")
print("  • Different store types attract different customer segments")
print("  • Store type preferences correlate with spending patterns")
print("  • Opportunity for targeted marketing by store type")

### 4.5 Purchase Frequency Metrics

In [None]:
# Purchase-frequency headline numbers from the spending analysis.
print("Purchase Frequency Metrics:", "=" * 80, sep="\n")
print(f"\nRepeat Purchase Rate: {customer_analysis['repeat_purchase_rate']:.2f}%")
print(f"Average Visit Frequency: {customer_analysis['avg_visit_frequency']:.2f} visits per customer")

# How many customers made 1, 2, 3, ... transactions: count customers per
# TransactionCount value, order by that value, keep the first ten.
frequency_dist = (
    customer_analysis['customer_spending_df']['TransactionCount']
    .value_counts()
    .sort_index()
    .head(10)
)
print("\nTransaction Frequency Distribution (Top 10):")
display(frequency_dist.to_frame('CustomerCount'))

### 4.6 Product Preferences by Customer Segment

In [None]:
# Analyze product preferences by customer segment: tag every transaction
# with its customer's segment, then total revenue/transactions per
# (Segment, Category) pair.
customer_spending_df = customer_analysis['customer_spending_df']
df_with_segments = df_clean.merge(customer_spending_df[['CustomerID', 'Segment']], on='CustomerID', how='left')

# Top products by segment
segment_products = df_with_segments.groupby(['Segment', 'Category']).agg({
    'TotalAmount': 'sum',
    'TransactionID': 'count'
}).rename(columns={'TotalAmount': 'Revenue', 'TransactionID': 'Transactions'})

print("Product Category Preferences by Customer Segment:")
print("=" * 80)

# Segments actually present in the grouped result; .loc[...] on a missing
# first-level key raises KeyError and would abort the whole cell.
available_segments = set(segment_products.index.get_level_values('Segment'))

for segment in ['High-Value', 'Medium-Value', 'Low-Value']:
    print(f"\n{segment} Customers - Top 5 Categories:")
    if segment not in available_segments:
        print("  (no transactions found for this segment)")
        continue
    top_cats = segment_products.loc[segment].sort_values('Revenue', ascending=False).head(5)
    display(top_cats)

# Glyphs below were stored as mis-decoded UTF-8; emit the real characters.
print("\n📊 Key Insights:")
print("  • High-value customers show distinct product preferences")
print("  • Category preferences vary significantly across segments")
print("  • Opportunity for personalized product recommendations")

---
## 5. Promotion and Discount Effectiveness Analysis

Evaluating the impact of promotions and discounts on sales performance.

### 5.1 Discount vs Non-Discount Comparison

In [None]:
# Promotion metrics come from the project helper as a mapping of results.
promo_analysis = analysis.promotion_effectiveness_analysis(df_clean)

print("Promotion Effectiveness Analysis:", "=" * 80, sep="\n")

# Side-by-side table of discounted vs. non-discounted transactions
comparison = promo_analysis['discount_comparison']
print("\nDiscount vs Non-Discount Transactions:")
display(comparison)

In [None]:
# Calculate key metrics for the discounted and non-discounted groups.
# Each selection may be empty (e.g. a dataset with no discounts at all), and
# .iloc[0] on an empty frame raises IndexError, so fall back to None.
discount_rows = comparison[comparison['HasDiscount'].eq(True)]
no_discount_rows = comparison[comparison['HasDiscount'].eq(False)]
discount_metrics = discount_rows.iloc[0] if not discount_rows.empty else None
no_discount_metrics = no_discount_rows.iloc[0] if not no_discount_rows.empty else None

print("\nKey Metrics:")
print("=" * 80)

# Bullet glyphs below were stored as mis-decoded UTF-8; emit the real character.
print("\nWith Discount:")
if discount_metrics is None:
    print("  • No discounted transactions found")
else:
    print(f"  • Transactions: {discount_metrics['TransactionCount']:,}")
    print(f"  • Total Revenue: ${discount_metrics['TotalRevenue']:,.2f}")
    print(f"  • Avg Transaction Value: ${discount_metrics['AvgTransactionValue']:.2f}")
    print(f"  • Avg Discount: {discount_metrics['AvgDiscount']:.2f}%")

print("\nWithout Discount:")
if no_discount_metrics is None:
    print("  • No non-discounted transactions found")
else:
    print(f"  • Transactions: {no_discount_metrics['TransactionCount']:,}")
    print(f"  • Total Revenue: ${no_discount_metrics['TotalRevenue']:,.2f}")
    print(f"  • Avg Transaction Value: ${no_discount_metrics['AvgTransactionValue']:.2f}")

### 5.2 Discount Analysis Visualization

In [None]:
# Draw the discount comparison chart, tidy the layout, save a 300-dpi copy, show it.
visualizations.plot_discount_analysis(promo_analysis)
plt.tight_layout()
plt.savefig(
    f'{OUTPUT_DIR}/discount_comparison.png',
    bbox_inches='tight',
    dpi=300,
)
plt.show()

### 5.3 Discount Level Impact Analysis

In [None]:
# Per-discount-range table taken from the promotion analysis results.
discount_levels = promo_analysis['discount_level_analysis']

print("\nDiscount Level Impact:", "=" * 80, sep="\n")
display(discount_levels)

In [None]:
# Visualize discount vs sales relationship, restricted to rows that
# actually carry a discount (Discount > 0).
visualizations.plot_discount_vs_sales(df_clean[df_clean['Discount'] > 0])
plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/discount_vs_sales.png', dpi=300, bbox_inches='tight')
plt.show()

# Glyphs below were stored as mis-decoded UTF-8; emit the real characters.
print("\n📊 Key Insights:")
print("  • Discount levels show varying impact on sales volume")
print("  • Optimal discount range identified for maximum effectiveness")
print("  • Higher discounts don't always correlate with higher sales")

### 5.4 Promotion Effectiveness Metrics

In [None]:
# Headline promotion metrics
print("Promotion Effectiveness Metrics:", "=" * 80, sep="\n")
print(f"\nPromotion Lift: {promo_analysis['promotion_lift']:.2f}%")
print(f"Discount Penetration: {promo_analysis['discount_penetration']:.2f}%")
print(f"Average Discount Amount: ${promo_analysis['avg_discount_amount']:.2f}")

# ROI proxy: net gain over discount cost, as a percentage.
# Defined as 0 when there is no positive discount cost to divide by.
total_discount_cost = promo_analysis['total_discount_cost']
incremental_revenue = promo_analysis['incremental_revenue']
if total_discount_cost > 0:
    roi = ((incremental_revenue - total_discount_cost) / total_discount_cost) * 100
else:
    roi = 0

print(f"\nTotal Discount Cost: ${total_discount_cost:,.2f}")
print(f"Estimated Incremental Revenue: ${incremental_revenue:,.2f}")
print(f"Promotion ROI: {roi:.2f}%")

### 5.5 Products Benefiting Most from Promotions

In [None]:
# Identify products that benefit most from discounts (first ten rows of the
# helper's table — assumes it is already ranked; confirm in the analysis module).
top_promo_products = promo_analysis['top_promotion_products']

print("\nTop 10 Products Benefiting from Promotions:")
print("=" * 80)
display(top_promo_products.head(10))

# Glyphs below were stored as mis-decoded UTF-8; emit the real characters.
print("\n📊 Key Insights:")
print("  • Certain products show significantly higher sales with discounts")
print("  • Product-specific promotion strategies can be optimized")
print("  • Some products maintain strong sales without discounts")

### 5.6 Promotional Strategy Recommendations

In [None]:
# Generate promotional recommendations from the metrics computed in the
# preceding cells (`promo_analysis`, `roi`, `discount_levels`).
print("Promotional Strategy Recommendations:")
print("=" * 80)

recommendations = []

# Status glyphs (check mark / warning sign) were stored as mis-decoded UTF-8.
if promo_analysis['promotion_lift'] > 0:
    recommendations.append(f"✓ Promotions are effective with {promo_analysis['promotion_lift']:.1f}% lift in transaction volume")
else:
    recommendations.append("⚠ Promotions show limited effectiveness - review strategy")

# Distinguish a genuinely negative ROI from a zero/undefined one: the ROI
# cell reports 0 when there is no discount cost, which is not a loss.
if roi > 0:
    recommendations.append(f"✓ Positive ROI of {roi:.1f}% indicates profitable promotion strategy")
elif roi < 0:
    recommendations.append("⚠ Negative ROI - consider reducing discount depth or frequency")
else:
    recommendations.append("⚠ ROI is zero or could not be computed - verify discount cost data")

# Discount range with the largest total revenue
optimal_discount = discount_levels.loc[discount_levels['TotalRevenue'].idxmax(), 'DiscountRange']
recommendations.append(f"✓ Optimal discount range: {optimal_discount}")

recommendations.append("✓ Focus promotions on high-performing products identified above")
recommendations.append("✓ Consider targeted promotions for customer segments")
recommendations.append("✓ Monitor promotion frequency to avoid margin erosion")

for i, rec in enumerate(recommendations, 1):
    print(f"\n{i}. {rec}")

---
## 6. Seasonal Trends and Patterns Analysis

Identifying temporal patterns in sales, including seasonal trends and day-of-week patterns.

### 6.1 Monthly and Quarterly Trends

In [None]:
# Temporal breakdowns come from the project helper as a mapping of tables.
seasonal_analysis = analysis.seasonal_trends_analysis(df_clean)

print("Seasonal Trends Analysis:", "=" * 80, sep="\n")

# Month-level table
monthly_trends = seasonal_analysis['monthly_trends']
print("\nMonthly Sales Trends:")
display(monthly_trends)

In [None]:
# Visualize monthly trends and persist the figure.
visualizations.plot_sales_trends(df_clean, time_period='monthly')
plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/monthly_trends.png', dpi=300, bbox_inches='tight')
plt.show()

# Glyphs below were stored as mis-decoded UTF-8; emit the real characters.
print("\n📊 Key Insights:")
# Month label of the row with the highest total revenue
peak_month = monthly_trends.loc[monthly_trends['TotalRevenue'].idxmax(), 'Month']
print(f"  • Peak sales month: {peak_month}")
print("  • Clear seasonal patterns visible in monthly data")
print("  • Revenue fluctuations indicate seasonal demand")

In [None]:
# Display quarterly trends: pull the quarter-level table out of the seasonal
# analysis results and render it with the notebook's rich display.
quarterly_trends = seasonal_analysis['quarterly_trends']
print("\nQuarterly Sales Trends:")
display(quarterly_trends)

In [None]:
# Visualize quarterly trends and persist the figure.
visualizations.plot_sales_trends(df_clean, time_period='quarterly')
plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/quarterly_trends.png', dpi=300, bbox_inches='tight')
plt.show()

# Glyphs below were stored as mis-decoded UTF-8; emit the real characters.
print("\n📊 Key Insights:")
# Quarter label of the row with the highest total revenue
peak_quarter = quarterly_trends.loc[quarterly_trends['TotalRevenue'].idxmax(), 'Quarter']
print(f"  • Peak sales quarter: Q{peak_quarter}")
print("  • Quarterly patterns show business cycles")
print("  • Opportunity for seasonal inventory planning")

### 6.2 Seasonal Heatmap

In [None]:
# Create seasonal heatmap and persist the figure.
visualizations.plot_seasonal_heatmap(df_clean)
plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/seasonal_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

# Glyphs below were stored as mis-decoded UTF-8; emit the real characters.
print("\n📊 Key Insights:")
print("  • Heatmap reveals patterns across months and days of week")
print("  • Certain day-month combinations show peak activity")
print("  • Visual pattern helps identify optimal promotion timing")

### 6.3 Day-of-Week Patterns

In [None]:
# Day-of-week table taken from the seasonal analysis results.
dow_patterns = seasonal_analysis['day_of_week_patterns']

print("\nDay-of-Week Sales Patterns:", "=" * 80, sep="\n")
display(dow_patterns)

In [None]:
# Visualize day-of-week patterns and persist the figure.
visualizations.plot_day_of_week_patterns(dow_patterns)
plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/day_of_week_patterns.png', dpi=300, bbox_inches='tight')
plt.show()

# Glyphs below were stored as mis-decoded UTF-8; emit the real characters.
print("\n📊 Key Insights:")
# Day labels of the rows with the highest and lowest total revenue
peak_day = dow_patterns.loc[dow_patterns['TotalRevenue'].idxmax(), 'DayOfWeek']
low_day = dow_patterns.loc[dow_patterns['TotalRevenue'].idxmin(), 'DayOfWeek']
print(f"  • Peak sales day: {peak_day}")
print(f"  • Lowest sales day: {low_day}")
print("  • Day-of-week patterns suggest optimal staffing and inventory levels")
print("  • Weekend vs weekday patterns are distinct")

### 6.4 Seasonal Product Preferences

In [None]:
# Analyze seasonal product preferences: per quarter, the five categories
# with the highest revenue.
seasonal_products = seasonal_analysis['seasonal_product_preferences']

print("\nSeasonal Product Preferences:")
print("=" * 80)

for quarter in sorted(seasonal_products['Quarter'].unique()):
    print(f"\nQ{quarter} - Top 5 Categories:")
    # Rank explicitly by revenue: taking the first five rows would only be a
    # "top 5" if the upstream helper happened to return rows pre-sorted.
    quarter_data = (
        seasonal_products[seasonal_products['Quarter'] == quarter]
        .nlargest(5, 'TotalRevenue')
    )
    display(quarter_data[['Category', 'TotalRevenue', 'TransactionCount']])

# Glyphs below were stored as mis-decoded UTF-8; emit the real characters.
print("\n📊 Key Insights:")
print("  • Product preferences shift across seasons")
print("  • Certain categories show strong seasonal patterns")
print("  • Opportunity for seasonal merchandising strategies")

### 6.5 Year-over-Year Comparison

In [None]:
# Year-over-year comparison (if multiple years exist).
# .get() covers both "key absent" and "key present but None" in one lookup.
yoy_data = seasonal_analysis.get('yoy_comparison')
if yoy_data is not None:
    print("\nYear-over-Year Comparison:")
    print("=" * 80)
    display(yoy_data)

    # Glyphs below were stored as mis-decoded UTF-8; emit the real characters.
    print("\n📊 Key Insights:")
    print("  • Year-over-year growth trends identified")
    print("  • Business performance trajectory visible")
    print("  • Growth opportunities and challenges highlighted")
else:
    print("\nYear-over-Year Comparison:")
    print("=" * 80)
    print("Data contains single year - YoY comparison not applicable")

    # Show year summary instead: transactions, revenue and distinct customers per year
    year_summary = df_clean.groupby('Year').agg({
        'TransactionID': 'count',
        'TotalAmount': 'sum',
        'CustomerID': 'nunique'
    }).rename(columns={
        'TransactionID': 'TotalTransactions',
        'TotalAmount': 'TotalRevenue',
        'CustomerID': 'UniqueCustomers'
    })
    print("\nYear Summary:")
    display(year_summary)

---
## 7. Summary and Recommendations

Comprehensive summary of key findings and actionable recommendations.

### 7.1 Executive Summary

In [None]:
# Executive summary: headline numbers recomputed directly from df_clean.
print("="*80)
print("EXECUTIVE SUMMARY")
print("="*80)

# Recalculate key metrics for summary
total_transactions = len(df_clean)
total_revenue = df_clean['TotalAmount'].sum()
avg_transaction_value = df_clean['TotalAmount'].mean()
unique_customers = df_clean['CustomerID'].nunique()
unique_products = df_clean['ProductName'].nunique()

# Bullet glyphs below were stored as mis-decoded UTF-8; emit the real character.
print("\nDataset Overview:")
print(f"  • Total Transactions: {total_transactions:,}")
print(f"  • Total Revenue: ${total_revenue:,.2f}")
print(f"  • Average Transaction Value: ${avg_transaction_value:.2f}")
print(f"  • Unique Customers: {unique_customers:,}")
print(f"  • Unique Products: {unique_products:,}")
# Use the configured DATE_COLUMN rather than a second hard-coded 'Date' so the
# summary follows the same column as the date-feature extraction step.
# .strftime assumes the column holds datetime values.
print(f"  • Date Range: {df_clean[DATE_COLUMN].min().strftime('%Y-%m-%d')} to {df_clean[DATE_COLUMN].max().strftime('%Y-%m-%d')}")

### 7.2 Key Findings by Analysis Area

In [None]:
# Static narrative: key findings grouped by analysis area.
# Bullet glyphs were stored as mis-decoded UTF-8; emit the real character.
print("\n" + "="*80)
print("KEY FINDINGS")
print("="*80)

print("\n1. EXPLORATORY DATA ANALYSIS")
print("-" * 80)
print("   • Product Performance: Top products drive significant revenue concentration")
print("   • Geographic Distribution: Sales are concentrated in specific cities")
print("   • Category Analysis: Certain product categories significantly outperform others")
print("   • Store Type Variation: Different store types show distinct performance patterns")

print("\n2. CUSTOMER INSIGHTS")
print("-" * 80)
print("   • Customer Segmentation: Clear high-value, medium-value, and low-value segments identified")
print("   • Spending Patterns: High-value customers represent a small percentage but generate disproportionate revenue")
print("   • Purchase Frequency: Repeat purchase rates indicate customer loyalty levels")
print("   • Store Preferences: Customer segments show distinct store type preferences")
print("   • Product Preferences: Different customer segments prefer different product categories")

print("\n3. PROMOTION EFFECTIVENESS")
print("-" * 80)
print("   • Discount Impact: Discounts drive transaction volume but affect average transaction value")
print("   • Optimal Discount Levels: Specific discount ranges show maximum effectiveness")
print("   • Product-Specific Response: Certain products benefit more from promotions than others")
print("   • ROI Analysis: Promotion ROI varies by discount level and product category")
print("   • Penetration Rate: Significant portion of transactions involve discounts")

print("\n4. SEASONAL TRENDS")
print("-" * 80)
print("   • Monthly Patterns: Clear seasonal peaks and troughs in sales volume")
print("   • Quarterly Trends: Specific quarters show stronger performance")
print("   • Day-of-Week Patterns: Transaction volume varies significantly by day")
print("   • Seasonal Products: Product preferences shift across seasons")
print("   • Planning Implications: Seasonal patterns enable better inventory and staffing planning")

### 7.3 Actionable Recommendations

In [None]:
# Static narrative: recommendations grouped by business area.
# Check-mark glyphs were stored as mis-decoded UTF-8; emit the real character.
print("\n" + "="*80)
print("ACTIONABLE RECOMMENDATIONS")
print("="*80)

print("\n1. CUSTOMER STRATEGY")
print("-" * 80)
print("   ✓ Focus retention efforts on high-value customers (top revenue generators)")
print("   ✓ Develop targeted campaigns to move medium-value customers to high-value segment")
print("   ✓ Implement personalized product recommendations based on segment preferences")
print("   ✓ Create loyalty programs tailored to each customer segment")
print("   ✓ Increase engagement with low-frequency customers through targeted outreach")

print("\n2. PRODUCT & INVENTORY MANAGEMENT")
print("-" * 80)
print("   ✓ Prioritize inventory for top-performing products and categories")
print("   ✓ Adjust stock levels based on seasonal demand patterns")
print("   ✓ Optimize product mix by store type based on performance data")
print("   ✓ Phase out or discount slow-moving products identified in analysis")
print("   ✓ Plan seasonal merchandising strategies based on quarterly trends")

print("\n3. PROMOTIONAL STRATEGY")
print("-" * 80)
print("   ✓ Focus promotions on products that show highest response to discounts")
print("   ✓ Implement optimal discount levels identified in analysis")
print("   ✓ Avoid over-discounting products that sell well without promotions")
print("   ✓ Time promotions to align with seasonal demand patterns")
print("   ✓ Monitor promotion ROI continuously and adjust strategy accordingly")
print("   ✓ Test targeted promotions by customer segment for better effectiveness")

print("\n4. OPERATIONAL OPTIMIZATION")
print("-" * 80)
print("   ✓ Adjust staffing levels based on day-of-week transaction patterns")
print("   ✓ Optimize store operations by type based on performance metrics")
print("   ✓ Allocate resources to high-performing geographic markets")
print("   ✓ Plan inventory replenishment aligned with seasonal trends")
print("   ✓ Implement dynamic pricing strategies based on demand patterns")

print("\n5. GROWTH OPPORTUNITIES")
print("-" * 80)
print("   ✓ Expand presence in top-performing cities and categories")
print("   ✓ Replicate success factors from high-performing stores to others")
print("   ✓ Develop new products in high-demand categories")
print("   ✓ Explore cross-selling opportunities based on customer purchase patterns")
print("   ✓ Invest in customer acquisition in underserved segments")

### 7.4 Assumptions and Methodology

In [None]:
# Static narrative: data assumptions and methodology notes.
# Bullet glyphs were stored as mis-decoded UTF-8; emit the real character.
# NOTE(review): the segment cut-offs printed below (20/50/30%) are not computed
# in this notebook — confirm they match the analysis module.
print("\n" + "="*80)
print("ASSUMPTIONS AND METHODOLOGY")
print("="*80)

print("\nDATA ASSUMPTIONS:")
print("-" * 80)
print("   1. Dataset Completeness: Assumed the dataset represents complete transaction history")
print("   2. Data Accuracy: Assumed transaction amounts, dates, and IDs are accurate")
print("   3. Customer Identification: Assumed CustomerID consistently identifies unique customers")
print("   4. Product Identification: Assumed ProductID and ProductName are consistent")
print("   5. Discount Representation: Assumed discount values are percentages (0-100)")
print("   6. Currency: Assumed all monetary values are in the same currency")
print("   7. Time Period: Analysis covers the complete date range in the dataset")

print("\nMETHODOLOGY:")
print("-" * 80)
print("   1. Data Cleaning:")
print("      • Handled missing values using appropriate strategies (drop, impute, or flag)")
print("      • Removed duplicate transactions based on TransactionID")
print("      • Converted data types for optimal processing")
print("      • Identified and flagged outliers without automatic removal")

print("\n   2. Feature Engineering:")
print("      • Extracted temporal features: Year, Month, Day, DayOfWeek, Quarter")
print("      • Created discount flags and calculated discount amounts")
print("      • Derived customer-level aggregations for segmentation")

print("\n   3. Customer Segmentation:")
print("      • Used quantile-based segmentation on total customer spending")
print("      • High-Value: Top 20% of customers by spending")
print("      • Medium-Value: Middle 50% of customers")
print("      • Low-Value: Bottom 30% of customers")

print("\n   4. Statistical Analysis:")
print("      • Calculated descriptive statistics for all numerical variables")
print("      • Used aggregation functions (sum, mean, count) for group analysis")
print("      • Applied ranking methods for top-N analyses")

print("\n   5. Promotion Analysis:")
print("      • Compared transactions with and without discounts")
print("      • Calculated promotion lift as percentage increase in transaction volume")
print("      • Estimated ROI using discount cost vs. incremental revenue")

print("\n   6. Visualization:")
print("      • Used appropriate chart types for different data patterns")
print("      • Applied consistent styling and color schemes")
print("      • Ensured all charts have clear titles, labels, and legends")

### 7.5 Design Choices and Rationale

In [None]:
# Static narrative: design choices and the rationale behind each.
# Bullet glyphs were stored as mis-decoded UTF-8; emit the real character.
print("\n" + "="*80)
print("DESIGN CHOICES AND RATIONALE")
print("="*80)

print("\n1. MODULAR ARCHITECTURE")
print("-" * 80)
print("   Choice: Separated code into three modules (data_processor, analysis, visualizations)")
print("   Rationale:")
print("      • Improves code reusability and maintainability")
print("      • Enables easier testing of individual components")
print("      • Follows separation of concerns principle")
print("      • Makes code more readable and organized")

print("\n2. JUPYTER NOTEBOOK FORMAT")
print("-" * 80)
print("   Choice: Used Jupyter notebook for main analysis")
print("   Rationale:")
print("      • Combines code, visualizations, and narrative in one document")
print("      • Enables interactive exploration and iteration")
print("      • Easy to export to PDF for reporting")
print("      • Familiar format for data analysis stakeholders")

print("\n3. PANDAS FOR DATA MANIPULATION")
print("-" * 80)
print("   Choice: Used pandas as primary data manipulation library")
print("   Rationale:")
print("      • Industry standard for data analysis in Python")
print("      • Rich functionality for data operations")
print("      • Excellent performance for medium-sized datasets")
print("      • Strong integration with visualization libraries")

print("\n4. VISUALIZATION APPROACH")
print("-" * 80)
print("   Choice: Combined matplotlib and seaborn for visualizations")
print("   Rationale:")
print("      • matplotlib provides fine-grained control")
print("      • seaborn offers better aesthetics and statistical plots")
print("      • Both are well-documented and widely used")
print("      • Consistent styling applied across all visualizations")

print("\n5. CUSTOMER SEGMENTATION METHOD")
print("-" * 80)
print("   Choice: Quantile-based segmentation on total spending")
print("   Rationale:")
print("      • Simple and interpretable approach")
print("      • Directly tied to business value (revenue)")
print("      • Easy to implement and explain to stakeholders")
print("      • Can be enhanced with RFM analysis if needed")

print("\n6. ERROR HANDLING STRATEGY")
print("-" * 80)
print("   Choice: Graceful error handling with informative messages")
print("   Rationale:")
print("      • Prevents analysis from failing on edge cases")
print("      • Provides clear feedback on data quality issues")
print("      • Enables partial results when complete analysis isn't possible")
print("      • Improves user experience and debugging")

### 7.6 Limitations and Future Analysis

In [None]:
# List the known limitations of this analysis and the follow-up work that could
# address them.
#
# Both sections share one layout (heading, rule, numbered groups of bullets), so
# the text is kept as nested data and rendered by a single loop. The bullet is
# written as a Unicode escape (U+2022) so the source stays pure ASCII; the
# literal character had been corrupted into mojibake by a UTF-8 / cp1252 mix-up.
bullet = "\u2022"
rule_width = 80

# Each section: (heading, [(group title, [bullet points]), ...]).
outlook_sections = [
    (
        "CURRENT LIMITATIONS:",
        [
            (
                "Data Scope",
                [
                    "Analysis limited to available transaction data",
                    "No external data (e.g., market trends, competitor data)",
                    "Missing customer demographic information",
                ],
            ),
            (
                "Temporal Analysis",
                [
                    "Limited historical data may affect trend reliability",
                    "Cannot predict future trends with high confidence",
                    "Seasonal patterns may not be fully established",
                ],
            ),
            (
                "Causality",
                [
                    "Analysis shows correlations, not causation",
                    "Cannot definitively attribute sales changes to specific factors",
                    "External factors (economy, competition) not accounted for",
                ],
            ),
            (
                "Customer Behavior",
                [
                    "No information on customer acquisition channels",
                    "Limited insight into customer journey",
                    "Cannot track customer lifetime value over extended periods",
                ],
            ),
            (
                "Product Information",
                [
                    "Limited product attributes beyond category",
                    "No cost data to calculate true profitability",
                    "Missing product lifecycle information",
                ],
            ),
        ],
    ),
    (
        "FUTURE ANALYSIS OPPORTUNITIES:",
        [
            (
                "Advanced Analytics",
                [
                    "Implement predictive models for sales forecasting",
                    "Apply machine learning for customer churn prediction",
                    "Develop recommendation systems for personalized marketing",
                    "Use clustering algorithms for more sophisticated segmentation",
                ],
            ),
            (
                "Enhanced Customer Analysis",
                [
                    "Implement RFM (Recency, Frequency, Monetary) analysis",
                    "Calculate true customer lifetime value (CLV)",
                    "Analyze customer journey and touchpoints",
                    "Study cross-selling and upselling opportunities",
                ],
            ),
            (
                "Deeper Product Insights",
                [
                    "Market basket analysis for product associations",
                    "Product affinity analysis",
                    "Profitability analysis (if cost data becomes available)",
                    "Product lifecycle analysis",
                ],
            ),
            (
                "Promotion Optimization",
                [
                    "A/B testing framework for promotion strategies",
                    "Price elasticity analysis",
                    "Optimal discount level modeling",
                    "Promotion cannibalization analysis",
                ],
            ),
            (
                "Geographic Expansion",
                [
                    "Market penetration analysis by city",
                    "Geographic expansion opportunity identification",
                    "Regional preference analysis",
                    "Store location optimization",
                ],
            ),
            (
                "Real-time Analytics",
                [
                    "Develop dashboards for real-time monitoring",
                    "Implement automated alerting for anomalies",
                    "Create KPI tracking systems",
                    "Build interactive visualization tools",
                ],
            ),
        ],
    ),
]

outlook_report_lines = [
    "\n" + "=" * rule_width,
    "LIMITATIONS AND FUTURE ANALYSIS",
    "=" * rule_width,
]
for heading, groups in outlook_sections:
    outlook_report_lines += [f"\n{heading}", "-" * rule_width]
    for number, (title, points) in enumerate(groups, start=1):
        # The first group sits directly under the rule; later groups are
        # separated from the previous one by a blank line.
        gap = "" if number == 1 else "\n"
        outlook_report_lines.append(f"{gap}   {number}. {title}:")
        outlook_report_lines += [f"      {bullet} {point}" for point in points]

# A single print of the joined lines emits the same text as one print() per line.
print("\n".join(outlook_report_lines))

### 7.7 Conclusion

In [None]:
# Print the closing summary: headline findings, the areas the recommendations
# target, and a final sign-off banner.
#
# The check mark (U+2713) and bullet (U+2022) are written as Unicode escapes so
# the source stays pure ASCII; the literal characters had been corrupted into
# mojibake by a UTF-8 / cp1252 mix-up.
check = "\u2713"
bullet = "\u2022"
rule_width = 80
rule = "=" * rule_width

# Each finding: (label, first output line, continuation line). The text is
# pre-wrapped into two lines so the printed layout keeps its original breaks.
key_findings = [
    (
        "CUSTOMER INSIGHTS",
        "Clear segmentation enables targeted strategies for different",
        "customer groups, with opportunities to increase value from medium-tier customers.",
    ),
    (
        "PRODUCT PERFORMANCE",
        "Top products and categories drive significant revenue,",
        "suggesting focus areas for inventory and marketing investments.",
    ),
    (
        "PROMOTIONAL EFFECTIVENESS",
        "Discounts drive volume but require careful management",
        "to maintain profitability. Optimal discount levels have been identified.",
    ),
    (
        "SEASONAL PATTERNS",
        "Clear temporal trends enable better planning for inventory,",
        "staffing, and promotional activities throughout the year.",
    ),
    (
        "GEOGRAPHIC OPPORTUNITIES",
        "Performance varies significantly by location, indicating",
        "opportunities for targeted expansion and market-specific strategies.",
    ),
]

improvement_areas = [
    "Customer retention and value growth",
    "Inventory optimization and product mix",
    "Promotional strategy and ROI",
    "Operational efficiency",
    "Revenue growth and market expansion",
]

conclusion_lines = [
    "\n" + rule,
    "CONCLUSION",
    rule,
    "\nThis comprehensive analysis of retail transaction data has revealed valuable insights",
    "across multiple dimensions of business performance:",
]
for label, first_line, second_line in key_findings:
    # Leading newline leaves a blank line before each finding; the continuation
    # line is indented two spaces to sit under the label text.
    conclusion_lines += [f"\n{check} {label}: {first_line}", f"  {second_line}"]

conclusion_lines.append(
    "\nThe actionable recommendations provided can drive immediate improvements in:"
)
conclusion_lines += [f"  {bullet} {area}" for area in improvement_areas]

conclusion_lines += [
    "\nThis analysis provides a solid foundation for data-driven decision making.",
    "Regular monitoring of these metrics and continuous refinement of strategies",
    "will be essential for sustained business growth and competitive advantage.",
    "\n" + rule,
    "END OF ANALYSIS",
    rule,
]

# A single print of the joined lines emits the same text as one print() per line.
print("\n".join(conclusion_lines))