In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [None]:

# Mount Google Drive inside the Colab runtime at /content/drive.
# This only works in Google Colab (google.colab is not available elsewhere).
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# Path of the sales CSV on the mounted Drive.
# NOTE: the file name really contains a space before ".csv" - keep it in sync with the file on Drive.
dataset_path = '/content/drive/MyDrive/Coffee Shop Sales .csv'

In [None]:
# pandas is already imported in the first cell; this repeated import is harmless.
import pandas as pd

# Read the CSV at dataset_path into df and preview the first rows.
df = pd.read_csv(dataset_path)  
df.head()


**🔍 Exploratory Data Analysis (EDA) Checklist**

In [None]:
# Total number of missing cells in the frame: per-column null counts, summed again into one number.
df.isnull().sum().sum()

In [None]:
# Print a per-column summary (names, dtypes, non-null counts) of the raw frame.
df.info()

In [None]:


# Descriptive statistics of the raw frame (run before any cleaning or derived columns).
df.describe()



| Goal               | What to Look For                      |
| ------------------ | ------------------------------------- |
| Best-selling items | Top products by revenue or quantity   |
| Customer behavior  | Average items per transaction         |
| Store performance  | Which store earns more                |
| Time trends        | Best hours, days, or months           |
| Pricing            | Which products are high vs. low price |


**Data Cleaning & Feature Engineering**

In [None]:
# Parse the raw date and time columns once, then derive every other feature
# from the parsed values.
parsed_dates = pd.to_datetime(df['transaction_date'], dayfirst=False)
parsed_times = pd.to_datetime(df['transaction_time'], format='%H:%M:%S')

df['transaction_date'] = parsed_dates
# Only the time-of-day part is kept, so this column holds datetime.time objects.
df['transaction_time'] = parsed_times.dt.time

# Revenue of each row = quantity sold x unit price.
df['revenue'] = df['transaction_qty'] * df['unit_price']

# Calendar features used by the time-based charts below.
df['day'] = parsed_dates.dt.day
df['month'] = parsed_dates.dt.month
df['weekday'] = parsed_dates.dt.day_name()


Univariate Analysis---
Product Category Distribution:

In [None]:
# Count of rows per product category, bars ordered from most to least frequent.
category_order = df['product_category'].value_counts().index

fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, x='product_category', order=category_order, ax=ax)
ax.set_title("Product Category Count")
plt.xticks(rotation=45)
plt.show()


In [None]:
# Revenue distribution: 30-bin histogram of per-row revenue with a KDE curve.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['revenue'], bins=30, kde=True, ax=ax)
ax.set_title("Revenue Distribution")
plt.show()


Bivariate Analysis--- Total Sales by Product Category 

In [None]:
# Total revenue per product category, largest first, drawn as a bar chart.
revenue_per_category = df.groupby('product_category')['revenue'].sum()
cat_sales = revenue_per_category.sort_values(ascending=False)

ax = cat_sales.plot(kind='bar', figsize=(10, 5), title='Total Sales by Category')
ax.set_ylabel("Revenue")
plt.show()


In [None]:
# Top 5 store locations by revenue: sum revenue per store, sort descending, keep five.
revenue_per_store = df.groupby('store_location')['revenue'].sum()
store_sales = revenue_per_store.sort_values(ascending=False).head(5)

ax = store_sales.plot(kind='bar', figsize=(10, 5), title='Top 5 Store Locations by Revenue')
ax.set_ylabel("Revenue")
plt.show()

Time Series Analysis---Daily Revenue Trend

In [None]:
# Revenue summed per transaction date, plotted as a line over time.
daily_revenue = df.groupby('transaction_date')['revenue'].sum()

ax = daily_revenue.plot(figsize=(12, 5), title="Daily Revenue Trend")
ax.set_ylabel("Revenue")
plt.show()


In [None]:
# Sales by day of week.
# The three derived columns are recomputed here (same formulas as the
# feature-engineering cell) so this cell can be run on its own after loading.
df['revenue'] = df['transaction_qty'] * df['unit_price']
df['transaction_date'] = pd.to_datetime(df['transaction_date'])
df['weekday'] = df['transaction_date'].dt.day_name()

# Calendar order for the x-axis; groupby alone would sort the names alphabetically.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
revenue_per_weekday = df.groupby('weekday')['revenue'].sum()
weekday_sales = revenue_per_weekday.reindex(weekday_order)

ax = weekday_sales.plot(kind='bar', figsize=(10, 5), title='Revenue by Day of Week')
ax.set_ylabel("Revenue")
plt.show()

Top Product---Top 10 Products by Sales

In [1]:
# Ten products (product_detail) with the highest total revenue.
revenue_per_product = df.groupby('product_detail')['revenue'].sum()
top_products = revenue_per_product.sort_values(ascending=False).head(10)

ax = top_products.plot(kind='bar', figsize=(12, 6), title='Top 10 Best-Selling Products')
ax.set_ylabel("Revenue")
plt.xticks(rotation=45)
plt.show()


NameError: name 'df' is not defined

**1. Which product category generates the highest revenue?**

In [None]:
## Revenue by Product Category
# Recompute revenue (quantity x unit price), then total it per category in ascending order.
df['revenue'] = df['transaction_qty'] * df['unit_price']
revenue_totals = df.groupby('product_category')['revenue'].sum()
category_revenue = revenue_totals.sort_values()

# Horizontal bars: revenue on x, category names on y.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=category_revenue.values, y=category_revenue.index, palette='viridis', ax=ax)
ax.set_title("Revenue by Product Category")
ax.set_xlabel("Total Revenue")
ax.set_ylabel("Product Category")
plt.tight_layout()
plt.show()


**Highest-Revenue Product Categories:**
The categories at the top of this chart generate the most revenue, so they are the best performers.
This means customers spend the most money on these categories.

**2. Which product type is the most sold (by quantity)?**



In [None]:
# Top selling product types (by quantity): total units per product_type, sorted ascending.
units_per_type = df.groupby('product_type')['transaction_qty'].sum()
type_qty = units_per_type.sort_values()

plt.figure(figsize=(10, 6))
ax = sns.barplot(x=type_qty.values, y=type_qty.index, palette='magma')
ax.set_title("Top Selling Product Types")
ax.set_xlabel("Total Quantity Sold")
ax.set_ylabel("Product Type")
plt.tight_layout()
plt.show()


**Top Selling Product Types:**
These product types sold the most units.
The types with the highest quantities are the most popular among customers.

**3. What are the top 5 best-selling products (by revenue)?**

In [None]:
# Top 5 products by revenue: total revenue per product_detail, five largest.
product_revenue = df.groupby('product_detail')['revenue'].sum()
top_products = product_revenue.sort_values(ascending=False).head(5)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_products.values, y=top_products.index, palette='coolwarm', ax=ax)
ax.set_title("Top 5 Products by Revenue")
ax.set_xlabel("Revenue")
ax.set_ylabel("Product")
plt.tight_layout()
plt.show()


**Top 5 Products by Revenue:**
These are the five products that earned the most money.
They are the highest-revenue items in the shop.
They bring in the most income and should be promoted more.

**4. Which store location performs the best in terms of revenue?**

In [None]:
# Revenue by store location, best-earning store first.
store_revenue = df.groupby('store_location')['revenue'].sum()
location_revenue = store_revenue.sort_values(ascending=False)

plt.figure(figsize=(10, 6))
ax = sns.barplot(x=location_revenue.values, y=location_revenue.index, palette='cubehelix')
ax.set_title("Revenue by Store Location")
ax.set_xlabel("Total Revenue")
ax.set_ylabel("Store Location")
plt.tight_layout()
plt.show()


**Best Performing Store Location:**
The chart ranks the store locations by total revenue; the one at the top is doing the best in sales.
Other stores can follow its strategy to improve performance.

**5. What are the peak sales hours in a day?**

In [None]:
# Peak sales hours: total quantity sold in each hour of the day.
#
# The feature-engineering cell stores transaction_time as datetime.time objects,
# and pd.to_datetime() cannot convert those directly (it raises TypeError).
# Casting to str first yields 'HH:MM:SS' text whether the column still holds the
# raw strings or already holds time objects, so the cell works in both states.
time_text = df['transaction_time'].astype(str)
df['hour'] = pd.to_datetime(time_text, format='%H:%M:%S').dt.hour
sales_by_hour = df.groupby('hour')['transaction_qty'].sum()

plt.figure(figsize=(10, 6))
sns.lineplot(x=sales_by_hour.index, y=sales_by_hour.values, marker='o')
plt.title("Sales by Hour")
plt.xlabel("Hour of Day")
plt.ylabel("Quantity Sold")
# Show every hour 0-23 on the axis, including hours with no sales.
plt.xticks(range(0, 24))
plt.grid(True)
plt.tight_layout()
plt.show()


**Peak Sales Hours:**
Here, sales are analyzed by hour of the day. Most sales happen between 8:00 and 10:00 (the peak hours).
This helps us understand when the shop is busiest and needs more staff.

**6. How does daily sales revenue vary over time?**

In [None]:
# Daily sales revenue trend.
# to_datetime is a no-op when the column was already parsed by an earlier cell.
df['transaction_date'] = pd.to_datetime(df['transaction_date'])

# Long-format frame with columns 'transaction_date' and 'revenue' (one row per day).
daily_sales = df.groupby('transaction_date')['revenue'].sum().reset_index()

plt.figure(figsize=(14, 6))
# Plot the frame built above; its value column is lowercase 'revenue'.
# (The previous version plotted a different variable with a non-existent 'Revenue' column.)
sns.lineplot(data=daily_sales, x='transaction_date', y='revenue', marker='o')
plt.title("Daily Sales Revenue Over Time", fontsize=16)
plt.xlabel("Date")
plt.ylabel("Total Revenue")
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()





**Daily Revenue Trend:**
This chart shows how much revenue the shop makes each day.
June 2023 (2023-06) has higher revenue than the other months.
This helps us plan for busy and slow days.

**7. What is the most profitable day of the week?**

In [None]:
# Revenue per day of week (the values summed are revenue, not transaction counts).
df['day_of_week'] = df['transaction_date'].dt.day_name()

# Reindex into calendar order; groupby sorts the day names alphabetically.
calendar_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
revenue_by_weekday = df.groupby('day_of_week')['revenue'].sum()
day_sales = revenue_by_weekday.reindex(calendar_order)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=day_sales.index, y=day_sales.values, palette='plasma', ax=ax)
ax.set_title("Revenue by Day of Week")
ax.set_xlabel("Day")
ax.set_ylabel("Revenue")
plt.tight_layout()
plt.show()


**Sales by Day of the Week:**
Monday brings in the most revenue of the week. This helps us plan marketing and promotions better.

**8. How many transactions occurred each day or month?**

In [None]:
# Number of distinct transaction_id values per calendar day and per calendar month.
daily_transactions = df.groupby('transaction_date')['transaction_id'].nunique()
monthly_transactions = df.groupby(df['transaction_date'].dt.to_period('M'))['transaction_id'].nunique()

# Render the monthly counts as the cell output; previously both results were
# computed but nothing was displayed, leaving the question above unanswered.
monthly_transactions


In [None]:
# Transaction quantity distribution: 20-bin histogram of transaction_qty with a KDE curve.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['transaction_qty'], bins=20, kde=True, color='steelblue', ax=ax)
ax.set_title("Distribution of Transaction Quantities")
ax.set_xlabel("Quantity")
ax.set_ylabel("Frequency")
plt.tight_layout()
plt.show()


**Transaction Quantity Distribution:**
We checked how many items people usually buy in one transaction.
Most customers buy a small number of items.
This helps us understand customer behavior.

**What is the average revenue per transaction?**

In [None]:
# Average revenue per transaction: total revenue per transaction_id, then the mean of those totals.
avg_revenue = df.groupby('transaction_id')['revenue'].sum().mean()

# Render the value as the cell output; previously it was computed but never shown.
avg_revenue


In [None]:
# Top 10 products by quantity sold: total units per product_detail, ten largest.
units_per_product = df.groupby('product_detail')['transaction_qty'].sum()
top_qty = units_per_product.sort_values(ascending=False).head(10)

plt.figure(figsize=(10, 6))
ax = sns.barplot(x=top_qty.values, y=top_qty.index, palette='crest')
ax.set_title("Top 10 Products by Quantity Sold")
ax.set_xlabel("Quantity Sold")
ax.set_ylabel("Product")
plt.tight_layout()
plt.show()


Top Products by Quantity Sold:
These are the most popular products by quantity.
They should always be in stock and promoted well.

Which products have the highest unit price and are they selling well?


In [None]:

# Per-product summary: mean unit price, total quantity sold and total revenue,
# ordered from the most to the least expensive product.
# The revenue column created earlier is lowercase 'revenue'; aggregating a
# 'Revenue' key (as before) raises KeyError.
unit_price_stats = (
    df.groupby('product_detail')
      .agg({
          'unit_price': 'mean',
          'transaction_qty': 'sum',
          'revenue': 'sum',
      })
      .reset_index()
      .sort_values(by='unit_price', ascending=False)
)

# Bar length = average unit price of the 15 priciest products;
# bar colour (hue) = total quantity sold for that product.
plt.figure(figsize=(16, 8))
sns.barplot(
    data=unit_price_stats.head(15),
    x='unit_price',
    y='product_detail',
    hue='transaction_qty',
    palette='viridis'
)

plt.title("Top Products by Unit Price and Their Sales Quantity", fontsize=14)
plt.xlabel("Average Unit Price")
plt.ylabel("Product Detail")
plt.legend(title="Total Transaction Quantity", loc='lower right')
plt.tight_layout()
plt.show()


How is the performance of different stores over time?

In [None]:
# Revenue summed for every (store_location, transaction_date) pair, as a long-format frame.
store_date_groups = df.groupby(['store_location', 'transaction_date'])
store_time_sales = store_date_groups['revenue'].sum().reset_index()


In [None]:
# Revenue trends by store over time.
# to_datetime is a no-op when the column was already parsed by an earlier cell.
df['transaction_date'] = pd.to_datetime(df['transaction_date'])

# The source column is lowercase 'revenue' (selecting 'Revenue' raises KeyError).
# The aggregated column is named 'Revenue' on output so that the plot below and
# the heatmap cell that follows, both of which read 'Revenue', find it.
store_time_sales = (
    df.groupby(['store_location', 'transaction_date'])['revenue']
      .sum()
      .reset_index(name='Revenue')
)

plt.figure(figsize=(16, 8))
sns.lineplot(data=store_time_sales, x='transaction_date', y='Revenue', hue='store_location', marker='o')

plt.title("Store Revenue Trends Over Time", fontsize=16)
plt.xlabel("Date")
plt.ylabel("Total Revenue")
plt.xticks(rotation=45)
# Legend is anchored outside the axes (to the right) so it does not cover the lines.
plt.legend(title="Store Location", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.grid(True)
plt.show()


In [None]:
# Daily revenue intensity by store.
# Build the date x store matrix straight from df's 'revenue' column instead of
# pivoting store_time_sales on a 'Revenue' column: that frame's value column is
# not reliably named 'Revenue', which made the pivot fail with KeyError.
store_daily_revenue = (
    df.groupby(['store_location', 'transaction_date'])['revenue']
      .sum()
      .reset_index()
)
heatmap_data = store_daily_revenue.pivot(
    index='transaction_date', columns='store_location', values='revenue'
)

plt.figure(figsize=(16, 10))
# Transposed so each row is a store and each column is a date.
sns.heatmap(heatmap_data.T, cmap="YlGnBu", linewidths=0.5, linecolor='gray')

plt.title("Daily Revenue Heatmap by Store Location", fontsize=16)
plt.xlabel("Transaction Date")
plt.ylabel("Store Location")
plt.tight_layout()
plt.show()


Which products are rarely sold (low sales volume)?

In [2]:
# Lowest-selling products: total units per product_detail, ten smallest,
# turned into a frame with columns 'product_detail' and 'transaction_qty'.
qty_per_product = df.groupby('product_detail')['transaction_qty'].sum()
low_sales = qty_per_product.sort_values().head(10).reset_index()

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=low_sales, x='transaction_qty', y='product_detail', palette='Reds_r', ax=ax)

ax.set_title("Bottom 10 Low-Selling Products (by Quantity)", fontsize=14)
ax.set_xlabel("Total Quantity Sold")
ax.set_ylabel("Product")
plt.tight_layout()
plt.show()

NameError: name 'df' is not defined

In [None]:
!pip install dash plotly

In [None]:
import plotly.express as px
from dash import Dash, dcc, html

# --- Column preparation -------------------------------------------------------
# These columns are (re)derived here so the dashboard cell only needs the loaded df.
df['transaction_date'] = pd.to_datetime(df['transaction_date'])
# transaction_time may already hold datetime.time objects (set by the
# feature-engineering cell); pd.to_datetime() raises TypeError on those, so the
# column is cast to 'HH:MM:SS' text first and parsed with an explicit format.
df['hour'] = pd.to_datetime(df['transaction_time'].astype(str), format='%H:%M:%S').dt.hour
df['day_of_week'] = df['transaction_date'].dt.day_name()
df['revenue'] = df['transaction_qty'] * df['unit_price']

# --- Grouped data: one small frame per chart ----------------------------------
revenue_by_category = df.groupby('product_category')['revenue'].sum().reset_index()
top5_products = df.groupby('product_detail')['revenue'].sum().nlargest(5).reset_index()
revenue_by_location = df.groupby('store_location')['revenue'].sum().reset_index()
sales_by_hour = df.groupby('hour')['transaction_qty'].sum().reset_index()
# Reindex into calendar order; groupby sorts the day names alphabetically.
revenue_by_day = df.groupby('day_of_week')['revenue'].sum().reindex(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
).reset_index()

# --- App ----------------------------------------------------------------------
app = Dash(__name__)
app.title = "Coffee Shop Sales Dashboard"

# Static layout: a heading followed by five pre-built figures (no callbacks).
app.layout = html.Div([
    html.H1("☕ Coffee Shop Sales Dashboard", style={'textAlign': 'center'}),

    dcc.Graph(
        figure=px.bar(
            revenue_by_category, x='revenue', y='product_category',
            orientation='h', title="Revenue by Product Category", color='product_category'
        )
    ),

    dcc.Graph(
        figure=px.bar(
            top5_products, x='revenue', y='product_detail',
            orientation='h', title="Top 5 Products by Revenue", color='product_detail'
        )
    ),

    dcc.Graph(
        figure=px.bar(
            revenue_by_location, x='revenue', y='store_location',
            orientation='h', title="Revenue by Store Location", color='store_location'
        )
    ),

    dcc.Graph(
        figure=px.line(
            sales_by_hour, x='hour', y='transaction_qty',
            title="Sales by Hour of Day", markers=True
        )
    ),

    dcc.Graph(
        figure=px.bar(
            revenue_by_day, x='day_of_week', y='revenue',
            title="Revenue by Day of Week", color='day_of_week'
        )
    )
])

# Start the Dash development server with debug mode on.
if __name__ == '__main__':
    app.run(debug=True)

In [None]:
# NOTE(review): this repeats the install cell that precedes the dashboard cell,
# and it runs after dash/plotly have already been imported there.
!pip install dash plotly