<a href="https://colab.research.google.com/github/cwtausif/Ai-Engineer-Master-Class/blob/main/Lab2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Data Introduction

In [None]:
# Sample Data
# Ten coffee-shop orders, one record per order; every record carries the same
# ten fields in the same order.
# NOTE(review): a few values stand apart from the rest (price_usd of 300.50 and 0,
# temperature_celsius of 67, weather "Sun" next to "Sunny", customer "Unknown").
# Presumably they are planted for the outlier/cleaning steps — confirm before
# "correcting" them.
data = [
    dict(
        order_id=1, date="2026-01-01", time="08:12",
        customer_name="Jessica", item_name="Latte", price_usd=5.25,
        temperature_celsius=17, is_holiday=False, is_promotion=True,
        weather="Cloudy",
    ),
    dict(
        order_id=2, date="2026-01-01", time="18:05",
        customer_name="Robert", item_name="Americano", price_usd=300.50,
        temperature_celsius=19, is_holiday=False, is_promotion=True,
        weather="Rainy",
    ),
    dict(
        order_id=3, date="2026-01-01", time="19:30",
        customer_name="Linda", item_name="Flat White", price_usd=4.75,
        temperature_celsius=18, is_holiday=False, is_promotion=False,
        weather="Rainy",
    ),
    dict(
        order_id=4, date="2026-01-01", time="18:42",
        customer_name="Emily", item_name="Mocha", price_usd=5.75,
        temperature_celsius=16, is_holiday=False, is_promotion=False,
        weather="Foggy",
    ),
    dict(
        order_id=5, date="2026-01-02", time="09:40",
        customer_name="David", item_name="Cold Brew", price_usd=4.50,
        temperature_celsius=10, is_holiday=False, is_promotion=False,
        weather="Cloudy",
    ),
    dict(
        order_id=6, date="2026-01-02", time="11:52",
        customer_name="Anthony", item_name="Espresso", price_usd=3.00,
        temperature_celsius=11, is_holiday=False, is_promotion=False,
        weather="Cloudy",
    ),
    dict(
        order_id=7, date="2026-01-03", time="14:20",
        customer_name="Unknown", item_name="Chai Latte", price_usd=4.25,
        temperature_celsius=9, is_holiday=False, is_promotion=True,
        weather="Humid",
    ),
    dict(
        order_id=8, date="2026-01-03", time="15:05",
        customer_name="Robert", item_name="Cappuccino", price_usd=4.95,
        temperature_celsius=-3, is_holiday=False, is_promotion=True,
        weather="Sunny",
    ),
    dict(
        order_id=9, date="2026-01-04", time="08:58",
        customer_name="Karen", item_name="Hot Chocolate", price_usd=3.95,
        temperature_celsius=22, is_holiday=True, is_promotion=False,
        weather="Sunny",
    ),
    dict(
        order_id=10, date="2026-01-04", time="12:15",
        customer_name="Jessica", item_name="Flat White", price_usd=0,
        temperature_celsius=67, is_holiday=True, is_promotion=False,
        weather="Sun",
    ),
]

### Load Data into a DataFrame

In [None]:
import pandas as pd

# Build the orders table from the list of records. The explicit `columns`
# list pins the left-to-right column order of the resulting frame.
df = pd.DataFrame(
    data=data,
    columns=[
        "order_id", "date", "time", "customer_name", "item_name",
        "price_usd", "temperature_celsius", "is_holiday", "is_promotion", "weather",
    ],
)

# Show the frame as the cell output.
df

### **EDA**

### 1. Understand basic structure (rows, columns, dtypes)

In [None]:
# Preview the first five rows
df.head(n=5)

In [None]:
# Size of the table: .shape is a (number of rows, number of columns) tuple
df.shape  # (rows, columns)

In [None]:
# Column dtypes and non-null counts — a quick check that each field was
# inferred with the expected type and that nothing is unexpectedly missing
df.info()

### 2. Check missing values and duplicates

In [None]:
# Count of missing entries in each column
df.isna().sum()

In [None]:
# Share of missing entries per column, as a percentage rounded to 2 decimals
df.isna().mean().mul(100).round(2)

In [None]:
# Number of fully duplicated rows (the first occurrence of each is not counted)
df.duplicated(keep="first").sum()

### 3. Summary statistics for numeric columns

In [None]:
# Summary statistics for the numeric columns of the orders table
df.describe()

To summarize only specific columns, select them first:

In [None]:
# Restrict the summary to the two measurement columns
df.loc[:, ["price_usd", "temperature_celsius"]].describe()

### 4. Distributions of key columns

In [None]:
# Frequency of each distinct price (fine for this small sample).
# A notebook cell renders only its LAST expression, so this intermediate
# result has to be printed explicitly or it is silently discarded.
print(df["price_usd"].value_counts())

# For bigger data a histogram is the better view of the distribution.
# Title and axis labels let the figure stand on its own; the trailing
# semicolon suppresses the text repr of the final call.
ax = df["price_usd"].hist()
ax.set(title="Distribution of Order Prices", xlabel="Price (USD)", ylabel="Number of Orders");

### Categorical example: items ordered



In [None]:
# Number of orders recorded for each distinct item_name
df["item_name"].value_counts()

### 5. Simple relationships (groupby)

In [None]:
# Example A: total revenue per day (sum of price_usd within each date)
df.groupby("date")["price_usd"].agg("sum")

In [None]:
# Example B: average order price by item (mean of price_usd per item_name)
df.groupby("item_name")["price_usd"].agg("mean")

In [None]:
# Example C: orders on holiday vs non-holiday
# Variant 1: count the non-null order_id values in each group.
# Only the last expression of a cell is rendered, so this first result is
# printed explicitly; otherwise it would be computed and thrown away.
print(df.groupby("is_holiday")["order_id"].count())

# Variant 2: if there is no order_id column, count the rows per group instead.
df.groupby("is_holiday").size()

### Plot a simple graph

In [None]:
# Example: bar chart of how many orders each item received

import matplotlib.pyplot as plt

# Create the figure and axes explicitly, then let pandas draw onto that axes.
fig, ax = plt.subplots(figsize=(10, 5))
df["item_name"].value_counts().plot(kind="bar", ax=ax)

ax.set_title("Orders per Item")
ax.set_xlabel("Item")
ax.set_ylabel("Number of Orders")
fig.tight_layout()
plt.show()

### Correlation Matrix

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation matrix. numeric_only=True restricts the calculation
# to the numeric and boolean columns; the string columns are left out.
corr = df.corr(numeric_only=True)

# Plot. "coolwarm" is a diverging colormap, so the colour scale is pinned to
# the full correlation range [-1, 1]. That keeps the neutral midpoint at 0
# and makes colours comparable between runs, instead of stretching the scale
# to whatever min/max this particular matrix happens to contain.
plt.figure(figsize=(8, 6))
sns.heatmap(
    corr,
    annot=True,
    cmap="coolwarm",
    linewidths=0.5,
    vmin=-1,
    vmax=1,
)
plt.title("Correlation Matrix")
plt.show()

### Price-Only IQR Calculation + Horizontal Boxplot + Outlier Values

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Works on the `df` built earlier in this notebook; only its price_usd
# column is analysed here.

# 1. Price column
price = df["price_usd"]

# 2. First/third quartile and the interquartile range between them
Q1 = price.quantile(q=0.25)
Q3 = price.quantile(q=0.75)
IQR = Q3 - Q1

# 3. Fences placed 1.5 * IQR below Q1 and above Q3
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# 4. Rows whose price falls strictly outside the fences
outliers = df.loc[price.lt(lower_bound) | price.gt(upper_bound)]

# 5. Report the statistics and the flagged prices
print("Q1:", Q1)
print("Q3:", Q3)
print("IQR:", IQR)
print("Lower Bound:", lower_bound)
print("Upper Bound:", upper_bound)
print("\nOutliers:")
print(outliers.loc[:, ["price_usd"]])

# 6. Horizontal boxplot of the prices on an explicitly created axes
fig, ax = plt.subplots(figsize=(10, 2))
ax.boxplot(price, vert=False)
ax.set_title("Price (USD) – IQR & Outlier Detection")
ax.set_xlabel("Price (USD)")
plt.show()